diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7210b43 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,15 @@ +# Force LF line endings for shell scripts and config text files so they +# work correctly under bash/WSL even when the repo is cloned on Windows +# with core.autocrlf=true. CRLF in *.env files breaks `source`, and CRLF +# in *.sh files breaks the bash interpreter ("$'\r': command not found"). +*.sh text eol=lf +*.bash text eol=lf +*.env text eol=lf +*.yaml text eol=lf +*.yml text eol=lf +*.toml text eol=lf + +# Windows batch scripts must keep CRLF so cmd.exe parses them reliably. +*.bat text eol=crlf +*.cmd text eol=crlf +*.ps1 text eol=crlf diff --git a/.secrets.baseline b/.secrets.baseline index f33da84..427554c 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "^.secrets.baseline$", "lines": null }, - "generated_at": "2026-05-10T11:36:08Z", + "generated_at": "2026-05-28T12:15:43Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -306,6 +306,15 @@ "type": "Hex High Entropy String", "verified_result": null } + ], + "scripts/model_profiles.bat": [ + { + "hashed_secret": "af89b35ce32cfc9eaf4c102325da47616e6eff93", + "is_verified": false, + "line_number": 18, + "type": "Base64 High Entropy String", + "verified_result": null + } ] }, "version": "0.13.1+ibm.64.dss", diff --git a/README.md b/README.md index 18d3a27..59bace9 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,19 @@ git clone https://github.com/cuga-project/cuga-eval.git cd cuga-eval ``` +> **Windows users:** every `.sh` script in this repo has a sibling `.bat`. You don't need +> WSL or Git Bash for the simple wrappers (`setup_cuga.bat`, `run_app.bat`, `run_registry.bat`, +> `viz.bat`, `model_profiles.bat`, the per-benchmark `analyze.bat`, etc.) — they run on +> stock `cmd.exe`. The heavier scripts (eval/compare/clean and the `m3_pad_to_cap_verify` +> helper) delegate to bash via Git Bash or WSL because they use POSIX-only features. 
See +> [Running on Windows](#running-on-windows) below. +> +> If you're using WSL and cloned with Windows git (default `core.autocrlf=true`), +> the `*.sh` and `*.env` files end up with CRLF line endings, which break bash under WSL +> (`$'\r': command not found`) and `source`d env files. Run `fix_line_endings.bat` +> (double-click in Explorer, or run from `cmd.exe` / PowerShell) once before running any +> setup scripts under WSL. + ### 2. Run setup script ```bash # Clone CUGA agent and set up the base environment @@ -176,6 +189,57 @@ cd benchmarks/m3 && ./eval.sh cd benchmarks/appworld && ./eval.sh ``` +### Running on Windows + +Every script has a `.bat` sibling. Same flags, same semantics; just substitute the +extension and use `\` instead of `/`: + +```bat +:: Top-level dispatcher (these scripts delegate to bash — see note below) +scripts\eval.bat --benchmark bpo +scripts\eval.bat --benchmark m3 --model-profile gpt-oss +scripts\compare.bat --benchmark bpo --runs 3 + +:: Setup (pure cmd.exe — no bash required) +setup_cuga.bat +setup_m3.bat --verify +setup_appworld.bat + +:: Per-benchmark, from the benchmark dir +cd benchmarks\bpo && eval.bat +cd benchmarks\m3 && run_registry.bat + +:: Run from PowerShell the same way — pwsh launches .bat via cmd.exe +.\setup_cuga.bat +.\scripts\eval.bat --benchmark bpo +``` + +The `.bat` files fall into two groups: + +- **Pure `cmd.exe` ports** — setup scripts, env loaders, registry runners, app + launchers, model profiles, the analyze and viz thin-wrappers. Work on a vanilla + Windows install with `cmd.exe` or PowerShell. No bash needed. +- **Bash-delegate shims** — the heavy eval/compare/clean scripts and + `m3_pad_to_cap_verify`. 
These use POSIX features (signal traps, `lsof`, `pkill`, + process substitution, sourceable function libraries, embedded `python3` here-docs) + that don't have clean `cmd.exe` equivalents, so each shim calls + [`benchmarks\helpers\_delegate_to_bash.bat`](benchmarks/helpers/_delegate_to_bash.bat), + which finds a `bash` in this order: Git Bash (well-known install paths) → + `bash` on `PATH` → WSL. Install [Git for Windows](https://git-scm.com/download/win) + (provides Git Bash) or run `wsl --install` if neither is present. + +A smoke test for the `.bat` scripts ships at `scripts/test_bat_scripts.ps1`. It +runs on any platform with PowerShell 7+: + +```bash +pwsh scripts/test_bat_scripts.ps1 +``` + +It validates that every `.sh` has a `.bat` sibling, that each `.bat` is well-formed, +and that the delegate shims point to existing `.sh` files. Long-term, this whole +layer will move to Python (one entrypoint instead of two parallel script trees) — +tracked in [issue #88](../../issues/88). + ### Model profiles Available profiles: `gpt-oss`, `gpt4o`, `gpt4.1`, `opus4.5` diff --git a/benchmarks/appworld/compare.bat b/benchmarks/appworld/compare.bat new file mode 100644 index 0000000..fddd539 --- /dev/null +++ b/benchmarks/appworld/compare.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/appworld/compare.sh — delegates to bash. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\compare.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/appworld/eval.bat b/benchmarks/appworld/eval.bat new file mode 100644 index 0000000..4f355c8 --- /dev/null +++ b/benchmarks/appworld/eval.bat @@ -0,0 +1,8 @@ +@echo off +REM Windows equivalent of benchmarks/appworld/eval.sh — delegates to bash +REM (traps, kill -0, lsof, process substitution, find with -mindepth). 
+setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\eval.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/appworld/run_app.bat b/benchmarks/appworld/run_app.bat new file mode 100644 index 0000000..dc1567c --- /dev/null +++ b/benchmarks/appworld/run_app.bat @@ -0,0 +1,19 @@ +@echo off +REM Windows equivalent of benchmarks/appworld/run_app.sh +REM Loads env and starts AppWorld. + +setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +pushd "%SCRIPT_DIR%\..\.." >nul +set "PROJECT_ROOT=%CD%" +popd >nul + +echo Loading AppWorld configuration... +call "%PROJECT_ROOT%\benchmarks\helpers\load_env.bat" "appworld" + +echo. +echo Starting AppWorld... +cd /d "%PROJECT_ROOT%" +uv run cuga start appworld +exit /b %errorlevel% diff --git a/benchmarks/appworld/run_eval.bat b/benchmarks/appworld/run_eval.bat new file mode 100644 index 0000000..a14a943 --- /dev/null +++ b/benchmarks/appworld/run_eval.bat @@ -0,0 +1,13 @@ +@echo off +REM Windows equivalent of benchmarks/appworld/run_eval.sh +REM Loads AppWorld env and runs cuga-eval. + +setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +pushd "%SCRIPT_DIR%\..\.." >nul +set "PROJECT_ROOT=%CD%" +popd >nul +call "%PROJECT_ROOT%\benchmarks\helpers\load_env.bat" "appworld" +cuga-eval appworld %* +exit /b %errorlevel% diff --git a/benchmarks/appworld/run_registry.bat b/benchmarks/appworld/run_registry.bat new file mode 100644 index 0000000..bac0aab --- /dev/null +++ b/benchmarks/appworld/run_registry.bat @@ -0,0 +1,8 @@ +@echo off +REM Windows equivalent of benchmarks/appworld/run_registry.sh +REM Delegates to the generic helper. 
+setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +call "%SCRIPT_DIR%\..\helpers\run_registry.bat" "appworld" +exit /b %errorlevel% diff --git a/benchmarks/bpo/compare.bat b/benchmarks/bpo/compare.bat new file mode 100644 index 0000000..e1eb1bd --- /dev/null +++ b/benchmarks/bpo/compare.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/bpo/compare.sh — delegates to bash. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\compare.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/bpo/eval.bat b/benchmarks/bpo/eval.bat new file mode 100644 index 0000000..b6d0e05 --- /dev/null +++ b/benchmarks/bpo/eval.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/bpo/eval.sh — delegates to bash. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\eval.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/bpo/run_app.bat b/benchmarks/bpo/run_app.bat new file mode 100644 index 0000000..f3c414b --- /dev/null +++ b/benchmarks/bpo/run_app.bat @@ -0,0 +1,19 @@ +@echo off +REM Windows equivalent of benchmarks/bpo/run_app.sh +REM Loads env and runs the BPO FastAPI app on port 8095. + +setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +pushd "%SCRIPT_DIR%\..\.." >nul +set "PROJECT_ROOT=%CD%" +popd >nul + +echo Loading BPO configuration... +call "%PROJECT_ROOT%\benchmarks\helpers\load_env.bat" "bpo" + +echo. +echo Starting BPO FastAPI app on port 8095... 
+cd /d "%PROJECT_ROOT%"
+uv run uvicorn benchmarks.bpo.main:app --reload --port 8095
+exit /b %errorlevel%
diff --git a/benchmarks/bpo/run_registry.bat b/benchmarks/bpo/run_registry.bat
new file mode 100644
index 0000000..7787dc8
--- /dev/null
+++ b/benchmarks/bpo/run_registry.bat
@@ -0,0 +1,7 @@
+@echo off
+REM Windows equivalent of benchmarks/bpo/run_registry.sh
+setlocal
+set "SCRIPT_DIR=%~dp0"
+if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
+call "%SCRIPT_DIR%\..\helpers\run_registry.bat" "bpo"
+exit /b %errorlevel%
diff --git a/benchmarks/helpers/_delegate_to_bash.bat b/benchmarks/helpers/_delegate_to_bash.bat
new file mode 100644
index 0000000..aedb26e
--- /dev/null
+++ b/benchmarks/helpers/_delegate_to_bash.bat
@@ -0,0 +1,73 @@
+@echo off
+REM Shared helper: invokes a .sh script via Git Bash or WSL, forwarding all args.
+REM
+REM Usage (from another .bat):
+REM   call "path\to\helpers\_delegate_to_bash.bat" "path\to\script.sh" %*
+REM
+REM Rationale: many of the .sh scripts in this repo use POSIX-only features
+REM (process substitution, traps, lsof, pkill, comm, find -mindepth, mktemp,
+REM heredocs, etc.) that don't have clean cmd.exe equivalents. Rather than
+REM ship subtly-broken cmd.exe ports, we delegate to a real bash. A native
+REM Python port is tracked in the follow-up issue.
+REM
+REM Exit code: that of the delegated script; 2 when the script path is
+REM missing or does not exist; 1 when no bash interpreter can be found.
+
+setlocal enabledelayedexpansion
+
+if "%~1"=="" (
+    echo [ERROR] _delegate_to_bash.bat called without a script path
+    exit /b 2
+)
+set "_SCRIPT=%~1"
+if not exist "%_SCRIPT%" goto :err_missing
+
+REM Look for a native Windows bash. First choice: Git Bash in its
+REM well-known install locations.
+set "_BASH="
+for %%G in (
+    "%ProgramFiles%\Git\bin\bash.exe"
+    "%ProgramFiles(x86)%\Git\bin\bash.exe"
+    "%LocalAppData%\Programs\Git\bin\bash.exe"
+) do (
+    if not defined _BASH if exist "%%~G" set "_BASH=%%~G"
+)
+REM Then any bash on PATH (e.g. msys2, cygwin). The bash.exe that lives in
+REM the Windows System32 directory is skipped: it is the WSL launcher and
+REM cannot open a Windows-style script path; WSL is handled further down.
+for /f "delims=" %%B in ('where bash 2^>nul') do (
+    if not defined _BASH if /i not "%%~dpB"=="%SystemRoot%\System32\" set "_BASH=%%~B"
+)
+if defined _BASH goto :run_native
+
+REM Finally WSL. The script path has to be translated with wslpath, so the
+REM arguments that follow it are forwarded separately.
+where wsl >nul 2>&1
+if errorlevel 1 goto :err_no_bash
+set "_WSL_SCRIPT="
+for /f "delims=" %%P in ('wsl wslpath -u "%_SCRIPT%" 2^>nul') do set "_WSL_SCRIPT=%%P"
+if not defined _WSL_SCRIPT goto :err_no_bash
+REM SHIFT never changes the all-arguments parameter, so the leading script
+REM path is dropped textually: the :*text= form removes everything up to and
+REM including the first argument, leaving only the caller's own arguments.
+set "_ARGS=%*"
+set "_ARGS=!_ARGS:*%1=!"
+wsl bash "!_WSL_SCRIPT!" !_ARGS!
+exit /b %errorlevel%
+
+:run_native
+REM The all-arguments parameter still starts with the quoted script path
+REM (SHIFT does not affect it), so forwarding it verbatim already yields
+REM   bash "script.sh" arg1 arg2 ...
+REM Passing the script path a second time would hand the script its own
+REM path as its first positional argument.
+"%_BASH%" %*
+exit /b %errorlevel%
+
+:err_missing
+echo [ERROR] Script not found: %_SCRIPT%
+exit /b 2
+
+:err_no_bash
+echo [ERROR] No bash interpreter found on this system.
+echo This script requires bash. Install one of:
+echo - Git for Windows ^(provides Git Bash^): https://git-scm.com/download/win
+echo - WSL ^(Windows Subsystem for Linux^): wsl --install
+exit /b 1
diff --git a/benchmarks/helpers/common.bat b/benchmarks/helpers/common.bat
new file mode 100644
index 0000000..95913a3
--- /dev/null
+++ b/benchmarks/helpers/common.bat
@@ -0,0 +1,24 @@
+@echo off
+REM Placeholder for benchmarks/helpers/common.sh.
+REM
+REM common.sh is a bash function library (port_in_use, wait_for_server,
+REM parse_common_args, cleanup_pids, etc.) that gets sourced by other .sh
+REM scripts. There's no equivalent of `source` for function definitions in
+REM cmd.exe, so a direct port is not feasible.
+REM
+REM In practice, this file is never called directly: the heavy .bat files
+REM in this repo (eval.bat, compare.bat, etc.) delegate to bash via
+REM _delegate_to_bash.bat, and bash sources common.sh itself.
+REM
+REM If you ARE invoking this file directly, you probably want one of:
+REM - call _delegate_to_bash.bat ".\common.sh" ^ (run from bash)
+REM - Use Git Bash or WSL to source it the normal way
+REM
+REM See the follow-up issue for the Python migration that removes this gap.
+
+if "%~1"=="" (
+    echo common.bat is a placeholder. See comment block in this file.
+    exit /b 0
+)
+echo [WARN] common.bat does not implement %~1 in cmd.exe. Use bash to source common.sh.
+exit /b 1
diff --git a/benchmarks/helpers/load_env.bat b/benchmarks/helpers/load_env.bat
new file mode 100644
index 0000000..863c4e4
--- /dev/null
+++ b/benchmarks/helpers/load_env.bat
@@ -0,0 +1,73 @@
+@echo off
+REM Windows equivalent of load_env.sh
+REM
+REM Usage: call load_env.bat [benchmark_name]
+REM
+REM Reads KEY=VALUE lines (lines starting with # are skipped) from, in order:
+REM   PROJECT_ROOT\.env
+REM   PROJECT_ROOT\config\global.env
+REM   PROJECT_ROOT\benchmarks\NAME\config\NAME.env   (only when NAME is given)
+REM Later files override earlier ones. Missing files are skipped with a
+REM notice. Always exits with code 0.
+REM
+REM Sourcing semantics: this script writes a temporary .bat snippet of `set`
+REM commands and calls it, so env vars persist into the caller's scope when
+REM invoked via `call`.
+REM
+REM Delayed expansion is switched off so that an exclamation mark inside a
+REM value (common in secrets) is written to the snippet verbatim.
+
+setlocal disabledelayedexpansion
+
+set "BENCHMARK_NAME=%~1"
+
+set "HELPERS_DIR=%~dp0"
+if "%HELPERS_DIR:~-1%"=="\" set "HELPERS_DIR=%HELPERS_DIR:~0,-1%"
+pushd "%HELPERS_DIR%\..\.." >nul
+set "PROJECT_ROOT=%CD%"
+popd >nul
+set "CONFIG_DIR=%PROJECT_ROOT%\config"
+
+REM Temp file holds the set-commands we'll call from the caller's scope
+set "_SETS=%TEMP%\cuga_loadenv_%RANDOM%_%RANDOM%.bat"
+echo @echo off> "%_SETS%"
+
+call :emit_env_file "%PROJECT_ROOT%\.env" ".env (secrets)"
+call :emit_env_file "%CONFIG_DIR%\global.env" "global.env"
+if not "%BENCHMARK_NAME%"=="" (
+    call :emit_env_file "%PROJECT_ROOT%\benchmarks\%BENCHMARK_NAME%\config\%BENCHMARK_NAME%.env" "%BENCHMARK_NAME%.env"
+)
+
+REM Default LOGURU_LEVEL handling. The checks are written INTO the snippet
+REM (hence the doubled percent signs) so they run after the env-file values
+REM have been set: WARNING is only a fallback when nothing defined
+REM LOGURU_LEVEL, and VERBOSE=true coming from an env file is honoured too.
+echo if not defined LOGURU_LEVEL set "LOGURU_LEVEL=WARNING">> "%_SETS%"
+echo if /i "%%VERBOSE%%"=="true" set "LOGURU_LEVEL=DEBUG">> "%_SETS%"
+
+REM Single-line endlocal so %_SETS% is expanded at parse time (before endlocal runs)
+endlocal & call "%_SETS%" & del "%_SETS%" 2>nul
+exit /b 0
+
+REM :emit_env_file FILE LABEL
+REM   Appends one `set "KEY=VALUE"` command to the snippet for every
+REM   KEY=VALUE line of FILE. LABEL is only used in the progress messages.
+:emit_env_file
+set "_FILE=%~1"
+set "_LABEL=%~2"
+if exist "%_FILE%" goto :emit_env_file_load
+REM Deliberately not wrapped in a ( ) block: both the message and the label
+REM contain parentheses, and an unescaped closing parenthesis inside a block
+REM ends the block early, which would make the exit below unconditional.
+if not "%_LABEL%"=="" echo (skipping missing %_LABEL%)
+exit /b 0
+
+:emit_env_file_load
+echo [ok] Loading %_LABEL%
+REM FOR /F already skips blank lines, so each remaining line is split at the
+REM first "=" directly; FOR variables are expanded after parsing, so special
+REM characters in a value do not affect the surrounding block.
+for /f "usebackq tokens=* eol=#" %%L in ("%_FILE%") do (
+    for /f "tokens=1,* delims==" %%A in ("%%L") do (
+        echo set "%%A=%%B">> "%_SETS%"
+    )
+)
+exit /b 0
diff --git a/benchmarks/helpers/run_registry.bat b/benchmarks/helpers/run_registry.bat
new file
mode 100644 index 0000000..306a64f --- /dev/null +++ b/benchmarks/helpers/run_registry.bat @@ -0,0 +1,28 @@ +@echo off +REM Windows equivalent of run_registry.sh +REM Loads env (global + benchmark-specific) and starts the registry server. +REM Usage: run_registry.bat ^ + +setlocal + +set "BENCHMARK_NAME=%~1" +if "%BENCHMARK_NAME%"=="" ( + echo Usage: %~nx0 ^ + echo Example: %~nx0 m3 + exit /b 1 +) + +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +pushd "%SCRIPT_DIR%\..\.." >nul +set "PROJECT_ROOT=%CD%" +popd >nul + +echo Loading %BENCHMARK_NAME% evaluation configuration... +call "%SCRIPT_DIR%\load_env.bat" "%BENCHMARK_NAME%" + +echo. +echo Starting registry server... +cd /d "%PROJECT_ROOT%" +uv run registry +exit /b %errorlevel% diff --git a/benchmarks/m3/clean.bat b/benchmarks/m3/clean.bat new file mode 100644 index 0000000..757be01 --- /dev/null +++ b/benchmarks/m3/clean.bat @@ -0,0 +1,8 @@ +@echo off +REM Windows equivalent of benchmarks/m3/clean.sh — delegates to bash. +REM (Uses pkill, lsof, docker exec curl loops, glob removal — POSIX-only.) +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\clean.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/m3/compare.bat b/benchmarks/m3/compare.bat new file mode 100644 index 0000000..835cf99 --- /dev/null +++ b/benchmarks/m3/compare.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/m3/compare.sh — delegates to bash. 
+setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\compare.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/m3/eval.bat b/benchmarks/m3/eval.bat new file mode 100644 index 0000000..04a4e4b --- /dev/null +++ b/benchmarks/m3/eval.bat @@ -0,0 +1,11 @@ +@echo off +REM Windows equivalent of benchmarks/m3/eval.sh +REM Delegates to bash (Git Bash / WSL) because the script uses POSIX features +REM that don't translate cleanly to cmd.exe (traps, lsof, process subs, ...). +REM Tracked in the follow-up issue: migrate these to Python. + +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\eval.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/m3/eval/scripts/monitor_eval.bat b/benchmarks/m3/eval/scripts/monitor_eval.bat new file mode 100644 index 0000000..cfcaebc --- /dev/null +++ b/benchmarks/m3/eval/scripts/monitor_eval.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of monitor_eval.sh — delegates to bash. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\..\..\helpers\_delegate_to_bash.bat" "%_THIS%\monitor_eval.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/m3/eval/scripts/run_eval_background.bat b/benchmarks/m3/eval/scripts/run_eval_background.bat new file mode 100644 index 0000000..69a18b3 --- /dev/null +++ b/benchmarks/m3/eval/scripts/run_eval_background.bat @@ -0,0 +1,8 @@ +@echo off +REM Windows equivalent of run_eval_background.sh — delegates to bash +REM (nohup, &, signal traps, PID files — POSIX background-job semantics). 
+setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\..\..\helpers\_delegate_to_bash.bat" "%_THIS%\run_eval_background.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/m3/eval/scripts/setup_m3_eval.bat b/benchmarks/m3/eval/scripts/setup_m3_eval.bat new file mode 100644 index 0000000..a60b413 --- /dev/null +++ b/benchmarks/m3/eval/scripts/setup_m3_eval.bat @@ -0,0 +1,8 @@ +@echo off +REM Windows equivalent of setup_m3_eval.sh — delegates to bash (uses +REM interactive prompts, docker detection, file edits — bash-only). +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\..\..\helpers\_delegate_to_bash.bat" "%_THIS%\setup_m3_eval.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/m3/run_registry.bat b/benchmarks/m3/run_registry.bat new file mode 100644 index 0000000..2ebc173 --- /dev/null +++ b/benchmarks/m3/run_registry.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/m3/run_registry.sh +setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +call "%SCRIPT_DIR%\..\helpers\run_registry.bat" "m3" +exit /b %errorlevel% diff --git a/benchmarks/m3/run_with_container.bat b/benchmarks/m3/run_with_container.bat new file mode 100644 index 0000000..72c8718 --- /dev/null +++ b/benchmarks/m3/run_with_container.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/m3/run_with_container.sh — delegates to bash. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\run_with_container.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/oak_health_insurance/compare.bat b/benchmarks/oak_health_insurance/compare.bat new file mode 100644 index 0000000..8a9558f --- /dev/null +++ b/benchmarks/oak_health_insurance/compare.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/oak_health_insurance/compare.sh — delegates to bash. 
+setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\compare.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/oak_health_insurance/eval.bat b/benchmarks/oak_health_insurance/eval.bat new file mode 100644 index 0000000..8a37a55 --- /dev/null +++ b/benchmarks/oak_health_insurance/eval.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/oak_health_insurance/eval.sh — delegates to bash. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\helpers\_delegate_to_bash.bat" "%_THIS%\eval.sh" %* +exit /b %errorlevel% diff --git a/benchmarks/oak_health_insurance/run_app.bat b/benchmarks/oak_health_insurance/run_app.bat new file mode 100644 index 0000000..8bea569 --- /dev/null +++ b/benchmarks/oak_health_insurance/run_app.bat @@ -0,0 +1,19 @@ +@echo off +REM Windows equivalent of benchmarks/oak_health_insurance/run_app.sh +REM Loads env and runs the Oak Health Insurance FastAPI app on port 8090. + +setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +pushd "%SCRIPT_DIR%\..\.." >nul +set "PROJECT_ROOT=%CD%" +popd >nul + +echo Loading Oak Health Insurance configuration... +call "%PROJECT_ROOT%\benchmarks\helpers\load_env.bat" "oak_health_insurance" + +echo. +echo Starting FastAPI app... 
+cd /d "%SCRIPT_DIR%" +uv run uvicorn main:app --reload --port 8090 +exit /b %errorlevel% diff --git a/benchmarks/oak_health_insurance/run_registry.bat b/benchmarks/oak_health_insurance/run_registry.bat new file mode 100644 index 0000000..203b480 --- /dev/null +++ b/benchmarks/oak_health_insurance/run_registry.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of benchmarks/oak_health_insurance/run_registry.sh +setlocal +set "SCRIPT_DIR=%~dp0" +if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +call "%SCRIPT_DIR%\..\helpers\run_registry.bat" "oak_health_insurance" +exit /b %errorlevel% diff --git a/fix_line_endings.bat b/fix_line_endings.bat new file mode 100644 index 0000000..4bfb851 --- /dev/null +++ b/fix_line_endings.bat @@ -0,0 +1,37 @@ +@echo off +REM ============================================================================ +REM fix_line_endings.bat +REM +REM Strips CRLF line endings from *.sh and *.env files in this repo so they +REM work under WSL bash. Run this once on Windows after cloning the repo (or +REM after pulling, if you have stale CRLF files), BEFORE running setup_cuga.sh +REM or setup_m3.sh under WSL. +REM +REM Usage: double-click, or from cmd.exe / PowerShell: fix_line_endings.bat +REM ============================================================================ + +setlocal +cd /d "%~dp0" + +echo. +echo Normalizing *.sh and *.env line endings (CRLF -^> LF) under: +echo %CD% +echo. + +powershell -NoProfile -ExecutionPolicy Bypass -Command "$ErrorActionPreference='Stop'; $root = (Get-Location).Path; $count = 0; $files = Get-ChildItem -Path . 
-Recurse -File -Include *.sh,*.env | Where-Object { $_.FullName -notmatch '[\\/](\.git|\.venv|vendor|node_modules)[\\/]' }; foreach ($f in $files) { $b = [IO.File]::ReadAllBytes($f.FullName); if ($b -contains 13) { $c = New-Object Collections.Generic.List[byte]; foreach ($x in $b) { if ($x -ne 13) { $c.Add($x) } }; [IO.File]::WriteAllBytes($f.FullName, $c.ToArray()); Write-Host (' normalized: ' + $f.FullName.Substring($root.Length + 1)); $count++ } }; Write-Host ''; Write-Host ('Normalized ' + $count + ' file(s).')" + +if errorlevel 1 ( + echo. + echo ERROR: normalization failed. See PowerShell error above. + exit /b 1 +) + +echo. +echo Done. You can now run setup_cuga.sh / setup_m3.sh under WSL. +echo. + +REM Pause so the window stays open if double-clicked from Explorer. +if defined PROMPT goto :end +pause +:end +endlocal diff --git a/scripts/compare.bat b/scripts/compare.bat new file mode 100644 index 0000000..b278bb7 --- /dev/null +++ b/scripts/compare.bat @@ -0,0 +1,7 @@ +@echo off +REM Windows equivalent of scripts/compare.sh — delegates to bash (sources common.sh). +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\benchmarks\helpers\_delegate_to_bash.bat" "%_THIS%\compare.sh" %* +exit /b %errorlevel% diff --git a/scripts/eval.bat b/scripts/eval.bat new file mode 100644 index 0000000..9005e7f --- /dev/null +++ b/scripts/eval.bat @@ -0,0 +1,9 @@ +@echo off +REM Windows equivalent of scripts/eval.sh — delegates to bash because the +REM script sources common.sh (a bash function library: parse_common_args, +REM apply_model_profile_if_set, check_langfuse_env, list_benchmarks). 
+setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\benchmarks\helpers\_delegate_to_bash.bat" "%_THIS%\eval.sh" %* +exit /b %errorlevel% diff --git a/scripts/m3_pad_to_cap_verify.bat b/scripts/m3_pad_to_cap_verify.bat new file mode 100644 index 0000000..28abb0f --- /dev/null +++ b/scripts/m3_pad_to_cap_verify.bat @@ -0,0 +1,11 @@ +@echo off +REM Windows equivalent of scripts/m3_pad_to_cap_verify.sh — delegates to bash. +REM +REM This script uses tee, mktemp, embedded Python heredoc, gh api PATCH with +REM @file body, process substitution, and signal traps. None of these have +REM clean cmd.exe equivalents. Use Git Bash or WSL. +setlocal +set "_THIS=%~dp0" +if "%_THIS:~-1%"=="\" set "_THIS=%_THIS:~0,-1%" +call "%_THIS%\..\benchmarks\helpers\_delegate_to_bash.bat" "%_THIS%\m3_pad_to_cap_verify.sh" %* +exit /b %errorlevel% diff --git a/scripts/model_profiles.bat b/scripts/model_profiles.bat new file mode 100644 index 0000000..3372835 --- /dev/null +++ b/scripts/model_profiles.bat @@ -0,0 +1,56 @@ +@echo off +REM Windows equivalent of model_profiles.sh +REM Usage: call model_profiles.bat ^ +REM Sets AGENT_SETTING_CONFIG, MODEL_NAME, OPENAI_BASE_URL, OPENAI_API_VERSION. 
+ +setlocal +set "PROFILE=%~1" + +set "_AGENT_SETTING=" +set "_MODEL_NAME=" +set "_BASE_URL=" +set "_API_VERSION=" +set "_RC=0" + +if "%PROFILE%"=="" goto done +if /i "%PROFILE%"=="gpt-oss" ( + set "_AGENT_SETTING=settings.groq.toml" + set "_MODEL_NAME=openai/gpt-oss-120b" + echo [OK] Model profile: gpt-oss + goto done +) +if /i "%PROFILE%"=="gpt4o" ( + set "_AGENT_SETTING=settings.openai.toml" + set "_MODEL_NAME=Azure/gpt-4o" + set "_BASE_URL=https://ete-litellm.bx.cloud9.ibm.com" + set "_API_VERSION=2024-08-06" + echo [OK] Model profile: gpt4o ^(Azure/gpt-4o^) + goto done +) +if /i "%PROFILE%"=="gpt4.1" ( + set "_AGENT_SETTING=settings.openai.toml" + set "_MODEL_NAME=Azure/gpt-4.1" + set "_BASE_URL=https://ete-litellm.bx.cloud9.ibm.com" + set "_API_VERSION=2024-08-06" + echo [OK] Model profile: gpt4.1 ^(Azure/gpt-4.1^) + goto done +) +if /i "%PROFILE%"=="opus4.5" ( + set "_AGENT_SETTING=settings.openai.toml" + set "_MODEL_NAME=claude-opus-4-5-20251101" + set "_BASE_URL=https://ete-litellm.bx.cloud9.ibm.com" + echo [OK] Model profile: opus4.5 + goto done +) +echo [ERROR] Unknown model profile '%PROFILE%' +echo Valid values: gpt-oss, gpt4o, gpt4.1, opus4.5 +set "_RC=1" + +:done +endlocal & ( + if not "%_AGENT_SETTING%"=="" set "AGENT_SETTING_CONFIG=%_AGENT_SETTING%" + if not "%_MODEL_NAME%"=="" set "MODEL_NAME=%_MODEL_NAME%" + if not "%_BASE_URL%"=="" (set "OPENAI_BASE_URL=%_BASE_URL%") else (set "OPENAI_BASE_URL=") + if not "%_API_VERSION%"=="" (set "OPENAI_API_VERSION=%_API_VERSION%") else (set "OPENAI_API_VERSION=") + exit /b %_RC% +) diff --git a/scripts/test_bat_scripts.ps1 b/scripts/test_bat_scripts.ps1 new file mode 100644 index 0000000..98ee32c --- /dev/null +++ b/scripts/test_bat_scripts.ps1 @@ -0,0 +1,131 @@ +#!/usr/bin/env pwsh +# Smoke-test the .bat scripts that mirror the .sh scripts in this repo. +# +# Runs on any platform with PowerShell 7+ (`pwsh`). 
Does NOT actually execute +# the .bat files — cmd.exe isn't available on macOS/Linux — but verifies +# structural invariants that catch typical authoring mistakes: +# 1. Every in-scope .sh has a sibling .bat +# 2. Every .bat starts with `@echo off` +# 3. Every .bat terminates the main flow with `exit /b` +# 4. Delegate shims reference an actually-existing .sh +# 5. `_delegate_to_bash.bat` exists where delegates expect it +# +# On Windows, this same script can be extended to actually invoke each .bat +# with --help (where supported) and check the exit code. +# +# Usage: pwsh scripts/test_bat_scripts.ps1 + +$ErrorActionPreference = 'Stop' +$repoRoot = (Resolve-Path "$PSScriptRoot/..").Path + +$failures = [System.Collections.Generic.List[string]]::new() +$passes = 0 +function Fail([string]$msg) { $script:failures.Add($msg); Write-Host " FAIL $msg" -ForegroundColor Red } +function Pass([string]$msg) { $script:passes++; Write-Host " ok $msg" -ForegroundColor DarkGreen } + +$excludedDirs = @('vendor', 'node_modules', '.venv', '.git', 'site-packages') +$excludedPathFrag = 'benchmarks' + [IO.Path]::DirectorySeparatorChar + 'appworld' + [IO.Path]::DirectorySeparatorChar + 'appworld' + +function IsExcluded([string]$path) { + foreach ($d in $excludedDirs) { + if ($path -match ([regex]::Escape([IO.Path]::DirectorySeparatorChar + $d + [IO.Path]::DirectorySeparatorChar))) { return $true } + } + if ($path -like "*$excludedPathFrag*") { return $true } + return $false +} + +function RelPath([string]$full) { return $full.Substring($repoRoot.Length + 1) } + +$shFiles = Get-ChildItem -Path $repoRoot -Recurse -Filter '*.sh' -File | Where-Object { -not (IsExcluded $_.FullName) } +$batFiles = Get-ChildItem -Path $repoRoot -Recurse -Filter '*.bat' -File | Where-Object { -not (IsExcluded $_.FullName) } + +Write-Host "Repo root: $repoRoot" +Write-Host "Found $($shFiles.Count) .sh files and $($batFiles.Count) .bat files in scope." 
+
+# ---- [1] every .sh has a sibling .bat -----------------------------------
+Write-Host "`n[1] every .sh has a sibling .bat" -ForegroundColor Cyan
+foreach ($sh in $shFiles) {
+    $sibling = [IO.Path]::ChangeExtension($sh.FullName, '.bat')
+    if (Test-Path -LiteralPath $sibling) { Pass (RelPath $sh.FullName) }
+    else { Fail "missing .bat sibling for $(RelPath $sh.FullName)" }
+}
+
+# ---- [2] every .bat starts with @echo off --------------------------------
+Write-Host "`n[2] every .bat starts with '@echo off'" -ForegroundColor Cyan
+foreach ($bat in $batFiles) {
+    $first = "$(Get-Content -LiteralPath $bat.FullName -TotalCount 1)".Trim()  # "" for an empty file (no null-method error)
+    if ($first -eq '@echo off') { Pass (RelPath $bat.FullName) }
+    else { Fail "$(RelPath $bat.FullName) first line is '$first', expected '@echo off'" }
+}
+
+# ---- [3] every .bat has an `exit /b` terminator --------------------------
+# common.bat is intentionally a placeholder and uses `exit /b 0` early; ok.
+Write-Host "`n[3] every .bat contains 'exit /b' somewhere" -ForegroundColor Cyan
+foreach ($bat in $batFiles) {
+    $content = Get-Content -LiteralPath $bat.FullName -Raw  # $null for an empty file; -match is then $false -> Fail
+    if ($content -match 'exit\s+/b') { Pass (RelPath $bat.FullName) }
+    else { Fail "$(RelPath $bat.FullName) has no 'exit /b' terminator" }
+}
+
+# ---- [4] delegate shims reference a real .sh -----------------------------
+# Skip _delegate_to_bash.bat itself — its REM comments contain example
+# placeholders (something like "<dir>\<script>.sh") that aren't
+# actual code paths.
+Write-Host "`n[4] delegate shims reference an existing .sh" -ForegroundColor Cyan
+$delegateRegex = [regex]'_delegate_to_bash\.bat"\s+"([^"]+)"'
+foreach ($bat in $batFiles) {
+    if ($bat.Name -eq '_delegate_to_bash.bat') { continue }
+    # Strip comment lines (REM / @REM / ::, with or without trailing text) so example syntax is ignored.
+    $codeLines = (Get-Content -LiteralPath $bat.FullName) | Where-Object { $_ -notmatch '^\s*(@?REM(\s|$)|::)' }
+    $content = $codeLines -join "`n"
+    $m = $delegateRegex.Match($content)
+    if (-not $m.Success) { continue }  # not a delegate shim
+    $target = $m.Groups[1].Value
+    # Expand %_THIS% to the .bat's own directory, normalise separators.
+    $batDir = Split-Path -Parent $bat.FullName
+    $resolved = $target -replace '%_THIS%', $batDir.Replace('$', '$$')  # '$$' keeps a '$' in the path literal
+    $resolved = $resolved -replace '\\', ([IO.Path]::DirectorySeparatorChar)
+    # Collapse parent traversals (Resolve-Path errors if the file is missing)
+    try {
+        $abs = [IO.Path]::GetFullPath($resolved)
+    } catch { $abs = $resolved }
+    if (Test-Path -LiteralPath $abs) {
+        Pass "$(RelPath $bat.FullName) -> $(Split-Path -Leaf $abs)"
+    } else {
+        Fail "$(RelPath $bat.FullName) delegates to missing $target (resolved: $abs)"
+    }
+}
+
+# ---- [5] every delegate shim's _delegate_to_bash.bat actually exists -----
+# Skip the helper itself; its own REM block shows an example `call` statement.
+Write-Host "`n[5] _delegate_to_bash.bat exists where shims expect it" -ForegroundColor Cyan
+$delegateHelperRegex = [regex]'call\s+"([^"]*_delegate_to_bash\.bat)"'
+foreach ($bat in $batFiles) {
+    if ($bat.Name -eq '_delegate_to_bash.bat') { continue }
+    $codeLines = (Get-Content -LiteralPath $bat.FullName) | Where-Object { $_ -notmatch '^\s*(@?REM(\s|$)|::)' }  # same comment filter as [4]
+    $content = $codeLines -join "`n"
+    $m = $delegateHelperRegex.Match($content)
+    if (-not $m.Success) { continue }  # file never calls the helper
+    $target = $m.Groups[1].Value
+    $batDir = Split-Path -Parent $bat.FullName
+    $resolved = $target -replace '%_THIS%', $batDir.Replace('$', '$$')  # '$$' keeps a '$' in the path literal
+    $resolved = $resolved -replace '\\', ([IO.Path]::DirectorySeparatorChar)
+    try { $abs = [IO.Path]::GetFullPath($resolved) } catch { $abs = $resolved }
+    if (Test-Path -LiteralPath $abs) {
+        Pass (RelPath $bat.FullName)
+    } else {
+        Fail "$(RelPath $bat.FullName) calls missing $target (resolved: $abs)"
+    }
+}
+
+# ---- summary -------------------------------------------------------------
+Write-Host "`n=== Summary ===" -ForegroundColor Cyan
+Write-Host ("checks passed: {0}" -f $passes) -ForegroundColor Green
+Write-Host ("checks failed: {0}" -f $failures.Count) -ForegroundColor ($(if ($failures.Count -eq 0) { 'Green' } else { 'Red' }))
+if ($failures.Count -gt 0) {
+    Write-Host "`nFailures:" -ForegroundColor Red
+    $failures | ForEach-Object { Write-Host " - $_" -ForegroundColor Red }
+    exit 1  # non-zero exit when any check failed
+}
+Write-Host "`nAll structural checks passed."
+exit 0
diff --git a/scripts/viz.bat b/scripts/viz.bat
new file mode 100644
index 0000000..37bcb04
--- /dev/null
+++ b/scripts/viz.bat
@@ -0,0 +1,27 @@
+@echo off
+REM Windows equivalent of viz.sh
+REM Loads benchmark env and runs cuga viz against the trajectory_data dir.
+
+setlocal enabledelayedexpansion
+
+set "BENCHMARK_NAME=%~1"
+if "%BENCHMARK_NAME%"=="" (
+    echo Usage: %~nx0 ^<benchmark_name^>
+    echo Example: %~nx0 m3
+    exit /b 1
+)
+
+set "SCRIPT_DIR=%~dp0"
+if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
+pushd "%SCRIPT_DIR%\.." >nul
+set "PROJECT_ROOT=%CD%"
+popd >nul
+
+echo Loading %BENCHMARK_NAME% visualization configuration...
+call "%PROJECT_ROOT%\benchmarks\helpers\load_env.bat" "%BENCHMARK_NAME%"
+if not defined CUGA_LOGGING_DIR (>&2 echo Error: CUGA_LOGGING_DIR is not set & exit /b 1)
+echo.
+echo Running cuga viz...
+cd /d "%PROJECT_ROOT%"
+uv run cuga-viz run "%CUGA_LOGGING_DIR%\trajectory_data"
+exit /b %errorlevel%
diff --git a/setup_appworld.bat b/setup_appworld.bat
new file mode 100644
index 0000000..ea3feaa
--- /dev/null
+++ b/setup_appworld.bat
@@ -0,0 +1,58 @@
+@echo off
+REM Windows equivalent of setup_appworld.sh
+REM
+REM Sources the appworld env file (set -a equivalent), then installs AppWorld
+REM via uv. Interactive reinstall prompt preserved.
+
+setlocal enabledelayedexpansion
+
+set "APPWORLD_DIR=benchmarks\appworld"
+set "APPWORLD_ENV_FILE=benchmarks\appworld\config\appworld.env"
+set "APPWORLD_REPO_DIR=%APPWORLD_DIR%\appworld"
+set "APPWORLD_DATA_DIR=%APPWORLD_REPO_DIR%\data"
+
+if not exist "%APPWORLD_DIR%\" (
+    echo Error: '%APPWORLD_DIR%' directory not found!
+    echo Please clone the repository first
+    exit /b 1
+)
+
+if not exist "%APPWORLD_ENV_FILE%" (
+    echo Error: '%APPWORLD_ENV_FILE%' file not found!
+    exit /b 1
+)
+
+REM Load env file: each non-comment KEY=VALUE line becomes a set. Values containing '!' are altered by delayed expansion.
+for /f "usebackq tokens=* eol=#" %%L in ("%APPWORLD_ENV_FILE%") do (
+    set "_line=%%L"
+    if not "!_line!"=="" (
+        for /f "tokens=1,* delims==" %%A in ("!_line!") do (
+            set "%%A=%%B"
+        )
+    )
+)
+
+if not exist "%APPWORLD_REPO_DIR%\" (
+    echo Error: '%APPWORLD_REPO_DIR%' directory not found!
+    echo Please clone the AppWorld repository into '%APPWORLD_REPO_DIR%' first
+    exit /b 1
+)
+
+if exist "%APPWORLD_DATA_DIR%\" (
+    echo AppWorld repository already present at '%APPWORLD_REPO_DIR%'.
+    echo AppWorld data already exists at '%APPWORLD_DATA_DIR%'.
+    set "REINSTALL=" & set /p REINSTALL="Would you like to reinstall AppWorld and re-download the data? [y/N] "
+    if /i not "!REINSTALL!"=="y" if /i not "!REINSTALL!"=="yes" (
+        echo Keeping existing AppWorld installation and data. Skipping setup.
+        exit /b 0
+    )
+    echo Reinstalling AppWorld and downloading data...
+)
+
+pushd "%APPWORLD_REPO_DIR%" || exit /b 1
+uv pip install . || (popd & exit /b 1)
+uv run -m appworld.cli install || (popd & exit /b 1)
+uv run appworld install --repo || (popd & exit /b 1)
+uv run appworld download data || (popd & exit /b 1)
+popd
+exit /b 0
diff --git a/setup_cuga.bat b/setup_cuga.bat
new file mode 100644
index 0000000..9515269
--- /dev/null
+++ b/setup_cuga.bat
@@ -0,0 +1,93 @@
+@echo off
+REM Windows equivalent of setup_cuga.sh
+REM Clones cuga-agent next to this repo (matches pyproject.toml path "../cuga-agent")
+REM and sets up environment variables for the current session.
+REM
+REM Note: env vars set here only persist for this cmd.exe session.
+REM See the follow-up issue tracking conversion of these scripts to Python.
+
+setlocal
+
+set "REPO_URL=https://github.com/cuga-project/cuga-agent.git"
+set "REPO_BRANCH=main"
+set "SCRIPT_DIR=%~dp0"
+REM strip trailing backslash from %~dp0
+if "%SCRIPT_DIR:~-1%"=="\" set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%"
+pushd "%SCRIPT_DIR%\.." >nul || (echo [ERROR] Could not access parent of %SCRIPT_DIR% & exit /b 1)
+set "PARENT_DIR=%CD%"
+popd >nul
+set "REPO_NAME=cuga-agent"
+set "REPO_PATH=%PARENT_DIR%\%REPO_NAME%"
+
+echo ============================================================
+echo CUGA Agent Setup Script
+echo ============================================================
+echo.
+
+where git >nul 2>&1
+if errorlevel 1 (
+    echo [ERROR] Git is not installed. Please install git and try again.
+    exit /b 1
+)
+
+if exist "%REPO_PATH%\.git" (
+    echo [INFO] Repository already exists at %REPO_PATH%
+    echo [INFO] Pulling latest changes from branch: %REPO_BRANCH%...
+    pushd "%REPO_PATH%" || exit /b 1
+    git fetch origin && git checkout "%REPO_BRANCH%" && git pull origin "%REPO_BRANCH%"
+    if errorlevel 1 echo [WARNING] Could not update repository. You may need to resolve conflicts manually.
+    popd
+) else if exist "%REPO_PATH%\" (
+    echo [WARNING] Directory exists but is not a git repository. Removing and cloning fresh...
+    rmdir /s /q "%REPO_PATH%"
+    call :clone_repo || exit /b 1
+) else (
+    call :clone_repo || exit /b 1
+)
+REM endlocal discards every variable set above. REPO_PATH is expanded while the line is parsed, so it survives for the summary.
+echo [INFO] Exporting environment variables...
+endlocal & set "REPO_PATH=%REPO_PATH%" & (
+    set "ENV_FILE=.\.env"
+    set "MCP_SERVERS_FILE=.\mcp_servers.yaml"
+    set "CUGA_LOGGING_DIR=.\logging"
+    echo [INFO] Exported ENV_FILE=.\.env
+    echo [INFO] Exported MCP_SERVERS_FILE=.\mcp_servers.yaml
+    echo [INFO] Exported CUGA_LOGGING_DIR=.\logging
+)
+
+if not exist ".\logging" (
+    echo [INFO] Creating logging directory...
+    mkdir ".\logging"
+)
+
+REM Optionally run AppWorld setup if the script exists
+if exist "%~dp0setup_appworld.bat" (
+    echo [INFO] Running AppWorld setup...
+    call "%~dp0setup_appworld.bat"
+    if errorlevel 1 (
+        echo [ERROR] AppWorld setup failed
+        exit /b 1
+    )
+) else (
+    echo [WARNING] AppWorld setup script not found. Skipping.
+)
+
+echo.
+echo [SUCCESS] Setup completed successfully!
+echo.
+echo Next steps:
+echo 1. Check the cloned repository at: %REPO_PATH%
+echo 2. Environment variables are now available in this terminal session
+echo 3. Note: Variables will only persist for this terminal session
+echo.
+exit /b 0
+REM Subroutine clone_repo: clones REPO_URL at REPO_BRANCH into REPO_PATH. Returns 1 on failure.
+:clone_repo
+echo [INFO] Cloning %REPO_URL% (branch: %REPO_BRANCH%)...
+git clone -b "%REPO_BRANCH%" "%REPO_URL%" "%REPO_PATH%"
+if errorlevel 1 (
+    echo [ERROR] Failed to clone repository. Please check your SSH keys and network connection.
+    exit /b 1
+)
+echo [SUCCESS] Repository cloned successfully to %REPO_PATH%
+exit /b 0
diff --git a/setup_m3.bat b/setup_m3.bat
new file mode 100644
index 0000000..2592a3c
--- /dev/null
+++ b/setup_m3.bat
@@ -0,0 +1,229 @@
+@echo off
+REM Windows equivalent of setup_m3.sh
+REM
+REM Clones vakra into vendor\, sets up Python venv, installs deps, downloads
+REM benchmark data, builds Docker image, starts containers.
+REM
+REM Requires: git, docker or podman, python (with venv), HF_TOKEN env var.
+
+setlocal enabledelayedexpansion
+
+set "REPO_URL=https://github.com/IBM/vakra.git"
+set "VENDOR_DIR=.\vendor"
+set "REPO_NAME=vakra"
+set "REPO_PATH=%VENDOR_DIR%\%REPO_NAME%"
+set "DATA_DIR=%REPO_PATH%\data"
+
+REM Mode flags
+set "DOWNLOAD_ONLY=false"
+set "BUILD_ONLY=false"
+set "START_ONLY=false"
+set "VERIFY_ONLY=false"
+set "SKIP_DOWNLOAD=false"
+
+:parse_args
+if "%~1"=="" goto args_done
+if "%~1"=="--download-only" (set "DOWNLOAD_ONLY=true" & shift & goto parse_args)
+if "%~1"=="--build-only" (set "BUILD_ONLY=true" & shift & goto parse_args)
+if "%~1"=="--start-only" (set "START_ONLY=true" & shift & goto parse_args)
+if "%~1"=="--verify" (set "VERIFY_ONLY=true" & shift & goto parse_args)
+if "%~1"=="--skip-download" (set "SKIP_DOWNLOAD=true" & shift & goto parse_args)
+if "%~1"=="--help" goto show_usage
+echo [ERROR] Unknown option: %~1
+call :show_usage & exit /b 1
+REM show_usage prints the help text and returns 0. It is reached by goto for --help and by call for unknown options.
+:show_usage
+echo Usage: %~nx0 [OPTIONS]
+echo.
+echo Options:
+echo --download-only Download data only (no build/start)
+echo --build-only Only build image, don't start containers
+echo --start-only Only start containers (assumes already built)
+echo --verify Only verify containers are running
+echo --skip-download Skip data download step
+echo --help Show this help message
+exit /b 0
+
+:args_done
+echo ============================================================
+echo Vakra Benchmark Setup Script
+echo ============================================================
+echo.
+
+REM Check prerequisites
+where git >nul 2>&1
+if errorlevel 1 (
+    echo [ERROR] Missing required dependency: git
+    exit /b 1
+)
+set "RUNTIME="
+where docker >nul 2>&1 && set "RUNTIME=docker"
+if "%RUNTIME%"=="" where podman >nul 2>&1 && set "RUNTIME=podman"
+if "%RUNTIME%"=="" (
+    echo [ERROR] Missing required dependency: docker or podman
+    exit /b 1
+)
+echo [INFO] Using container runtime: %RUNTIME%
+
+if "%VERIFY_ONLY%"=="true" (
+    call :verify_containers
+    exit /b !errorlevel!
+)
+
+if "%START_ONLY%"=="true" (
+    call :start_containers || exit /b 1
+    call :verify_containers
+    exit /b !errorlevel!
+)
+
+REM Step 1: Clone or update repo
+if not exist "%VENDOR_DIR%\" (
+    echo [INFO] Creating vendor directory...
+    mkdir "%VENDOR_DIR%"
+)
+if exist "%REPO_PATH%\.git" (
+    echo [INFO] Repository already exists at %REPO_PATH%, pulling latest...
+    pushd "%REPO_PATH%" || exit /b 1
+    git pull origin main 2>nul || git pull origin master 2>nul
+    if errorlevel 1 echo [WARNING] Could not update repository
+    popd
+) else if exist "%REPO_PATH%\" (
+    echo [WARNING] Directory exists but is not a git repository. Removing and cloning fresh...
+    rmdir /s /q "%REPO_PATH%"
+    call :clone_repo || exit /b 1
+) else (
+    call :clone_repo || exit /b 1
+)
+
+REM Step 2: Python env + install deps
+echo [INFO] Step 2: Installing Python dependencies...
+pushd "%REPO_PATH%" || exit /b 1
+if not exist ".venv\" (
+    echo [INFO] Creating Python virtual environment...
+    python -m venv .venv || (echo [ERROR] Failed to create venv & popd & exit /b 1)
+)
+echo [INFO] Activating virtual environment and installing vakra...
+call .venv\Scripts\activate.bat
+pip install -e ".[init]" || (echo [ERROR] vakra install failed & popd & exit /b 1)
+pip install -r requirements_benchmark.txt || (echo [ERROR] benchmark deps install failed & popd & exit /b 1)
+popd
+
+REM Step 3: Download data (skipped when the data directory already has entries)
+if "%SKIP_DOWNLOAD%"=="false" (
+    if not exist "%DATA_DIR%\" (
+        call :download_data || exit /b 1
+    ) else (
+        dir /b /a "%DATA_DIR%" >nul 2>&1
+        if errorlevel 1 (
+            call :download_data || exit /b 1
+        ) else (
+            echo [INFO] Data directory exists and is not empty, skipping download
+        )
+    )
+)
+
+if "%DOWNLOAD_ONLY%"=="true" (
+    echo [SUCCESS] Setup and data download completed!
+    exit /b 0
+)
+
+REM Build + start. A failed start aborts; a failed verify only prints a warning.
+call :build_image || exit /b 1
+if "%BUILD_ONLY%"=="false" (
+    call :start_containers || exit /b 1
+    call :verify_containers
+)
+
+echo.
+echo [SUCCESS] Vakra setup completed successfully!
+echo.
+echo Container Information:
+echo * capability_1_bi_apis - Tool Chaining MCP Server
+echo * capability_2_dashboard_apis - Tool Selection MCP Server
+echo * capability_3_multihop_reasoning - Multi-hop Reasoning MCP Server
+echo * capability_4_multiturn - Multi-hop Multi-Source MCP Server
+exit /b 0
+REM Subroutine clone_repo: clones REPO_URL into REPO_PATH. Returns 1 on failure.
+:clone_repo
+echo [INFO] Cloning %REPO_URL%...
+git clone "%REPO_URL%" "%REPO_PATH%"
+if errorlevel 1 (
+    echo [ERROR] Failed to clone repository. Check SSH keys / network.
+    exit /b 1
+)
+echo [SUCCESS] Repository cloned successfully
+exit /b 0
+REM Subroutine download_data: needs HF_TOKEN; runs make download, or benchmark_setup.py when make is absent.
+:download_data
+echo [INFO] Downloading benchmark data from HuggingFace (~30 GB)...
+if not defined HF_TOKEN (
+    echo [ERROR] HF_TOKEN environment variable is not set
+    echo Set it with: set HF_TOKEN=hf_your_token_here
+    echo Get your token from: https://huggingface.co/settings/tokens
+    exit /b 1
+)
+pushd "%REPO_PATH%" || exit /b 1
+if exist ".venv\Scripts\activate.bat" call .venv\Scripts\activate.bat
+where make >nul 2>&1
+if not errorlevel 1 (
+    make download
+) else (
+    python benchmark_setup.py --download-data
+)
+if errorlevel 1 (
+    echo [ERROR] Failed to download data
+    popd
+    exit /b 1
+)
+popd
+echo [SUCCESS] Data downloaded successfully
+exit /b 0
+REM Subroutine build_image: builds via make build, or a direct RUNTIME build when make is absent.
+:build_image
+echo [INFO] Building vakra Docker image using %RUNTIME%...
+pushd "%REPO_PATH%" || exit /b 1
+where make >nul 2>&1
+if not errorlevel 1 (
+    set "DOCKER=%RUNTIME%" && make build
+) else (
+    %RUNTIME% build -t m3_environ -f docker/Dockerfile.unified .
+)
+if errorlevel 1 (
+    echo [ERROR] Failed to build image
+    popd
+    exit /b 1
+)
+popd
+echo [SUCCESS] Image built successfully
+exit /b 0
+REM Subroutine start_containers: starts via make start, or RUNTIME compose up -d when make is absent.
+:start_containers
+echo [INFO] Starting containers using %RUNTIME% compose...
+pushd "%REPO_PATH%" || exit /b 1
+where make >nul 2>&1
+if not errorlevel 1 (
+    set "DOCKER=%RUNTIME%" && make start
+) else (
+    %RUNTIME% compose up -d
+)
+if errorlevel 1 (
+    echo [ERROR] Failed to start containers
+    popd
+    exit /b 1
+)
+popd
+echo [SUCCESS] Containers started successfully
+exit /b 0
+REM Subroutine verify_containers: returns 0 when at least 4 running container names contain capability_, else 1.
+:verify_containers
+echo [INFO] Verifying containers...
+%RUNTIME% ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
+set "RUNNING=0"
+for /f %%C in ('%RUNTIME% ps --format "{{.Names}}" ^| find /c "capability_"') do set "RUNNING=%%C"
+if %RUNNING% GEQ 4 (
+    echo [SUCCESS] Found %RUNNING% capability containers running
+    exit /b 0
+) else (
+    echo [WARNING] Only %RUNNING% capability containers running ^(expected 4^)
+    exit /b 1
+)