Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
c0ce9f1
fix(m3): fix harness bugs that artificially zeroed CUGA M3 pass rate
haroldship May 20, 2026
58aabfe
fix(m3): mark P-OF-2 frontmatter as disabled to match filename
haroldship May 20, 2026
455315e
Merge branch 'main' into fix/m3-harness-bugs
haroldship May 28, 2026
329b198
fix(m3): use DYNACONF_SERVER_PORTS__REGISTRY for registry bind and agent
haroldship May 28, 2026
ab5ccc4
fix(m3): auto-sequence capability passes when --m3-data has no --capa…
haroldship May 28, 2026
3b0276c
Fix M3 bundle assembly and eval harness reliability.
haroldship May 29, 2026
de8ddd4
Fix create_eval_bundle import error when run as a script.
haroldship May 29, 2026
4947947
Fix bundle CLI when invoked outside the benchmarks package.
haroldship May 29, 2026
a300258
Fix CI failures from polluted eval env and bandit B108.
haroldship May 29, 2026
2eb12dd
fix(m3): one eval run = one result file + one trajectory run (all tasks)
haroldship May 31, 2026
1852ea5
fix(m3): make sequential per-domain registry restarts reliable on the…
haroldship May 31, 2026
cf4303e
fix(m3): add capability/domain/task# report columns + per-run bundle …
haroldship Jun 1, 2026
f4505c0
fix(m3): tune eval env and add defensive tool-output instructions
haroldship Jun 1, 2026
2ce23f5
Merge remote-tracking branch 'origin/main' into fix/m3-harness-bugs
haroldship Jun 2, 2026
ab3b4ef
fix(m3): single Langfuse trace per task on Watsonx/Cuga path
haroldship Jun 2, 2026
041af8b
fix(m3): gate Langfuse on settings and harden eval invoke fallbacks
haroldship Jun 3, 2026
671046c
Merge remote-tracking branch 'origin/main' into fix/m3-harness-bugs
haroldship Jun 3, 2026
f54f4a5
fix(m3): export should_trace_langfuse_task from benchmarks.helpers
haroldship Jun 3, 2026
7676764
fix(m3): wire --no-policies through compare and eval.sh
haroldship Jun 3, 2026
790518c
fix: address CodeRabbit review findings on PR #3
haroldship Jun 7, 2026
4049fb0
fix: address Sergey's and Offer's review findings on PR #3
haroldship Jun 7, 2026
eeea391
fix(m3): propagate --capability filter to react agent's registry expa…
haroldship Jun 7, 2026
acb0031
chore(m3): regenerate compiled policies.json from markdown sources
haroldship Jun 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 36 additions & 17 deletions benchmarks/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,6 @@
"""Helper functions for SDK evaluation benchmarks."""

from .config_loader import load_eval_config
from .sdk_eval_helpers import (
MetricsConfig,
add_policy_via_agent,
check_keywords,
clear_all_policies,
create_activity_tracker_callback,
evaluate_multiturn_task_with_langfuse,
evaluate_multiturn_task_with_langfuse_react,
evaluate_task_with_langfuse,
evaluate_task_with_langfuse_react,
flush_langfuse,
print_evaluation_summary,
save_evaluation_results,
setup_agent_with_tools,
setup_langfuse,
setup_react_agent_for_evaluation,
)
from .token_usage import TokenUsageCallback

__all__ = [
Expand All @@ -27,6 +10,7 @@
"setup_agent_with_tools",
"setup_react_agent_for_evaluation",
"setup_langfuse",
"should_trace_langfuse_task",
"clear_all_policies",
"add_policy_via_agent",
"check_keywords",
Expand All @@ -39,3 +23,38 @@
"create_activity_tracker_callback",
"save_evaluation_results",
]

# Lazy-export table: maps each public name to ``(submodule, attribute)``.
# The module-level ``__getattr__`` below looks names up here and imports the
# submodule (relative to this package) only when the name is first requested.
_LAZY_EXPORTS = {
"MetricsConfig": ("sdk_eval_helpers", "MetricsConfig"),
"setup_agent_with_tools": ("sdk_eval_helpers", "setup_agent_with_tools"),
"setup_react_agent_for_evaluation": ("sdk_eval_helpers", "setup_react_agent_for_evaluation"),
"setup_langfuse": ("sdk_eval_helpers", "setup_langfuse"),
"should_trace_langfuse_task": ("sdk_eval_helpers", "should_trace_langfuse_task"),
"clear_all_policies": ("sdk_eval_helpers", "clear_all_policies"),
"add_policy_via_agent": ("sdk_eval_helpers", "add_policy_via_agent"),
"check_keywords": ("sdk_eval_helpers", "check_keywords"),
"evaluate_task_with_langfuse": ("sdk_eval_helpers", "evaluate_task_with_langfuse"),
"evaluate_task_with_langfuse_react": ("sdk_eval_helpers", "evaluate_task_with_langfuse_react"),
"evaluate_multiturn_task_with_langfuse": ("sdk_eval_helpers", "evaluate_multiturn_task_with_langfuse"),
"evaluate_multiturn_task_with_langfuse_react": (
"sdk_eval_helpers",
"evaluate_multiturn_task_with_langfuse_react",
),
"print_evaluation_summary": ("sdk_eval_helpers", "print_evaluation_summary"),
"flush_langfuse": ("sdk_eval_helpers", "flush_langfuse"),
"create_activity_tracker_callback": ("sdk_eval_helpers", "create_activity_tracker_callback"),
"save_evaluation_results": ("sdk_eval_helpers", "save_evaluation_results"),
}

# Import-time consistency guard: a name added to the lazy table but missing
# from ``__all__`` raises immediately. An explicit ``raise`` is used rather
# than ``assert`` so the check still runs under ``python -O``.
if not set(_LAZY_EXPORTS).issubset(__all__):
raise AssertionError("every lazy export must be declared in __all__")


def __getattr__(name: str):
    """Resolve a lazily exported helper on attribute access (PEP 562 hook).

    Looks ``name`` up in ``_LAZY_EXPORTS``; on a hit, imports the mapped
    submodule relative to this package and returns the mapped attribute.

    Raises:
        AttributeError: if ``name`` is not a registered lazy export.
    """
    target = _LAZY_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported here so a miss never pays for the import machinery.
    import importlib

    module_name, attr_name = target
    submodule = importlib.import_module(f".{module_name}", __name__)
    return getattr(submodule, attr_name)
110 changes: 90 additions & 20 deletions benchmarks/helpers/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@
PROJECT_ROOT = _HELPERS_DIR.parent.parent


def _load_benchmark_env(benchmark_name: str) -> None:
    """Load global + benchmark .env files (dotenv strips inline comments).

    Files are read in order: ``config/global.env`` first, then
    ``benchmarks/<benchmark_name>/config/<benchmark_name>.env``, both under
    ``PROJECT_ROOT``. Each is loaded with ``override=True`` and a missing
    file is skipped without error.
    """
    from dotenv import load_dotenv

    # Order matters: the benchmark file is loaded last, after the global one.
    env_files = (
        PROJECT_ROOT / "config" / "global.env",
        PROJECT_ROOT / "benchmarks" / benchmark_name / "config" / f"{benchmark_name}.env",
    )
    for env_file in env_files:
        if env_file.exists():
            load_dotenv(env_file, override=True)


# ---------------------------------------------------------------------------
# Git / hash helpers
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -505,18 +517,40 @@ def assemble_compare_bundle(
bundle_root: Path | None = None,
model_envs: dict | None = None,
policies_dir: Path | None = None,
trajectory_dirs: dict[str, list[Path]] | None = None,
trajectory_dirs: dict[str, list[list[Path]]] | None = None,
log_files: dict[str, list[str | Path]] | None = None,
fetch_langfuse: bool = False,
) -> Path:
"""Create a comparison-level bundle directory."""
"""Create a comparison-level bundle directory.

``trajectory_dirs`` maps each config key to a list of RUNS, where each run
is itself a list of trajectory folders (cuga emits one folder per domain).
All folders within a run are merged into a single ``runN/trajectories`` dir.

``log_files`` maps each config key to either a grouped per-run list
(``[[run1 logs], [run2 logs], ...]`` → ``runN/logs``) or a flat list
(legacy / ``"shared"`` key → ``runs/<key>/logs``). Per-run grouping keeps
each run's own console + registry log instead of only the last run's.
"""
benchmark_dir = PROJECT_ROOT / "benchmarks" / benchmark_name
if bundle_root is None:
bundle_root = benchmark_dir / "evaluation_bundles"

timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
models = sorted(set(k.split(":")[0] for k in config_results))
bundle_dir = bundle_root / f"{timestamp}_compare_{'_'.join(models)}"
# Detect inner-dim variants (agent and/or policy mode) so the dir name
# reflects what was compared. Config keys are "model[:agent[:policy_mode]]".
agents = sorted({parts[1] for k in config_results if len(parts := k.split(":")) > 1 and parts[1]})
policy_modes = sorted({parts[2] for k in config_results if len(parts := k.split(":")) > 2 and parts[2]})
suffix_bits = ["_".join(models)]
if len(agents) > 1:
suffix_bits.append("_".join(agents))
if len(policy_modes) > 1:
suffix_bits.append("_vs_".join(policy_modes)) # e.g. "policies_vs_no-policies"
elif len(policy_modes) == 1 and policy_modes[0] == "no-policies":
suffix_bits.append("no-policies")
suffix = "_".join(suffix_bits)
bundle_dir = bundle_root / f"{timestamp}_compare_{suffix}"
bundle_dir.mkdir(parents=True, exist_ok=True)

# Per-run results
Expand All @@ -543,29 +577,48 @@ def assemble_compare_bundle(
# Policies
_copy_policies(bundle_dir, policies_dir)

# Cuga trajectories (per-model, per-run)
# Cuga trajectories (per-model, per-run). `trajectory_dirs[config]` is a
# list of RUNS, and each run is a list of trajectory folders (cuga writes
# one folder per domain). All folders belonging to one eval.sh run are
# merged into that run's single `trajectories/` dir, so one bundle run maps
# to one eval run (all 200 trajectories) rather than one per-domain folder.
if trajectory_dirs:
for config_key, traj_paths in trajectory_dirs.items():
for i, traj_path in enumerate(traj_paths, 1):
traj_path = Path(traj_path)
if not traj_path.exists():
continue
for config_key, run_groups in trajectory_dirs.items():
for i, group in enumerate(run_groups, 1):
run_label = f"{config_key.replace(':', '_')}_run{i}"
_copy_trajectories(
bundle_dir,
traj_path,
dest_subdir=f"runs/{run_label}/trajectories",
)
copied_any = False
for traj_path in group:
traj_path = Path(traj_path)
if not traj_path.exists():
continue
if _copy_trajectories(
bundle_dir,
traj_path,
dest_subdir=f"runs/{run_label}/trajectories",
):
copied_any = True
if not copied_any:
continue
# Copy .progress to run root so cuga-viz can find it
_run_progress = bundle_dir / "runs" / run_label / "trajectories" / ".progress"
if _run_progress.exists():
shutil.copy2(_run_progress, bundle_dir / "runs" / run_label / ".progress")

# Logs (per-model)
# Logs. Two accepted shapes per config key:
# grouped per-run (preferred): [[run1 logs...], [run2 logs...], ...]
# → each run's logs land in runs/<config>_run<i>/logs so every run in a
# multi-run comparison keeps its OWN console/registry log.
# flat (legacy / "shared"): [log, log, ...]
# → placed in runs/<config>/logs (e.g. the "shared" key → runs/shared/logs).
if log_files:
for config_key, lf_list in log_files.items():
run_label = f"{config_key.replace(':', '_')}"
_copy_logs(bundle_dir, lf_list, dest_subdir=f"runs/{run_label}/logs")
for config_key, lf_val in log_files.items():
if lf_val and isinstance(lf_val[0], list):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code tries to detect whether log files are organized as "one list per run" or "one flat list" by checking if the first element is itself a list. If someone accidentally passes an empty nested structure like [[]], the code won't crash immediately but might behave unexpectedly—it'll try to copy logs from an empty group, which could silently fail or do nothing.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checked this — [[]] doesn't actually misbehave: _copy_logs early-returns False on a falsy/empty log_files list (line 250: if not log_files: return False), so the inner empty group is just a no-op — no silent partial copy, no crash. The shape-detection branch correctly routes it to the grouped path and then does nothing for the empty group. Leaving as-is; the existing guard already covers this.

for i, group in enumerate(lf_val, 1):
run_label = f"{config_key.replace(':', '_')}_run{i}"
_copy_logs(bundle_dir, group, dest_subdir=f"runs/{run_label}/logs")
else:
run_label = f"{config_key.replace(':', '_')}"
_copy_logs(bundle_dir, lf_val, dest_subdir=f"runs/{run_label}/logs")

# Langfuse traces (per-model, per-run)
if fetch_langfuse:
Expand Down Expand Up @@ -698,7 +751,12 @@ def cli():
p_cmp.add_argument("--task-files", nargs="*", default=None)
p_cmp.add_argument("--policies-dir", default=None)
p_cmp.add_argument("--model-envs", default=None, help='JSON: {"model": {"MODEL_NAME": "...", ...}}')
p_cmp.add_argument("--trajectory-dirs", default=None, help='JSON: {"model": ["/path/to/traj_run1", ...]}')
p_cmp.add_argument(
"--trajectory-dirs",
default=None,
help='JSON grouped by run: {"model": [["/run1/domA", "/run1/domB"], ["/run2/domA"]]}. '
'A flat {"model": ["/dir1", ...]} is still accepted (each dir treated as its own run).',
)
p_cmp.add_argument(
"--log-files",
default=None,
Expand All @@ -711,6 +769,10 @@ def cli():

args = parser.parse_args()

# Reload benchmark env from disk (dotenv strips inline comments). Shell-sourced
# vars from eval.sh may include trailing comment text in values.
_load_benchmark_env(args.benchmark)

policies_dir = Path(args.policies_dir) if getattr(args, "policies_dir", None) else None

if args.command == "assemble":
Expand Down Expand Up @@ -747,7 +809,15 @@ def cli():
traj_dirs = None
if args.trajectory_dirs:
raw = json.loads(args.trajectory_dirs)
traj_dirs = {k: [Path(p) for p in v] for k, v in raw.items()}
# Accept two shapes:
# grouped (preferred): {config: [[dir, ...run1], [dir, ...run2]]}
# legacy flat: {config: [dir, dir, ...]} -> each dir = 1 run
traj_dirs = {}
for k, v in raw.items():
if v and isinstance(v[0], list):
traj_dirs[k] = [[Path(p) for p in group] for group in v]
else:
traj_dirs[k] = [[Path(p)] for p in v]
log_file_map = None
if args.log_files:
log_file_map = json.loads(args.log_files)
Expand Down
54 changes: 44 additions & 10 deletions benchmarks/helpers/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ OUTPUT_FILE="${OUTPUT_FILE:-}"
DRY_RUN="${DRY_RUN:-false}"
VERBOSE="${VERBOSE:-false}"
MODEL_PROFILE="${MODEL_PROFILE:-}"
CLI_MODEL_NAME="${CLI_MODEL_NAME:-}"
CLI_OPENAI_BASE_URL="${CLI_OPENAI_BASE_URL:-}"
AGENT="${AGENT:-cuga}"
AGENTS="${AGENTS:-}"
COMPARE_AGENTS="${COMPARE_AGENTS:-false}"
Expand Down Expand Up @@ -163,6 +165,14 @@ parse_common_args() {
MODEL_PROFILE="${args[$((idx+1))]}"
idx=$((idx+2))
;;
--model-name)
CLI_MODEL_NAME="${args[$((idx+1))]}"
idx=$((idx+2))
;;
--openai-base-url)
CLI_OPENAI_BASE_URL="${args[$((idx+1))]}"
idx=$((idx+2))
;;
--agent)
AGENT="${args[$((idx+1))]}"
idx=$((idx+2))
Expand Down Expand Up @@ -205,22 +215,46 @@ parse_common_args() {
fi
}

# Source scripts/model_profiles.sh, resolved relative to this file.
# There is no load-once guard: every call re-sources the script, so repeated
# calls are only as idempotent as model_profiles.sh itself.
# Returns 0 after sourcing; prints an error to stderr and returns 1 when the
# script is missing.
_ensure_model_profiles_loaded() {
    local script_dir profiles_script
    script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    profiles_script="$script_dir/../../scripts/model_profiles.sh"
    if [ ! -f "$profiles_script" ]; then
        # Diagnostics go to stderr so callers capturing stdout stay clean.
        echo -e "${RED}Error: model_profiles.sh not found at $profiles_script${NC}" >&2
        return 1
    fi
    # shellcheck source=/dev/null
    source "$profiles_script"
    return 0
}

# Apply the model profile named by $MODEL_PROFILE, if one is set.
# Loading of model_profiles.sh is delegated to _ensure_model_profiles_loaded,
# so the profile is sourced and applied exactly once per call (no second
# inline lookup/source of the same script).
# Returns 1 when the profiles script cannot be loaded; otherwise the status of
# apply_model_profile, or 0 when no profile is requested.
apply_model_profile_if_set() {
    if [ -n "$MODEL_PROFILE" ]; then
        _ensure_model_profiles_loaded || return 1
        apply_model_profile "$MODEL_PROFILE"
    fi
}

# Apply per-run CLI overrides (after profile and .env load).
# For each of MODEL_NAME and OPENAI_BASE_URL, a non-empty CLI_<NAME> value is
# exported under <NAME> and a confirmation line is printed. Empty overrides
# leave the existing value untouched.
apply_model_cli_overrides_if_set() {
    local target cli_var
    for target in MODEL_NAME OPENAI_BASE_URL; do
        cli_var="CLI_${target}"
        # ${!cli_var} is indirect expansion: the value of the variable named by cli_var.
        if [ -n "${!cli_var}" ]; then
            export "${target}=${!cli_var}"
            echo -e "${GREEN}✓${NC} ${target} override: ${!target}"
        fi
    done
}

# Resolve the final model configuration: profile first, then CLI overrides on
# top of it. Call after load_env.sh and arg parsing.
# Returns 1 if the profile step fails; otherwise the status of the override step.
finalize_model_config() {
    if ! apply_model_profile_if_set; then
        return 1
    fi
    apply_model_cli_overrides_if_set
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Build model-envs JSON for bundle CLI.
# Usage: build_model_envs_json model1 model2 ...
# Applies each profile, captures env vars, and outputs JSON to stdout.
Expand Down
Loading
Loading