From 43e77ac93813935e401e7d562e212233293124a9 Mon Sep 17 00:00:00 2001 From: Wojciech Prazuch Date: Mon, 18 May 2026 14:54:03 -0700 Subject: [PATCH 1/6] feat(benchmarks): add ToolSandbox multi-turn tool-use benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ToolSandbox (https://github.com/apple/ToolSandbox) evaluates stateful, multi-turn tool-use with a user simulator — the standard seed/solve/verify loop does not apply. This integration runs the full benchmark inside a dedicated Docker container via run_batch() and parses result_summary.json into the standard NEL bundle format. Key design choices: - ToolSandboxEnvironment subclasses EvalEnvironment and overrides run_batch() - Both agent and user simulator call the NVIDIA Inference API (no OpenAI key) - docker/toolsandbox_entrypoint.py patches AGENT_TYPE_TO_FACTORY[Gorilla] and USER_TYPE_TO_FACTORY[GPT_4_o_2024_05_13] with NVIDIANIMAgent/NVIDIANIMUser, which use native function-calling format and read NVIDIA_BASE_URL from env - URL normalization strips /chat/completions from NEL service URLs before passing to the OpenAI SDK base_url Files added: src/nemo_evaluator/benchmarks/toolsandbox.py docker/Dockerfile.toolsandbox docker/toolsandbox_entrypoint.py examples/configs/toolsandbox.yaml tests/test_environments/test_toolsandbox.py (21 offline tests, all green) Signed-off-by: Wojciech Prazuch --- docker/Dockerfile.toolsandbox | 35 +++ docker/toolsandbox_entrypoint.py | 87 ++++++ examples/configs/toolsandbox.yaml | 43 +++ src/nemo_evaluator/benchmarks/__init__.py | 1 + src/nemo_evaluator/benchmarks/toolsandbox.py | 257 ++++++++++++++++++ tests/test_environments/test_toolsandbox.py | 268 +++++++++++++++++++ 6 files changed, 691 insertions(+) create mode 100644 docker/Dockerfile.toolsandbox create mode 100644 docker/toolsandbox_entrypoint.py create mode 100644 examples/configs/toolsandbox.yaml create mode 100644 src/nemo_evaluator/benchmarks/toolsandbox.py create mode 
100644 tests/test_environments/test_toolsandbox.py diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox new file mode 100644 index 000000000..a96b69614 --- /dev/null +++ b/docker/Dockerfile.toolsandbox @@ -0,0 +1,35 @@ +# ToolSandbox evaluation container for NEL Next. +# +# Bundles Apple's ToolSandbox benchmark with a custom NVIDIA NIM agent/user +# that routes model calls through any OpenAI-compatible endpoint. +# +# Build: +# docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . +# +# Pin to a specific commit for reproducible builds: +# docker build -f docker/Dockerfile.toolsandbox \ +# --build-arg TOOLSANDBOX_REF= \ +# -t toolsandbox-nel: . +# +# Required env vars at runtime (injected by ToolSandboxEnvironment.run_batch): +# NVIDIA_BASE_URL – OpenAI-compatible base URL +# NVIDIA_API_KEY – API key +# NVIDIA_AGENT_MODEL – Model ID for the agent under evaluation +# NVIDIA_USER_MODEL – Model ID for the user simulator + +FROM python:3.11-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + +ARG TOOLSANDBOX_REF=main + +# ToolSandbox pins openai==1.17.0 and other specific versions — install in +# a clean environment separate from any NEL dependencies. +RUN pip install --no-cache-dir \ + "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" + +COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py + +ENTRYPOINT ["python", "/opt/toolsandbox_entrypoint.py"] diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py new file mode 100644 index 000000000..c8eea9fd4 --- /dev/null +++ b/docker/toolsandbox_entrypoint.py @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
def _register_nvidia_roles() -> None:
    """Replace the Gorilla agent and GPT-4o user factories with NVIDIA-backed roles.

    The existing ``RoleImplType`` keys are reused so the CLI accepts
    ``--agent Gorilla --user GPT_4_o_2024_05_13`` without modification.

    Reads ``NVIDIA_BASE_URL``, ``NVIDIA_API_KEY`` and ``NVIDIA_AGENT_MODEL``
    (all required) plus the optional ``NVIDIA_USER_MODEL``; a missing or blank
    user model falls back to ``meta/llama-3.1-70b-instruct``.

    Raises:
        RuntimeError: If a required environment variable is unset or blank.
    """
    # Imported lazily so that importing this module does not require the
    # ToolSandbox / OpenAI packages to be installed.
    from openai import OpenAI
    from tool_sandbox.cli.utils import AGENT_TYPE_TO_FACTORY, RoleImplType, USER_TYPE_TO_FACTORY
    from tool_sandbox.roles.openai_api_agent import OpenAIAPIAgent
    from tool_sandbox.roles.openai_api_user import OpenAIAPIUser

    base_url = _require_env("NVIDIA_BASE_URL")
    api_key = _require_env("NVIDIA_API_KEY")
    agent_model = _require_env("NVIDIA_AGENT_MODEL")
    # The variable may be exported as an empty string; treat blank like unset
    # (same whitespace handling as _require_env) instead of sending "" as the
    # model name.
    user_model = os.environ.get("NVIDIA_USER_MODEL", "").strip() or "meta/llama-3.1-70b-instruct"

    # NOTE(review): the parent role constructors presumably build a default
    # OpenAI client, which refuses to initialise when no API key can be
    # resolved — confirm against the pinned ToolSandbox ref. A non-secret
    # placeholder keeps super().__init__() from failing; that client is
    # replaced immediately below and the real key is never exposed to it.
    os.environ.setdefault("OPENAI_API_KEY", "unused-placeholder")

    def _client() -> OpenAI:
        return OpenAI(base_url=base_url, api_key=api_key)

    class NVIDIANIMAgent(OpenAIAPIAgent):
        model_name: str = agent_model

        def __init__(self) -> None:
            super().__init__()
            # Swap the client created by the parent for one bound to NVIDIA_BASE_URL.
            self.openai_client = _client()

    class NVIDIANIMUser(OpenAIAPIUser):
        model_name: str = user_model

        def __init__(self) -> None:
            super().__init__()
            # Swap the client created by the parent for one bound to NVIDIA_BASE_URL.
            self.openai_client = _client()

    AGENT_TYPE_TO_FACTORY[RoleImplType.Gorilla] = NVIDIANIMAgent
    USER_TYPE_TO_FACTORY[RoleImplType.GPT_4_o_2024_05_13] = NVIDIANIMUser
+# +# Run: nel eval run examples/configs/toolsandbox.yaml + +services: + nemotron: + type: api + url: https://integrate.api.nvidia.com/v1/chat/completions + protocol: chat_completions + model: nvidia/nemotron-3-super-120b-a12b + api_key: ${NVIDIA_API_KEY} + +benchmarks: + - name: toolsandbox + params: + image: toolsandbox-nel:latest + # Model used as user simulator (must be available on the same API) + user_model: meta/llama-3.1-70b-instruct + # Number of scenarios to run in parallel inside the container + parallel: 4 + # Set to true to run only a small predefined subset for quick validation + test_mode: false + # Specific scenarios to run — omit or set [] to run the full suite + # scenarios: [wifi_off, cellular_off, make_call] + solver: + type: simple + service: nemotron + timeout: 7200.0 + +output: + dir: ./results/toolsandbox + report: [markdown, json] diff --git a/src/nemo_evaluator/benchmarks/__init__.py b/src/nemo_evaluator/benchmarks/__init__.py index 3b2f9cd4b..de2b3879b 100644 --- a/src/nemo_evaluator/benchmarks/__init__.py +++ b/src/nemo_evaluator/benchmarks/__init__.py @@ -33,5 +33,6 @@ import nemo_evaluator.benchmarks.simpleqa # noqa: F401 import nemo_evaluator.benchmarks.terminal_bench_hard # noqa: F401 import nemo_evaluator.benchmarks.terminal_bench_v1 # noqa: F401 +import nemo_evaluator.benchmarks.toolsandbox # noqa: F401 import nemo_evaluator.benchmarks.triviaqa # noqa: F401 import nemo_evaluator.benchmarks.xstest # noqa: F401 diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py new file mode 100644 index 000000000..53c147cee --- /dev/null +++ b/src/nemo_evaluator/benchmarks/toolsandbox.py @@ -0,0 +1,257 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ToolSandbox benchmark — Apple's stateful multi-turn tool-use evaluation. + +Registers ``toolsandbox`` as a built-in benchmark. Bypasses the standard +seed/solve/verify loop via ``run_batch()``, which spawns a pre-built Docker +image containing ToolSandbox and parses the resulting ``result_summary.json``. + +Config usage:: + + benchmarks: + - name: toolsandbox + params: + image: toolsandbox-nel:latest # pre-built Docker image + user_model: meta/llama-3.1-70b-instruct # user-simulator model + parallel: 4 # concurrent scenarios + test_mode: false # true = small subset only + scenarios: [] # [] = all scenarios + solver: + type: simple + service: my_model + timeout: 7200.0 + +Build the image once before running:: + + docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . + +The agent under evaluation is taken from the ``solver.service`` entry. +The user simulator calls the same NVIDIA API base URL with ``user_model``. +Both require ``NVIDIA_API_KEY`` in the environment. 
def _to_openai_base_url(url: str) -> str:
    """Normalize a NEL service URL to the OpenAI SDK ``base_url`` format.

    NEL service URLs include the full endpoint path (e.g.
    ``/v1/chat/completions``) while the OpenAI SDK expects only the base
    (e.g. ``https://host/v1``). Surrounding whitespace and trailing slashes
    are removed *before* the endpoint suffix is matched, so
    ``https://host/v1/chat/completions/`` also normalizes to
    ``https://host/v1``.

    Args:
        url: Service URL, with or without an endpoint suffix.

    Returns:
        The URL without endpoint suffix and without trailing slashes.
    """
    base = url.strip().rstrip("/")
    # "/chat/completions" must be tested before its own suffix "/completions".
    for suffix in ("/chat/completions", "/completions", "/responses"):
        if base.endswith(suffix):
            return base[: -len(suffix)].rstrip("/")
    return base
async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = None) -> dict[str, Any]:
    """Run the whole ToolSandbox suite in one container and return the bundle.

    Args:
        solver: Not referenced by this method.
        config: Optional mapping; ``base_url``, ``model`` and ``api_key`` are
            read from it. A missing ``base_url`` falls back to the default
            NVIDIA endpoint and a missing ``api_key`` to ``NVIDIA_API_KEY``
            from the environment.

    Returns:
        The result bundle produced by ``_parse_results``; the container exit
        code is included so callers can detect failed or timed-out runs.
    """
    config = config or {}
    model_url = config.get("base_url", "") or _DEFAULT_BASE_URL
    model_id = config.get("model", "")
    api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY", "")

    base_url = _to_openai_base_url(model_url)

    with tempfile.TemporaryDirectory(prefix="nel_toolsandbox_") as tmpdir:
        output_dir = Path(tmpdir) / "output"
        output_dir.mkdir()

        # The temp directory name is unique per run and starts with the
        # "nel_toolsandbox_" prefix, so it doubles as a container name that
        # lets us stop the container explicitly on timeout.
        container_name = Path(tmpdir).name
        cmd = self._build_docker_cmd(
            output_dir, base_url, model_id, api_key, container_name=container_name
        )
        logger.info("Launching ToolSandbox: %s", " ".join(cmd[:10]) + " ...")

        # The secret travels through the docker CLI's environment, never
        # through argv (which any local user can read from the process list).
        child_env = dict(os.environ)
        if api_key:
            child_env["NVIDIA_API_KEY"] = api_key

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            # stdout is never inspected; do not buffer hours of output in memory.
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
            env=child_env,
        )
        try:
            _, stderr = await asyncio.wait_for(proc.communicate(), timeout=self._timeout)
        except asyncio.TimeoutError:
            proc.kill()
            await proc.wait()
            # SIGKILL on the docker CLI is not forwarded to the container,
            # so stop the container itself or it keeps running.
            await self._kill_container(container_name)
            logger.error("ToolSandbox timed out after %.0fs", self._timeout)
            stderr = b"timeout"

        rc = proc.returncode or 0
        if rc != 0:
            logger.error(
                "ToolSandbox container exited %d:\n%s",
                rc,
                (stderr or b"").decode(errors="replace")[:2000],
            )

        # Parse inside the ``with`` block: the output directory is deleted on exit.
        return self._parse_results(output_dir, rc, model_id)

@staticmethod
async def _kill_container(name: str) -> None:
    """Best-effort ``docker kill`` of the named container; never raises on OSError."""
    try:
        killer = await asyncio.create_subprocess_exec(
            "docker",
            "kill",
            name,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await killer.wait()
    except OSError as exc:
        logger.warning("Could not stop container %s: %s", name, exc)

# ------------------------------------------------------------------
# Docker command builder
# ------------------------------------------------------------------

def _build_docker_cmd(
    self,
    output_dir: Path,
    base_url: str,
    model_id: str,
    api_key: str,
    container_name: str | None = None,
) -> list[str]:
    """Build the ``docker run`` argv for one ToolSandbox batch.

    Args:
        output_dir: Host directory mounted at the container output path.
        base_url: OpenAI-compatible base URL exported as ``NVIDIA_BASE_URL``.
        model_id: Agent model exported as ``NVIDIA_AGENT_MODEL``.
        api_key: When non-empty, ``NVIDIA_API_KEY`` is forwarded *by name
            only*; the caller must provide its value in the environment of
            the ``docker`` process. When empty, the variable is not
            mentioned at all.
        container_name: Optional ``--name`` for the container.

    Returns:
        The argv list; it never contains the API key value.
    """
    cmd = ["docker", "run", "--rm"]
    if container_name:
        cmd.extend(["--name", container_name])
    cmd.extend(
        [
            "-v", f"{output_dir}:{_CONTAINER_OUTPUT}",
            "-e", f"NVIDIA_BASE_URL={base_url}",
            "-e", f"NVIDIA_AGENT_MODEL={model_id}",
            "-e", f"NVIDIA_USER_MODEL={self._user_model}",
        ]
    )
    if api_key:
        # "-e NAME" without "=value" makes docker copy NAME from its own
        # environment, keeping the secret out of the command line.
        cmd.extend(["-e", "NVIDIA_API_KEY"])

    cmd.append(self._image)

    cmd.extend(["--agent", _CLI_AGENT, "--user", _CLI_USER])
    cmd.extend(["--output_dir", _CONTAINER_OUTPUT])
    cmd.extend(["--parallel", str(self._parallel)])

    # test_mode takes precedence over an explicit scenario list.
    if self._test_mode:
        cmd.append("--test_mode")
    elif self._scenarios:
        cmd.extend(["--scenarios", *self._scenarios])

    return cmd
@staticmethod
def _extract_scores(summary: dict[str, Any]) -> dict[str, Any]:
    """Flatten a parsed result summary into ``{metric_name: {"value": float}}``.

    Top-level ``similarity`` and ``turn_count`` are copied when present, and
    each ``per_category`` entry that carries a ``similarity`` becomes
    ``per_category/<name>/similarity``. Values are rounded to 4 decimals.

    Values that cannot be converted to ``float`` (e.g. ``null`` in the JSON)
    are skipped with a warning instead of raising, so one malformed metric
    does not discard the rest of a long-running evaluation.

    Args:
        summary: Parsed ``result_summary.json`` content (may be empty).

    Returns:
        Mapping of metric name to ``{"value": <float>}``.
    """
    log = logging.getLogger(__name__)
    scores: dict[str, Any] = {}

    def _record(name: str, raw: Any) -> None:
        # Store the metric only when it is numeric.
        try:
            scores[name] = {"value": round(float(raw), 4)}
        except (TypeError, ValueError):
            log.warning("Ignoring non-numeric ToolSandbox metric %s=%r", name, raw)

    for metric in ("similarity", "turn_count"):
        if metric in summary:
            _record(metric, summary[metric])

    per_category = summary.get("per_category") or {}
    if not isinstance(per_category, dict):
        log.warning("Ignoring unexpected per_category payload of type %s", type(per_category).__name__)
        per_category = {}

    for cat_name, cat_data in per_category.items():
        if isinstance(cat_data, dict) and "similarity" in cat_data:
            _record(f"per_category/{cat_name}/similarity", cat_data["similarity"])

    return scores
def test_default_construction() -> None:
    """A parameterless environment carries the documented defaults."""
    env = ToolSandboxEnvironment()
    observed = (env._image, env._user_model, env._scenarios, env._parallel)
    assert observed[0] == "toolsandbox-nel:latest"
    assert observed[1] == "meta/llama-3.1-70b-instruct"
    assert observed[2] == []
    assert observed[3] == 4
    assert not env._test_mode
def test_docker_cmd_no_scenarios() -> None:
    """Default settings select the patched roles and add no scenario/test flags."""
    # Function-scope import replaces the opaque ``__import__("pathlib")`` call.
    from pathlib import Path

    env = ToolSandboxEnvironment()
    cmd = env._build_docker_cmd(
        output_dir=Path("/tmp/output"),
        base_url="https://integrate.api.nvidia.com/v1",
        model_id="nvidia/nemotron-3-super-120b-a12b",
        api_key="test-key",
    )
    # The executable must be the first argv element, not merely present.
    assert cmd[0] == "docker"
    assert "--agent" in cmd
    assert "Gorilla" in cmd
    assert "--user" in cmd
    assert "GPT_4_o_2024_05_13" in cmd
    assert "--scenarios" not in cmd
    assert "--test_mode" not in cmd
    assert "NVIDIA_AGENT_MODEL=nvidia/nemotron-3-super-120b-a12b" in " ".join(cmd)
def test_extract_scores_full() -> None:
    """Top-level metrics and every per-category similarity are surfaced."""
    payload = {
        "similarity": 0.72,
        "turn_count": 4.3,
        "per_category": {
            "single_tool_call": {"similarity": 0.91, "count": 45},
            "multiple_tool_call": {"similarity": 0.61, "count": 20},
        },
    }
    result = ToolSandboxEnvironment._extract_scores(payload)

    expected_values = (
        ("similarity", 0.72),
        ("turn_count", 4.3),
        ("per_category/single_tool_call/similarity", 0.91),
    )
    for key, value in expected_values:
        assert key in result
        assert result[key]["value"] == pytest.approx(value, abs=1e-4)
    assert "per_category/multiple_tool_call/similarity" in result
def test_parse_results_bundle_keys(tmp_path) -> None:
    """The parsed bundle exposes benchmark, config and exit-code sections."""
    summary_file = tmp_path / "run_1" / "result_summary.json"
    summary_file.parent.mkdir()
    summary_file.write_text(json.dumps({"similarity": 0.7, "turn_count": 5.0}))

    bundle = ToolSandboxEnvironment()._parse_results(tmp_path, exit_code=0, model_id="my-model")

    for section in ("benchmark", "config"):
        assert section in bundle
    benchmark = bundle["benchmark"]
    cfg = bundle["config"]
    assert benchmark["name"] == "toolsandbox"
    assert benchmark["scores"]["similarity"]["value"] == pytest.approx(0.7)
    assert cfg["framework"] == "toolsandbox"
    assert cfg["model"] == "my-model"
    assert bundle["_container_exit_code"] == 0
ToolSandboxEnvironment: - apptainer: runs toolsandbox-nel.sif via `apptainer run` (SLURM with apptainer, no Docker needed) - subprocess: runs toolsandbox_entrypoint.py directly as a Python subprocess (eval container has ToolSandbox pre-installed in a venv, zero nesting) Also adds Dockerfile.toolsandbox-combined (ToolSandbox in isolated /opt/toolsandbox-venv alongside NEL Next) and a SLURM example config that uses the subprocess runner. 24 offline tests, all green. Signed-off-by: Wojciech Prazuch --- docker/Dockerfile.toolsandbox-combined | 46 ++++++ examples/configs/toolsandbox_slurm.yaml | 56 +++++++ src/nemo_evaluator/benchmarks/toolsandbox.py | 156 +++++++++++++------ tests/test_environments/test_toolsandbox.py | 104 +++++++++++-- 4 files changed, 302 insertions(+), 60 deletions(-) create mode 100644 docker/Dockerfile.toolsandbox-combined create mode 100644 examples/configs/toolsandbox_slurm.yaml diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined new file mode 100644 index 000000000..8c811e88e --- /dev/null +++ b/docker/Dockerfile.toolsandbox-combined @@ -0,0 +1,46 @@ +# ToolSandbox + NEL Next combined evaluation container. +# +# Used for SLURM runs where nested Docker is unavailable. ToolSandbox is +# installed in an isolated venv (/opt/toolsandbox-venv) so its pinned +# dependencies (openai==1.17.0, etc.) do not conflict with NEL Next. +# +# Build: +# docker build -f docker/Dockerfile.toolsandbox-combined \ +# -t toolsandbox-nel-combined:latest . 
+# +# Convert to squashfs for SLURM (run on a login node with Docker): +# enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \ +# dockerd://toolsandbox-nel-combined:latest +# +# NEL Next config (subprocess runner): +# benchmarks: +# - name: toolsandbox +# params: +# runner: subprocess +# python_exe: /opt/toolsandbox-venv/bin/python +# entrypoint: /opt/toolsandbox_entrypoint.py +# +# Required env vars at runtime: +# NVIDIA_BASE_URL – OpenAI-compatible base URL +# NVIDIA_API_KEY – API key +# NVIDIA_AGENT_MODEL – model ID for the agent under evaluation +# NVIDIA_USER_MODEL – model ID for user simulator + +ARG BASE_IMAGE=nemo-evaluator + +FROM ${BASE_IMAGE} + +# Install Python 3.11 for the ToolSandbox venv (avoids openai version conflicts +# with NEL Next which uses the system Python 3.12) +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3.11 python3.11-venv \ + && rm -rf /var/lib/apt/lists/* + +ARG TOOLSANDBOX_REF=main + +# Create an isolated venv for ToolSandbox with its pinned deps +RUN python3.11 -m venv /opt/toolsandbox-venv \ + && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \ + "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" + +COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml new file mode 100644 index 000000000..cc73df779 --- /dev/null +++ b/examples/configs/toolsandbox_slurm.yaml @@ -0,0 +1,56 @@ +# Flavor: ToolSandbox on SLURM (subprocess runner — no nested Docker) +# +# Uses the subprocess runner so ToolSandbox (pre-installed in +# /opt/toolsandbox-venv inside the eval container) runs directly without +# a nested Docker call. Suitable for any SLURM cluster. 
+# +# Prerequisites: +# - NVIDIA_API_KEY set in environment (or in cluster.container_env) +# - toolsandbox-nel-combined squashfs on shared storage: +# docker build -f docker/Dockerfile.toolsandbox-combined \ +# -t toolsandbox-nel-combined:latest . +# enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \ +# dockerd://toolsandbox-nel-combined:latest +# - SSH access to the SLURM login node +# +# Dry-run: nel eval run examples/configs/toolsandbox_slurm.yaml --dry-run +# Submit: nel eval run examples/configs/toolsandbox_slurm.yaml --submit + +services: + nemotron: + type: api + url: https://integrate.api.nvidia.com/v1/chat/completions + protocol: chat_completions + model: nvidia/nemotron-3-super-120b-a12b + api_key: ${NVIDIA_API_KEY} + +benchmarks: + - name: toolsandbox + params: + runner: subprocess + python_exe: /opt/toolsandbox-venv/bin/python + entrypoint: /opt/toolsandbox_entrypoint.py + user_model: meta/llama-3.1-70b-instruct + parallel: 8 + test_mode: false + solver: + type: simple + service: nemotron + timeout: 14400.0 + +output: + dir: ./results/toolsandbox_slurm + report: [markdown, json] + +cluster: + type: slurm + hostname: ${SLURM_LOGIN_HOST} + account: ${SLURM_ACCOUNT} + walltime: "04:00:00" + eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh + container_env: + NVIDIA_API_KEY: ${NVIDIA_API_KEY} + node_pools: + default: + partition: cpu + nodes: 1 diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py index 53c147cee..bc49b2448 100644 --- a/src/nemo_evaluator/benchmarks/toolsandbox.py +++ b/src/nemo_evaluator/benchmarks/toolsandbox.py @@ -15,31 +15,53 @@ """ToolSandbox benchmark — Apple's stateful multi-turn tool-use evaluation. Registers ``toolsandbox`` as a built-in benchmark. Bypasses the standard -seed/solve/verify loop via ``run_batch()``, which spawns a pre-built Docker -image containing ToolSandbox and parses the resulting ``result_summary.json``. 
+seed/solve/verify loop via ``run_batch()``, which runs ToolSandbox in one of +three modes and parses the resulting ``result_summary.json``. + +Runner modes +------------ +docker (default) + Spawns a pre-built Docker image. Requires Docker on the eval host. + + Build the image once:: + + docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . + +apptainer + Same image as ``docker`` mode but executed via ``apptainer run``. + Use on SLURM clusters where Docker is unavailable. ``image`` should be + a path to a ``.sif`` or ``.sqsh`` file on the shared filesystem. + +subprocess + Runs the ToolSandbox entrypoint directly as a Python subprocess — no + container needed. Use when the eval container already has ToolSandbox + pre-installed (e.g. ``Dockerfile.toolsandbox-combined``). Set + ``python_exe`` to the venv Python that has ToolSandbox, and + ``entrypoint`` to the patch script path. Config usage:: benchmarks: - name: toolsandbox params: - image: toolsandbox-nel:latest # pre-built Docker image - user_model: meta/llama-3.1-70b-instruct # user-simulator model - parallel: 4 # concurrent scenarios - test_mode: false # true = small subset only - scenarios: [] # [] = all scenarios + # --- runner selection --- + runner: docker # docker | apptainer | subprocess + image: toolsandbox-nel:latest # docker image name / sif path + # --- subprocess-mode overrides --- + python_exe: /opt/toolsandbox-venv/bin/python + entrypoint: /opt/toolsandbox_entrypoint.py + # --- benchmark settings --- + user_model: meta/llama-3.1-70b-instruct # user-simulator model + parallel: 4 # concurrent scenarios + test_mode: false # true = small subset only + scenarios: [] # [] = all scenarios solver: type: simple service: my_model timeout: 7200.0 -Build the image once before running:: - - docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . - -The agent under evaluation is taken from the ``solver.service`` entry. 
-The user simulator calls the same NVIDIA API base URL with ``user_model``. -Both require ``NVIDIA_API_KEY`` in the environment. +Both agent and user simulator call the NVIDIA Inference API — no OpenAI +key required. ``NVIDIA_API_KEY`` must be set in the environment. """ from __future__ import annotations @@ -48,9 +70,10 @@ import json import logging import os +import sys import tempfile from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult from nemo_evaluator.environments.registry import register @@ -63,21 +86,15 @@ _DEFAULT_IMAGE = "toolsandbox-nel:latest" _DEFAULT_USER_MODEL = "meta/llama-3.1-70b-instruct" _DEFAULT_BASE_URL = "https://integrate.api.nvidia.com/v1" +_DEFAULT_ENTRYPOINT = "/opt/toolsandbox_entrypoint.py" _CONTAINER_OUTPUT = "/output" -# ToolSandbox uses --agent Gorilla / --user GPT_4_o_2024_05_13 as the CLI -# selectors; the entrypoint script patches those factory entries to point at -# NVIDIANIMAgent / NVIDIANIMUser backed by NVIDIA_BASE_URL. _CLI_AGENT = "Gorilla" _CLI_USER = "GPT_4_o_2024_05_13" def _to_openai_base_url(url: str) -> str: - """Normalize a NEL service URL to the OpenAI SDK base_url format. - - NEL service URLs include the full path (e.g. /v1/chat/completions). - ToolSandbox expects just the base (e.g. https://host/v1). - """ + """Strip /chat/completions, /completions, /responses path suffix from NEL service URLs.""" for suffix in ("/chat/completions", "/completions", "/responses"): if url.endswith(suffix): return url[: -len(suffix)] @@ -86,15 +103,18 @@ def _to_openai_base_url(url: str) -> str: @register("toolsandbox") class ToolSandboxEnvironment(EvalEnvironment): - """Runs ToolSandbox in a Docker container and parses aggregate metrics. + """Runs ToolSandbox and parses aggregate metrics. - The entire scenario suite executes as a single batch inside the container. 
+ The entire scenario suite executes as a single batch. ``seed()`` and ``verify()`` are not used. """ def __init__( self, + runner: Literal["docker", "apptainer", "subprocess"] = "docker", image: str = _DEFAULT_IMAGE, + python_exe: str | None = None, + entrypoint: str = _DEFAULT_ENTRYPOINT, user_model: str = _DEFAULT_USER_MODEL, scenarios: list[str] | None = None, parallel: int = 4, @@ -102,7 +122,10 @@ def __init__( test_mode: bool = False, ) -> None: super().__init__() + self._runner = runner self._image = image + self._python_exe = python_exe or sys.executable + self._entrypoint = entrypoint self._user_model = user_model self._scenarios: list[str] = scenarios or [] self._parallel = parallel @@ -144,13 +167,14 @@ async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = No output_dir = Path(tmpdir) / "output" output_dir.mkdir() - cmd = self._build_docker_cmd(output_dir, base_url, model_id, api_key) - logger.info("Launching ToolSandbox: %s", " ".join(cmd[:10]) + " ...") + cmd, env = self._build_cmd(output_dir, base_url, model_id, api_key) + logger.info("Launching ToolSandbox (%s): %s", self._runner, " ".join(cmd[:10]) + " ...") proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, + env=env, ) try: stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=self._timeout) @@ -163,7 +187,7 @@ async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = No rc = proc.returncode or 0 if rc != 0: logger.error( - "ToolSandbox container exited %d:\n%s", + "ToolSandbox exited %d:\n%s", rc, (stderr or b"").decode(errors="replace")[:2000], ) @@ -171,39 +195,82 @@ async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = No return self._parse_results(output_dir, rc, model_id) # ------------------------------------------------------------------ - # Docker command builder + # Command builders # 
------------------------------------------------------------------ - def _build_docker_cmd( + def _build_cmd( self, output_dir: Path, base_url: str, model_id: str, api_key: str, - ) -> list[str]: - cmd = [ - "docker", "run", "--rm", - "-v", f"{output_dir}:{_CONTAINER_OUTPUT}", + ) -> tuple[list[str], dict[str, str] | None]: + """Return (cmd, env) for the selected runner.""" + if self._runner == "subprocess": + return self._build_subprocess_cmd(output_dir, base_url, model_id, api_key) + if self._runner == "apptainer": + return self._build_apptainer_cmd(output_dir, base_url, model_id, api_key), None + return self._build_docker_cmd(output_dir, base_url, model_id, api_key), None + + def _toolsandbox_cli_args(self, output_dir_str: str) -> list[str]: + args = ["--agent", _CLI_AGENT, "--user", _CLI_USER] + args.extend(["--output_dir", output_dir_str]) + args.extend(["--parallel", str(self._parallel)]) + if self._test_mode: + args.append("--test_mode") + elif self._scenarios: + args.extend(["--scenarios"] + list(self._scenarios)) + return args + + def _container_env_flags(self, base_url: str, model_id: str, api_key: str) -> list[str]: + flags = [ "-e", f"NVIDIA_BASE_URL={base_url}", "-e", f"NVIDIA_AGENT_MODEL={model_id}", "-e", f"NVIDIA_USER_MODEL={self._user_model}", ] if api_key: - cmd.extend(["-e", f"NVIDIA_API_KEY={api_key}"]) + flags.extend(["-e", f"NVIDIA_API_KEY={api_key}"]) + return flags + def _build_docker_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]: + cmd = ["docker", "run", "--rm", "-v", f"{output_dir}:{_CONTAINER_OUTPUT}"] + cmd.extend(self._container_env_flags(base_url, model_id, api_key)) cmd.append(self._image) + cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT)) + return cmd - cmd.extend(["--agent", _CLI_AGENT, "--user", _CLI_USER]) - cmd.extend(["--output_dir", _CONTAINER_OUTPUT]) - cmd.extend(["--parallel", str(self._parallel)]) - - if self._test_mode: - cmd.append("--test_mode") - elif self._scenarios: - 
cmd.extend(["--scenarios"] + list(self._scenarios)) + def _build_apptainer_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]: + env_flags: list[str] = [] + for flag in self._container_env_flags(base_url, model_id, api_key): + if flag == "-e": + continue + env_flags.extend(["--env", flag]) + cmd = [ + "apptainer", "run", "--bind", f"{output_dir}:{_CONTAINER_OUTPUT}", + *env_flags, + self._image, + ] + cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT)) return cmd + def _build_subprocess_cmd( + self, output_dir: Path, base_url: str, model_id: str, api_key: str + ) -> tuple[list[str], dict[str, str]]: + # Env is passed via environment variables to the subprocess + env = { + **os.environ, + "NVIDIA_BASE_URL": base_url, + "NVIDIA_AGENT_MODEL": model_id, + "NVIDIA_USER_MODEL": self._user_model, + } + if api_key: + env["NVIDIA_API_KEY"] = api_key + + cmd = [self._python_exe, self._entrypoint] + cmd.extend(self._toolsandbox_cli_args(str(output_dir))) + return cmd, env + # ------------------------------------------------------------------ # Results parsing # ------------------------------------------------------------------ @@ -220,7 +287,8 @@ def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dic }, "config": { "benchmark": self.name, - "image": self._image, + "runner": self._runner, + "image": self._image if self._runner != "subprocess" else None, "model": model_id, "user_model": self._user_model, "framework": "toolsandbox", diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py index 241152e19..9f2006734 100644 --- a/tests/test_environments/test_toolsandbox.py +++ b/tests/test_environments/test_toolsandbox.py @@ -75,6 +75,7 @@ def test_to_openai_base_url(url: str, expected: str) -> None: def test_default_construction() -> None: env = ToolSandboxEnvironment() + assert env._runner == "docker" assert env._image == "toolsandbox-nel:latest" assert env._user_model 
== "meta/llama-3.1-70b-instruct" assert env._scenarios == [] @@ -84,13 +85,16 @@ def test_default_construction() -> None: def test_custom_params() -> None: env = ToolSandboxEnvironment( + runner="subprocess", image="toolsandbox-nel:v1.2", + python_exe="/opt/toolsandbox-venv/bin/python", user_model="meta/llama-3.1-8b-instruct", scenarios=["wifi_off", "cellular_off"], parallel=8, test_mode=True, ) - assert env._image == "toolsandbox-nel:v1.2" + assert env._runner == "subprocess" + assert env._python_exe == "/opt/toolsandbox-venv/bin/python" assert env._scenarios == ["wifi_off", "cellular_off"] assert env._parallel == 8 assert env._test_mode @@ -100,16 +104,18 @@ def test_custom_params() -> None: # Docker command construction # --------------------------------------------------------------------------- +_P = __import__("pathlib").Path + def test_docker_cmd_no_scenarios() -> None: - env = ToolSandboxEnvironment() - cmd = env._build_docker_cmd( - output_dir=__import__("pathlib").Path("/tmp/output"), + env = ToolSandboxEnvironment(runner="docker") + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), base_url="https://integrate.api.nvidia.com/v1", model_id="nvidia/nemotron-3-super-120b-a12b", api_key="test-key", ) - assert "docker" in cmd + assert cmd[0] == "docker" assert "--agent" in cmd assert "Gorilla" in cmd assert "--user" in cmd @@ -120,9 +126,9 @@ def test_docker_cmd_no_scenarios() -> None: def test_docker_cmd_with_scenarios() -> None: - env = ToolSandboxEnvironment(scenarios=["wifi_off", "make_call"]) - cmd = env._build_docker_cmd( - output_dir=__import__("pathlib").Path("/tmp/output"), + env = ToolSandboxEnvironment(runner="docker", scenarios=["wifi_off", "make_call"]) + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), base_url="https://integrate.api.nvidia.com/v1", model_id="test-model", api_key="key", @@ -133,9 +139,9 @@ def test_docker_cmd_with_scenarios() -> None: def test_docker_cmd_test_mode() -> None: - env = 
ToolSandboxEnvironment(test_mode=True) - cmd = env._build_docker_cmd( - output_dir=__import__("pathlib").Path("/tmp/output"), + env = ToolSandboxEnvironment(runner="docker", test_mode=True) + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), base_url="https://integrate.api.nvidia.com/v1", model_id="test-model", api_key="key", @@ -145,15 +151,80 @@ def test_docker_cmd_test_mode() -> None: def test_docker_cmd_no_api_key() -> None: - env = ToolSandboxEnvironment() - cmd = env._build_docker_cmd( - output_dir=__import__("pathlib").Path("/tmp/output"), + env = ToolSandboxEnvironment(runner="docker") + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="test-model", + api_key="", + ) + assert "NVIDIA_API_KEY" not in " ".join(cmd) + + +# --------------------------------------------------------------------------- +# Apptainer command construction +# --------------------------------------------------------------------------- + + +def test_apptainer_cmd_basics() -> None: + env = ToolSandboxEnvironment(runner="apptainer", image="/shared/nel/toolsandbox.sif") + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="test-model", + api_key="key", + ) + assert cmd[0] == "apptainer" + assert "run" in cmd + assert "--bind" in cmd + assert "/shared/nel/toolsandbox.sif" in cmd + assert "--agent" in cmd + + +# --------------------------------------------------------------------------- +# Subprocess command construction +# --------------------------------------------------------------------------- + + +def test_subprocess_cmd_basics() -> None: + env = ToolSandboxEnvironment( + runner="subprocess", + python_exe="/opt/toolsandbox-venv/bin/python", + entrypoint="/opt/toolsandbox_entrypoint.py", + ) + cmd, env_vars = env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="test-model", + 
api_key="test-key", + ) + assert cmd[0] == "/opt/toolsandbox-venv/bin/python" + assert cmd[1] == "/opt/toolsandbox_entrypoint.py" + assert "--agent" in cmd + assert "Gorilla" in cmd + # API config passed via environment, not CLI flags + assert env_vars["NVIDIA_BASE_URL"] == "https://integrate.api.nvidia.com/v1" + assert env_vars["NVIDIA_AGENT_MODEL"] == "test-model" + assert env_vars["NVIDIA_API_KEY"] == "test-key" + # output_dir is a local host path, not the container-mount path /output + assert "/tmp/output" in " ".join(cmd) + assert cmd.count("/output") == 0 or all(c != "/output" for c in cmd) + + +def test_subprocess_cmd_no_api_key_does_not_override() -> None: + """When api_key='', we must not overwrite any existing env var with empty string.""" + import os as _os + env_env = ToolSandboxEnvironment(runner="subprocess") + _, env_vars = env_env._build_cmd( + output_dir=_P("/tmp/output"), base_url="https://integrate.api.nvidia.com/v1", model_id="test-model", api_key="", ) - cmd_str = " ".join(cmd) - assert "NVIDIA_API_KEY" not in cmd_str + # If NVIDIA_API_KEY already existed in os.environ, it should not be blanked. + # If it wasn't there, it should still not be there (or be identical to original). 
+ original = _os.environ.get("NVIDIA_API_KEY", "") + assert env_vars.get("NVIDIA_API_KEY", "") == original # --------------------------------------------------------------------------- @@ -245,6 +316,7 @@ def test_parse_results_bundle_keys(tmp_path) -> None: assert bundle["benchmark"]["name"] == "toolsandbox" assert bundle["benchmark"]["scores"]["similarity"]["value"] == pytest.approx(0.7) assert bundle["config"]["framework"] == "toolsandbox" + assert bundle["config"]["runner"] == "docker" assert bundle["config"]["model"] == "my-model" assert bundle["_container_exit_code"] == 0 From 762f6d5a73ae4921d2f933a726473ad09fef28f4 Mon Sep 17 00:00:00 2001 From: Wojciech Prazuch Date: Mon, 18 May 2026 15:10:10 -0700 Subject: [PATCH 3/6] fix(toolsandbox): set OPENAI_API_KEY placeholder before parent __init__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to build a client that NVIDIANIMAgent immediately replaces — without the placeholder it raises even though we never use the parent's client. Signed-off-by: Wojciech Prazuch --- docker/toolsandbox_entrypoint.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py index c8eea9fd4..44b25b1a6 100644 --- a/docker/toolsandbox_entrypoint.py +++ b/docker/toolsandbox_entrypoint.py @@ -59,6 +59,11 @@ def _register_nvidia_roles() -> None: agent_model = _require_env("NVIDIA_AGENT_MODEL") user_model = os.environ.get("NVIDIA_USER_MODEL", "meta/llama-3.1-70b-instruct") + # OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to create a temporary + # client that NVIDIANIMAgent immediately replaces. Set a placeholder so + # the parent __init__ doesn't raise when the env var is absent. 
+ os.environ.setdefault("OPENAI_API_KEY", api_key or "not-used") + def _client() -> OpenAI: return OpenAI(base_url=base_url, api_key=api_key) From 955baaaca65cd0be823eacd1f05b04c16ce3eba9 Mon Sep 17 00:00:00 2001 From: Wojciech Prazuch Date: Mon, 18 May 2026 15:10:51 -0700 Subject: [PATCH 4/6] fix(toolsandbox): pin httpx<0.28 to fix openai==1.17.0 proxies compat openai==1.17.0 (pinned by ToolSandbox) passes proxies= to httpx which removed that argument in 0.28.0, causing TypeError at runtime. Signed-off-by: Wojciech Prazuch --- docker/Dockerfile.toolsandbox | 3 ++- docker/Dockerfile.toolsandbox-combined | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox index a96b69614..a86c176af 100644 --- a/docker/Dockerfile.toolsandbox +++ b/docker/Dockerfile.toolsandbox @@ -28,7 +28,8 @@ ARG TOOLSANDBOX_REF=main # ToolSandbox pins openai==1.17.0 and other specific versions — install in # a clean environment separate from any NEL dependencies. 
RUN pip install --no-cache-dir \ - "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" + "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \ + "httpx<0.28.0" COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined index 8c811e88e..ab388fb70 100644 --- a/docker/Dockerfile.toolsandbox-combined +++ b/docker/Dockerfile.toolsandbox-combined @@ -41,6 +41,7 @@ ARG TOOLSANDBOX_REF=main # Create an isolated venv for ToolSandbox with its pinned deps RUN python3.11 -m venv /opt/toolsandbox-venv \ && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \ - "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" + "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \ + "httpx<0.28.0" COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py From e62aed8e2728c25ca44081eba9dec47a1eec895c Mon Sep 17 00:00:00 2001 From: Wojciech Prazuch Date: Mon, 18 May 2026 15:14:50 -0700 Subject: [PATCH 5/6] fix(toolsandbox): correct result_summary.json parsing from smoke test Smoke test revealed the actual output format uses category_aggregated_results instead of a top-level similarity key. Overall score comes from ALL_CATEGORIES; per-category breakdown skips it to avoid duplication. Sample count reads from per_scenario_results length. 
Signed-off-by: Wojciech Prazuch --- src/nemo_evaluator/benchmarks/toolsandbox.py | 27 +++++++++++--- tests/test_environments/test_toolsandbox.py | 39 +++++++++++++------- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py index bc49b2448..96dc67902 100644 --- a/src/nemo_evaluator/benchmarks/toolsandbox.py +++ b/src/nemo_evaluator/benchmarks/toolsandbox.py @@ -282,7 +282,7 @@ def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dic return { "benchmark": { "name": self.name, - "samples": summary.get("num_scenarios", len(summary.get("per_category", {}))), + "samples": len(summary.get("per_scenario_results", [])), "scores": scores, }, "config": { @@ -309,14 +309,29 @@ def _load_result_summary(self, output_dir: Path) -> dict[str, Any]: @staticmethod def _extract_scores(summary: dict[str, Any]) -> dict[str, Any]: + """Extract scores from ToolSandbox result_summary.json. + + Real format (confirmed from smoke test): + category_aggregated_results: + ALL_CATEGORIES: {similarity: float, turn_count: float} + STATE_DEPENDENCY: {similarity: float, turn_count: float} + ... 
+ """ scores: dict[str, Any] = {} - for metric in ("similarity", "turn_count"): - if metric in summary: - scores[metric] = {"value": round(float(summary[metric]), 4)} + cat_results: dict[str, Any] = summary.get("category_aggregated_results") or {} - per_category = summary.get("per_category") or {} - for cat_name, cat_data in per_category.items(): + # Overall score comes from the ALL_CATEGORIES aggregate + all_cat = cat_results.get("ALL_CATEGORIES") or {} + if "similarity" in all_cat: + scores["similarity"] = {"value": round(float(all_cat["similarity"]), 4)} + if "turn_count" in all_cat: + scores["turn_count"] = {"value": round(float(all_cat["turn_count"]), 2)} + + # Per-category breakdown (skip ALL_CATEGORIES to avoid duplication) + for cat_name, cat_data in cat_results.items(): + if cat_name == "ALL_CATEGORIES": + continue if isinstance(cat_data, dict) and "similarity" in cat_data: scores[f"per_category/{cat_name}/similarity"] = { "value": round(float(cat_data["similarity"]), 4) diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py index 9f2006734..642a97d38 100644 --- a/tests/test_environments/test_toolsandbox.py +++ b/tests/test_environments/test_toolsandbox.py @@ -233,21 +233,24 @@ def test_subprocess_cmd_no_api_key_does_not_override() -> None: def test_extract_scores_full() -> None: + # Real format confirmed by smoke test summary = { - "similarity": 0.72, - "turn_count": 4.3, - "per_category": { - "single_tool_call": {"similarity": 0.91, "count": 45}, - "multiple_tool_call": {"similarity": 0.61, "count": 20}, + "per_scenario_results": [], + "category_aggregated_results": { + "ALL_CATEGORIES": {"similarity": 0.72, "turn_count": 4.3}, + "STATE_DEPENDENCY": {"similarity": 0.91, "turn_count": 3.1}, + "MULTIPLE_TOOL_CALL": {"similarity": 0.61, "turn_count": 5.0}, }, } scores = ToolSandboxEnvironment._extract_scores(summary) assert scores["similarity"]["value"] == pytest.approx(0.72, abs=1e-4) - assert 
scores["turn_count"]["value"] == pytest.approx(4.3, abs=1e-4) - assert "per_category/single_tool_call/similarity" in scores - assert scores["per_category/single_tool_call/similarity"]["value"] == pytest.approx(0.91, abs=1e-4) - assert "per_category/multiple_tool_call/similarity" in scores + assert scores["turn_count"]["value"] == pytest.approx(4.3, abs=1e-2) + assert "per_category/STATE_DEPENDENCY/similarity" in scores + assert scores["per_category/STATE_DEPENDENCY/similarity"]["value"] == pytest.approx(0.91, abs=1e-4) + assert "per_category/MULTIPLE_TOOL_CALL/similarity" in scores + # ALL_CATEGORIES is not duplicated as a per_category entry + assert "per_category/ALL_CATEGORIES/similarity" not in scores def test_extract_scores_empty_summary() -> None: @@ -256,18 +259,20 @@ def test_extract_scores_empty_summary() -> None: def test_extract_scores_no_categories() -> None: - summary = {"similarity": 0.5} + summary = {"category_aggregated_results": {"ALL_CATEGORIES": {"similarity": 0.5}}} scores = ToolSandboxEnvironment._extract_scores(summary) assert scores == {"similarity": {"value": 0.5}} def test_extract_scores_category_missing_similarity() -> None: summary = { - "similarity": 0.8, - "per_category": {"single_tool_call": {"count": 10}}, + "category_aggregated_results": { + "ALL_CATEGORIES": {"similarity": 0.8, "turn_count": 3.0}, + "STATE_DEPENDENCY": {"turn_count": 3.0}, # no similarity + } } scores = ToolSandboxEnvironment._extract_scores(summary) - assert "per_category/single_tool_call/similarity" not in scores + assert "per_category/STATE_DEPENDENCY/similarity" not in scores assert scores["similarity"]["value"] == pytest.approx(0.8) @@ -306,7 +311,12 @@ def test_parse_results_bundle_keys(tmp_path) -> None: run_dir = tmp_path / "run_1" run_dir.mkdir() (run_dir / "result_summary.json").write_text( - json.dumps({"similarity": 0.7, "turn_count": 5.0}) + json.dumps({ + "per_scenario_results": [{"name": "s1"}, {"name": "s2"}], + "category_aggregated_results": { + 
"ALL_CATEGORIES": {"similarity": 0.7, "turn_count": 5.0} + }, + }) ) bundle = env._parse_results(tmp_path, exit_code=0, model_id="my-model") @@ -314,6 +324,7 @@ def test_parse_results_bundle_keys(tmp_path) -> None: assert "benchmark" in bundle assert "config" in bundle assert bundle["benchmark"]["name"] == "toolsandbox" + assert bundle["benchmark"]["samples"] == 2 assert bundle["benchmark"]["scores"]["similarity"]["value"] == pytest.approx(0.7) assert bundle["config"]["framework"] == "toolsandbox" assert bundle["config"]["runner"] == "docker" From 6b3b416abc9f7980b9b8dd26e38ca27e069ee197 Mon Sep 17 00:00:00 2001 From: Wojciech Prazuch Date: Mon, 18 May 2026 15:23:22 -0700 Subject: [PATCH 6/6] chore(toolsandbox): use inference-api.nvidia.com + gpt-4o in example configs Smoke test confirmed: inference-api.nvidia.com with azure/openai/gpt-4o runs 3/3 scenarios with similarity=0.952, no errors, no rate limits. Signed-off-by: Wojciech Prazuch --- examples/configs/toolsandbox.yaml | 10 +++++----- examples/configs/toolsandbox_slurm.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/configs/toolsandbox.yaml b/examples/configs/toolsandbox.yaml index 12aa87fe2..6e5db70ec 100644 --- a/examples/configs/toolsandbox.yaml +++ b/examples/configs/toolsandbox.yaml @@ -6,7 +6,7 @@ # Inference API — no OpenAI key required. # # Prerequisites: -# - NVIDIA_API_KEY set in environment +# - INFERENCE_API_KEY set in environment # - Docker running locally # - toolsandbox-nel image built (once): # docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . 
@@ -16,17 +16,17 @@ services: nemotron: type: api - url: https://integrate.api.nvidia.com/v1/chat/completions + url: https://inference-api.nvidia.com/v1/chat/completions protocol: chat_completions - model: nvidia/nemotron-3-super-120b-a12b - api_key: ${NVIDIA_API_KEY} + model: azure/openai/gpt-4o + api_key: ${INFERENCE_API_KEY} benchmarks: - name: toolsandbox params: image: toolsandbox-nel:latest # Model used as user simulator (must be available on the same API) - user_model: meta/llama-3.1-70b-instruct + user_model: azure/openai/gpt-4o # Number of scenarios to run in parallel inside the container parallel: 4 # Set to true to run only a small predefined subset for quick validation diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml index cc73df779..abc9842db 100644 --- a/examples/configs/toolsandbox_slurm.yaml +++ b/examples/configs/toolsandbox_slurm.yaml @@ -19,10 +19,10 @@ services: nemotron: type: api - url: https://integrate.api.nvidia.com/v1/chat/completions + url: https://inference-api.nvidia.com/v1/chat/completions protocol: chat_completions - model: nvidia/nemotron-3-super-120b-a12b - api_key: ${NVIDIA_API_KEY} + model: azure/openai/gpt-4o + api_key: ${INFERENCE_API_KEY} benchmarks: - name: toolsandbox @@ -49,7 +49,7 @@ cluster: walltime: "04:00:00" eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh container_env: - NVIDIA_API_KEY: ${NVIDIA_API_KEY} + INFERENCE_API_KEY: ${INFERENCE_API_KEY} node_pools: default: partition: cpu