diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox new file mode 100644 index 000000000..a86c176af --- /dev/null +++ b/docker/Dockerfile.toolsandbox @@ -0,0 +1,36 @@ +# ToolSandbox evaluation container for NEL Next. +# +# Bundles Apple's ToolSandbox benchmark with a custom NVIDIA NIM agent/user +# that routes model calls through any OpenAI-compatible endpoint. +# +# Build: +# docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . +# +# Pin to a specific commit for reproducible builds: +# docker build -f docker/Dockerfile.toolsandbox \ +# --build-arg TOOLSANDBOX_REF=<commit-sha> \ +# -t toolsandbox-nel:<commit-sha> . +# +# Required env vars at runtime (injected by ToolSandboxEnvironment.run_batch): +# NVIDIA_BASE_URL – OpenAI-compatible base URL +# NVIDIA_API_KEY – API key +# NVIDIA_AGENT_MODEL – Model ID for the agent under evaluation +# NVIDIA_USER_MODEL – Model ID for the user simulator + +FROM python:3.11-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + +ARG TOOLSANDBOX_REF=main + +# ToolSandbox pins openai==1.17.0 and other specific versions — install in +# a clean environment separate from any NEL dependencies. +RUN pip install --no-cache-dir \ + "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \ + "httpx<0.28.0" + +COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py + +ENTRYPOINT ["python", "/opt/toolsandbox_entrypoint.py"] diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined new file mode 100644 index 000000000..ab388fb70 --- /dev/null +++ b/docker/Dockerfile.toolsandbox-combined @@ -0,0 +1,47 @@ +# ToolSandbox + NEL Next combined evaluation container. +# +# Used for SLURM runs where nested Docker is unavailable. ToolSandbox is +# installed in an isolated venv (/opt/toolsandbox-venv) so its pinned +# dependencies (openai==1.17.0, etc.) do not conflict with NEL Next. 
+# +# Build: +# docker build -f docker/Dockerfile.toolsandbox-combined \ +# -t toolsandbox-nel-combined:latest . +# +# Convert to squashfs for SLURM (run on a login node with Docker): +# enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \ +# dockerd://toolsandbox-nel-combined:latest +# +# NEL Next config (subprocess runner): +# benchmarks: +# - name: toolsandbox +# params: +# runner: subprocess +# python_exe: /opt/toolsandbox-venv/bin/python +# entrypoint: /opt/toolsandbox_entrypoint.py +# +# Required env vars at runtime: +# NVIDIA_BASE_URL – OpenAI-compatible base URL +# NVIDIA_API_KEY – API key +# NVIDIA_AGENT_MODEL – model ID for the agent under evaluation +# NVIDIA_USER_MODEL – model ID for user simulator + +ARG BASE_IMAGE=nemo-evaluator + +FROM ${BASE_IMAGE} + +# Install Python 3.11 for the ToolSandbox venv (avoids openai version conflicts +# with NEL Next which uses the system Python 3.12) +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3.11 python3.11-venv \ + && rm -rf /var/lib/apt/lists/* + +ARG TOOLSANDBOX_REF=main + +# Create an isolated venv for ToolSandbox with its pinned deps +RUN python3.11 -m venv /opt/toolsandbox-venv \ + && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \ + "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \ + "httpx<0.28.0" + +COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py new file mode 100644 index 000000000..44b25b1a6 --- /dev/null +++ b/docker/toolsandbox_entrypoint.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def _register_nvidia_roles() -> None:
    """Replace the Gorilla agent and GPT-4o user factories with NIM-backed classes.

    The existing ``RoleImplType`` enum keys are reused so the unmodified CLI
    still accepts ``--agent Gorilla --user GPT_4_o_2024_05_13``.

    Environment variables read:
        NVIDIA_BASE_URL, NVIDIA_API_KEY, NVIDIA_AGENT_MODEL: required.
        NVIDIA_USER_MODEL: optional; an unset, empty or whitespace-only value
            falls back to ``meta/llama-3.1-70b-instruct``.

    Raises:
        RuntimeError: From ``_require_env`` when a required variable is
            missing or blank.
    """
    from openai import OpenAI
    from tool_sandbox.cli.utils import AGENT_TYPE_TO_FACTORY, RoleImplType, USER_TYPE_TO_FACTORY
    from tool_sandbox.roles.openai_api_agent import OpenAIAPIAgent
    from tool_sandbox.roles.openai_api_user import OpenAIAPIUser

    base_url = _require_env("NVIDIA_BASE_URL")
    api_key = _require_env("NVIDIA_API_KEY")
    agent_model = _require_env("NVIDIA_AGENT_MODEL")
    # The launcher always exports NVIDIA_USER_MODEL, possibly as an empty
    # string; treat blank the same as unset so the model name is never "".
    user_model = os.environ.get("NVIDIA_USER_MODEL", "").strip() or "meta/llama-3.1-70b-instruct"

    # OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to create a temporary
    # client that the subclasses below immediately replace. Make sure the
    # variable exists so the parent __init__ doesn't raise when it is absent.
    # NOTE(review): this exports the NVIDIA key under the OPENAI_API_KEY name;
    # a dummy value may be sufficient here - confirm nothing else relies on it.
    os.environ.setdefault("OPENAI_API_KEY", api_key)

    def _client() -> OpenAI:
        return OpenAI(base_url=base_url, api_key=api_key)

    class NVIDIANIMAgent(OpenAIAPIAgent):
        model_name: str = agent_model

        def __init__(self) -> None:
            super().__init__()
            # Swap the parent's default client for one bound to the NVIDIA endpoint.
            self.openai_client = _client()

    class NVIDIANIMUser(OpenAIAPIUser):
        model_name: str = user_model

        def __init__(self) -> None:
            super().__init__()
            self.openai_client = _client()

    AGENT_TYPE_TO_FACTORY[RoleImplType.Gorilla] = NVIDIANIMAgent
    USER_TYPE_TO_FACTORY[RoleImplType.GPT_4_o_2024_05_13] = NVIDIANIMUser
+# +# Prerequisites: +# - INFERENCE_API_KEY set in environment +# - Docker running locally +# - toolsandbox-nel image built (once): +# docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . +# +# Run: nel eval run examples/configs/toolsandbox.yaml + +services: + nemotron: + type: api + url: https://inference-api.nvidia.com/v1/chat/completions + protocol: chat_completions + model: azure/openai/gpt-4o + api_key: ${INFERENCE_API_KEY} + +benchmarks: + - name: toolsandbox + params: + image: toolsandbox-nel:latest + # Model used as user simulator (must be available on the same API) + user_model: azure/openai/gpt-4o + # Number of scenarios to run in parallel inside the container + parallel: 4 + # Set to true to run only a small predefined subset for quick validation + test_mode: false + # Specific scenarios to run — omit or set [] to run the full suite + # scenarios: [wifi_off, cellular_off, make_call] + solver: + type: simple + service: nemotron + timeout: 7200.0 + +output: + dir: ./results/toolsandbox + report: [markdown, json] diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml new file mode 100644 index 000000000..abc9842db --- /dev/null +++ b/examples/configs/toolsandbox_slurm.yaml @@ -0,0 +1,56 @@ +# Flavor: ToolSandbox on SLURM (subprocess runner — no nested Docker) +# +# Uses the subprocess runner so ToolSandbox (pre-installed in +# /opt/toolsandbox-venv inside the eval container) runs directly without +# a nested Docker call. Suitable for any SLURM cluster. +# +# Prerequisites: +# - INFERENCE_API_KEY set in environment (or in cluster.container_env) +# - toolsandbox-nel-combined squashfs on shared storage: +# docker build -f docker/Dockerfile.toolsandbox-combined \ +# -t toolsandbox-nel-combined:latest . 
+# enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \ +# dockerd://toolsandbox-nel-combined:latest +# - SSH access to the SLURM login node +# +# Dry-run: nel eval run examples/configs/toolsandbox_slurm.yaml --dry-run +# Submit: nel eval run examples/configs/toolsandbox_slurm.yaml --submit + +services: + nemotron: + type: api + url: https://inference-api.nvidia.com/v1/chat/completions + protocol: chat_completions + model: azure/openai/gpt-4o + api_key: ${INFERENCE_API_KEY} + +benchmarks: + - name: toolsandbox + params: + runner: subprocess + python_exe: /opt/toolsandbox-venv/bin/python + entrypoint: /opt/toolsandbox_entrypoint.py + user_model: meta/llama-3.1-70b-instruct + parallel: 8 + test_mode: false + solver: + type: simple + service: nemotron + timeout: 14400.0 + +output: + dir: ./results/toolsandbox_slurm + report: [markdown, json] + +cluster: + type: slurm + hostname: ${SLURM_LOGIN_HOST} + account: ${SLURM_ACCOUNT} + walltime: "04:00:00" + eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh + container_env: + INFERENCE_API_KEY: ${INFERENCE_API_KEY} + node_pools: + default: + partition: cpu + nodes: 1 diff --git a/src/nemo_evaluator/benchmarks/__init__.py b/src/nemo_evaluator/benchmarks/__init__.py index 3b2f9cd4b..de2b3879b 100644 --- a/src/nemo_evaluator/benchmarks/__init__.py +++ b/src/nemo_evaluator/benchmarks/__init__.py @@ -33,5 +33,6 @@ import nemo_evaluator.benchmarks.simpleqa # noqa: F401 import nemo_evaluator.benchmarks.terminal_bench_hard # noqa: F401 import nemo_evaluator.benchmarks.terminal_bench_v1 # noqa: F401 +import nemo_evaluator.benchmarks.toolsandbox # noqa: F401 import nemo_evaluator.benchmarks.triviaqa # noqa: F401 import nemo_evaluator.benchmarks.xstest # noqa: F401 diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py new file mode 100644 index 000000000..96dc67902 --- /dev/null +++ b/src/nemo_evaluator/benchmarks/toolsandbox.py @@ -0,0 +1,340 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ToolSandbox benchmark — Apple's stateful multi-turn tool-use evaluation. + +Registers ``toolsandbox`` as a built-in benchmark. Bypasses the standard +seed/solve/verify loop via ``run_batch()``, which runs ToolSandbox in one of +three modes and parses the resulting ``result_summary.json``. + +Runner modes +------------ +docker (default) + Spawns a pre-built Docker image. Requires Docker on the eval host. + + Build the image once:: + + docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest . + +apptainer + Same image as ``docker`` mode but executed via ``apptainer run``. + Use on SLURM clusters where Docker is unavailable. ``image`` should be + a path to a ``.sif`` or ``.sqsh`` file on the shared filesystem. + +subprocess + Runs the ToolSandbox entrypoint directly as a Python subprocess — no + container needed. Use when the eval container already has ToolSandbox + pre-installed (e.g. ``Dockerfile.toolsandbox-combined``). Set + ``python_exe`` to the venv Python that has ToolSandbox, and + ``entrypoint`` to the patch script path. 
+ +Config usage:: + + benchmarks: + - name: toolsandbox + params: + # --- runner selection --- + runner: docker # docker | apptainer | subprocess + image: toolsandbox-nel:latest # docker image name / sif path + # --- subprocess-mode overrides --- + python_exe: /opt/toolsandbox-venv/bin/python + entrypoint: /opt/toolsandbox_entrypoint.py + # --- benchmark settings --- + user_model: meta/llama-3.1-70b-instruct # user-simulator model + parallel: 4 # concurrent scenarios + test_mode: false # true = small subset only + scenarios: [] # [] = all scenarios + solver: + type: simple + service: my_model + timeout: 7200.0 + +Both agent and user simulator call the NVIDIA Inference API — no OpenAI +key required. ``NVIDIA_API_KEY`` must be set in the environment. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import sys +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult +from nemo_evaluator.environments.registry import register + +if TYPE_CHECKING: + from nemo_evaluator.sandbox.base import Sandbox + +logger = logging.getLogger(__name__) + +_DEFAULT_IMAGE = "toolsandbox-nel:latest" +_DEFAULT_USER_MODEL = "meta/llama-3.1-70b-instruct" +_DEFAULT_BASE_URL = "https://integrate.api.nvidia.com/v1" +_DEFAULT_ENTRYPOINT = "/opt/toolsandbox_entrypoint.py" +_CONTAINER_OUTPUT = "/output" + +_CLI_AGENT = "Gorilla" +_CLI_USER = "GPT_4_o_2024_05_13" + + +def _to_openai_base_url(url: str) -> str: + """Strip /chat/completions, /completions, /responses path suffix from NEL service URLs.""" + for suffix in ("/chat/completions", "/completions", "/responses"): + if url.endswith(suffix): + return url[: -len(suffix)] + return url.rstrip("/") + + +@register("toolsandbox") +class ToolSandboxEnvironment(EvalEnvironment): + """Runs ToolSandbox and parses aggregate metrics. 
async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = None) -> dict[str, Any]:
    """Run ToolSandbox once and return the parsed result bundle.

    The command is built by ``self._build_cmd`` for the configured runner,
    executed as a child process with a temporary output directory, and the
    directory is parsed by ``self._parse_results`` before it is removed.

    Args:
        solver: Not referenced by this method.
        config: Optional mapping. ``base_url`` (default
            ``_DEFAULT_BASE_URL``), ``model`` and ``api_key`` are read; when
            ``api_key`` is missing or empty the ``NVIDIA_API_KEY`` environment
            variable is used instead.

    Returns:
        The dict produced by ``self._parse_results``; its
        ``_container_exit_code`` entry carries the child's exit status.
    """
    config = config or {}
    model_url = config.get("base_url", "") or _DEFAULT_BASE_URL
    model_id = config.get("model", "")
    api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY", "")

    base_url = _to_openai_base_url(model_url)

    with tempfile.TemporaryDirectory(prefix="nel_toolsandbox_") as tmpdir:
        output_dir = Path(tmpdir) / "output"
        output_dir.mkdir()

        cmd, env = self._build_cmd(output_dir, base_url, model_id, api_key)
        # Only the first ten argv entries are logged.
        logger.info("Launching ToolSandbox (%s): %s", self._runner, " ".join(cmd[:10]) + " ...")

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=env,
        )
        stderr: bytes | None = b""
        try:
            _stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=self._timeout)
        except asyncio.TimeoutError:
            logger.error("ToolSandbox timed out after %.0fs", self._timeout)
            stderr = b"timeout"
        finally:
            # Covers timeout, task cancellation and unexpected errors alike:
            # never leave the child running once the temp directory is about
            # to be deleted.
            # NOTE(review): for the docker runner this kills the docker CLI;
            # whether the container itself stops should be confirmed.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The child exited between the check and the kill.
                    pass
                await proc.wait()

        rc = proc.returncode or 0
        if rc != 0:
            logger.error(
                "ToolSandbox exited %d:\n%s",
                rc,
                (stderr or b"").decode(errors="replace")[:2000],
            )

        # Parse inside the ``with`` block, before the output is cleaned up.
        return self._parse_results(output_dir, rc, model_id)
f"NVIDIA_USER_MODEL={self._user_model}", + ] + if api_key: + flags.extend(["-e", f"NVIDIA_API_KEY={api_key}"]) + return flags + + def _build_docker_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]: + cmd = ["docker", "run", "--rm", "-v", f"{output_dir}:{_CONTAINER_OUTPUT}"] + cmd.extend(self._container_env_flags(base_url, model_id, api_key)) + cmd.append(self._image) + cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT)) + return cmd + + def _build_apptainer_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]: + env_flags: list[str] = [] + for flag in self._container_env_flags(base_url, model_id, api_key): + if flag == "-e": + continue + env_flags.extend(["--env", flag]) + + cmd = [ + "apptainer", "run", "--bind", f"{output_dir}:{_CONTAINER_OUTPUT}", + *env_flags, + self._image, + ] + cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT)) + return cmd + + def _build_subprocess_cmd( + self, output_dir: Path, base_url: str, model_id: str, api_key: str + ) -> tuple[list[str], dict[str, str]]: + # Env is passed via environment variables to the subprocess + env = { + **os.environ, + "NVIDIA_BASE_URL": base_url, + "NVIDIA_AGENT_MODEL": model_id, + "NVIDIA_USER_MODEL": self._user_model, + } + if api_key: + env["NVIDIA_API_KEY"] = api_key + + cmd = [self._python_exe, self._entrypoint] + cmd.extend(self._toolsandbox_cli_args(str(output_dir))) + return cmd, env + + # ------------------------------------------------------------------ + # Results parsing + # ------------------------------------------------------------------ + + def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dict[str, Any]: + summary = self._load_result_summary(output_dir) + scores = self._extract_scores(summary) + + return { + "benchmark": { + "name": self.name, + "samples": len(summary.get("per_scenario_results", [])), + "scores": scores, + }, + "config": { + "benchmark": self.name, + "runner": 
self._runner, + "image": self._image if self._runner != "subprocess" else None, + "model": model_id, + "user_model": self._user_model, + "framework": "toolsandbox", + "scenarios": self._scenarios or "all", + "test_mode": self._test_mode, + }, + "_container_exit_code": exit_code, + } + + def _load_result_summary(self, output_dir: Path) -> dict[str, Any]: + for candidate in sorted(output_dir.rglob("result_summary.json")): + try: + return json.loads(candidate.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError) as exc: + logger.warning("Could not parse %s: %s", candidate, exc) + logger.warning("No result_summary.json found in %s", output_dir) + return {} + + @staticmethod + def _extract_scores(summary: dict[str, Any]) -> dict[str, Any]: + """Extract scores from ToolSandbox result_summary.json. + + Real format (confirmed from smoke test): + category_aggregated_results: + ALL_CATEGORIES: {similarity: float, turn_count: float} + STATE_DEPENDENCY: {similarity: float, turn_count: float} + ... 
+ """ + scores: dict[str, Any] = {} + + cat_results: dict[str, Any] = summary.get("category_aggregated_results") or {} + + # Overall score comes from the ALL_CATEGORIES aggregate + all_cat = cat_results.get("ALL_CATEGORIES") or {} + if "similarity" in all_cat: + scores["similarity"] = {"value": round(float(all_cat["similarity"]), 4)} + if "turn_count" in all_cat: + scores["turn_count"] = {"value": round(float(all_cat["turn_count"]), 2)} + + # Per-category breakdown (skip ALL_CATEGORIES to avoid duplication) + for cat_name, cat_data in cat_results.items(): + if cat_name == "ALL_CATEGORIES": + continue + if isinstance(cat_data, dict) and "similarity" in cat_data: + scores[f"per_category/{cat_name}/similarity"] = { + "value": round(float(cat_data["similarity"]), 4) + } + + return scores diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py new file mode 100644 index 000000000..642a97d38 --- /dev/null +++ b/tests/test_environments/test_toolsandbox.py @@ -0,0 +1,351 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Offline tests for ToolSandboxEnvironment (no Docker, no network).""" + +from __future__ import annotations + +import json +import pytest + +from nemo_evaluator.benchmarks.toolsandbox import ToolSandboxEnvironment, _to_openai_base_url + + +# --------------------------------------------------------------------------- +# URL normalization +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "url, expected", + [ + ( + "https://integrate.api.nvidia.com/v1/chat/completions", + "https://integrate.api.nvidia.com/v1", + ), + ( + "https://integrate.api.nvidia.com/v1/completions", + "https://integrate.api.nvidia.com/v1", + ), + ( + "https://integrate.api.nvidia.com/v1/responses", + "https://integrate.api.nvidia.com/v1", + ), + ( + "https://integrate.api.nvidia.com/v1", + "https://integrate.api.nvidia.com/v1", + ), + ( + "http://localhost:8000/v1/chat/completions", + "http://localhost:8000/v1", + ), + ( + "http://localhost:8000/v1", + "http://localhost:8000/v1", + ), + ], + ids=[ + "strip_chat_completions", + "strip_completions", + "strip_responses", + "no_op", + "localhost_with_suffix", + "localhost_base", + ], +) +def test_to_openai_base_url(url: str, expected: str) -> None: + assert _to_openai_base_url(url) == expected + + +# --------------------------------------------------------------------------- +# ToolSandboxEnvironment construction and defaults +# --------------------------------------------------------------------------- + + +def test_default_construction() -> None: + env = ToolSandboxEnvironment() + assert env._runner == "docker" + assert env._image == "toolsandbox-nel:latest" + assert env._user_model == "meta/llama-3.1-70b-instruct" + assert env._scenarios == [] + assert env._parallel == 4 + assert not env._test_mode + + +def test_custom_params() -> None: + env = ToolSandboxEnvironment( + runner="subprocess", + image="toolsandbox-nel:v1.2", + python_exe="/opt/toolsandbox-venv/bin/python", + 
user_model="meta/llama-3.1-8b-instruct", + scenarios=["wifi_off", "cellular_off"], + parallel=8, + test_mode=True, + ) + assert env._runner == "subprocess" + assert env._python_exe == "/opt/toolsandbox-venv/bin/python" + assert env._scenarios == ["wifi_off", "cellular_off"] + assert env._parallel == 8 + assert env._test_mode + + +# --------------------------------------------------------------------------- +# Docker command construction +# --------------------------------------------------------------------------- + +_P = __import__("pathlib").Path + + +def test_docker_cmd_no_scenarios() -> None: + env = ToolSandboxEnvironment(runner="docker") + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="nvidia/nemotron-3-super-120b-a12b", + api_key="test-key", + ) + assert cmd[0] == "docker" + assert "--agent" in cmd + assert "Gorilla" in cmd + assert "--user" in cmd + assert "GPT_4_o_2024_05_13" in cmd + assert "--scenarios" not in cmd + assert "--test_mode" not in cmd + assert "NVIDIA_AGENT_MODEL=nvidia/nemotron-3-super-120b-a12b" in " ".join(cmd) + + +def test_docker_cmd_with_scenarios() -> None: + env = ToolSandboxEnvironment(runner="docker", scenarios=["wifi_off", "make_call"]) + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="test-model", + api_key="key", + ) + idx = cmd.index("--scenarios") + assert cmd[idx + 1] == "wifi_off" + assert cmd[idx + 2] == "make_call" + + +def test_docker_cmd_test_mode() -> None: + env = ToolSandboxEnvironment(runner="docker", test_mode=True) + cmd, _ = env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="test-model", + api_key="key", + ) + assert "--test_mode" in cmd + assert "--scenarios" not in cmd + + +def test_docker_cmd_no_api_key() -> None: + env = ToolSandboxEnvironment(runner="docker") + cmd, _ = env._build_cmd( + 
def test_subprocess_cmd_basics() -> None:
    """Subprocess runner: argv starts with python + entrypoint, config goes via env."""
    env = ToolSandboxEnvironment(
        runner="subprocess",
        python_exe="/opt/toolsandbox-venv/bin/python",
        entrypoint="/opt/toolsandbox_entrypoint.py",
    )
    cmd, env_vars = env._build_cmd(
        output_dir=_P("/tmp/output"),
        base_url="https://integrate.api.nvidia.com/v1",
        model_id="test-model",
        api_key="test-key",
    )
    assert cmd[0] == "/opt/toolsandbox-venv/bin/python"
    assert cmd[1] == "/opt/toolsandbox_entrypoint.py"
    assert "--agent" in cmd
    assert "Gorilla" in cmd
    # API config passed via environment, not CLI flags
    assert env_vars["NVIDIA_BASE_URL"] == "https://integrate.api.nvidia.com/v1"
    assert env_vars["NVIDIA_AGENT_MODEL"] == "test-model"
    assert env_vars["NVIDIA_API_KEY"] == "test-key"
    # output_dir is the local host path, not the container-mount path /output.
    # Compare against str(Path) so the check does not depend on the path separator.
    out_idx = cmd.index("--output_dir")
    assert cmd[out_idx + 1] == str(_P("/tmp/output"))
    assert "/output" not in cmd
test_subprocess_cmd_no_api_key_does_not_override() -> None: + """When api_key='', we must not overwrite any existing env var with empty string.""" + import os as _os + env_env = ToolSandboxEnvironment(runner="subprocess") + _, env_vars = env_env._build_cmd( + output_dir=_P("/tmp/output"), + base_url="https://integrate.api.nvidia.com/v1", + model_id="test-model", + api_key="", + ) + # If NVIDIA_API_KEY already existed in os.environ, it should not be blanked. + # If it wasn't there, it should still not be there (or be identical to original). + original = _os.environ.get("NVIDIA_API_KEY", "") + assert env_vars.get("NVIDIA_API_KEY", "") == original + + +# --------------------------------------------------------------------------- +# Score extraction +# --------------------------------------------------------------------------- + + +def test_extract_scores_full() -> None: + # Real format confirmed by smoke test + summary = { + "per_scenario_results": [], + "category_aggregated_results": { + "ALL_CATEGORIES": {"similarity": 0.72, "turn_count": 4.3}, + "STATE_DEPENDENCY": {"similarity": 0.91, "turn_count": 3.1}, + "MULTIPLE_TOOL_CALL": {"similarity": 0.61, "turn_count": 5.0}, + }, + } + scores = ToolSandboxEnvironment._extract_scores(summary) + + assert scores["similarity"]["value"] == pytest.approx(0.72, abs=1e-4) + assert scores["turn_count"]["value"] == pytest.approx(4.3, abs=1e-2) + assert "per_category/STATE_DEPENDENCY/similarity" in scores + assert scores["per_category/STATE_DEPENDENCY/similarity"]["value"] == pytest.approx(0.91, abs=1e-4) + assert "per_category/MULTIPLE_TOOL_CALL/similarity" in scores + # ALL_CATEGORIES is not duplicated as a per_category entry + assert "per_category/ALL_CATEGORIES/similarity" not in scores + + +def test_extract_scores_empty_summary() -> None: + scores = ToolSandboxEnvironment._extract_scores({}) + assert scores == {} + + +def test_extract_scores_no_categories() -> None: + summary = {"category_aggregated_results": 
{"ALL_CATEGORIES": {"similarity": 0.5}}} + scores = ToolSandboxEnvironment._extract_scores(summary) + assert scores == {"similarity": {"value": 0.5}} + + +def test_extract_scores_category_missing_similarity() -> None: + summary = { + "category_aggregated_results": { + "ALL_CATEGORIES": {"similarity": 0.8, "turn_count": 3.0}, + "STATE_DEPENDENCY": {"turn_count": 3.0}, # no similarity + } + } + scores = ToolSandboxEnvironment._extract_scores(summary) + assert "per_category/STATE_DEPENDENCY/similarity" not in scores + assert scores["similarity"]["value"] == pytest.approx(0.8) + + +# --------------------------------------------------------------------------- +# Result summary loading (from temp directory) +# --------------------------------------------------------------------------- + + +def test_load_result_summary(tmp_path): + env = ToolSandboxEnvironment() + + # Simulate ToolSandbox output structure + run_dir = tmp_path / "agent_Gorilla_user_GPT_4_o_20240513_12345" + run_dir.mkdir() + summary_data = {"similarity": 0.65, "turn_count": 3.1} + (run_dir / "result_summary.json").write_text(json.dumps(summary_data)) + + result = env._load_result_summary(tmp_path) + assert result == summary_data + + +def test_load_result_summary_missing(tmp_path) -> None: + env = ToolSandboxEnvironment() + result = env._load_result_summary(tmp_path) + assert result == {} + + +# --------------------------------------------------------------------------- +# Bundle structure +# --------------------------------------------------------------------------- + + +def test_parse_results_bundle_keys(tmp_path) -> None: + env = ToolSandboxEnvironment() + + run_dir = tmp_path / "run_1" + run_dir.mkdir() + (run_dir / "result_summary.json").write_text( + json.dumps({ + "per_scenario_results": [{"name": "s1"}, {"name": "s2"}], + "category_aggregated_results": { + "ALL_CATEGORIES": {"similarity": 0.7, "turn_count": 5.0} + }, + }) + ) + + bundle = env._parse_results(tmp_path, exit_code=0, 
model_id="my-model") + + assert "benchmark" in bundle + assert "config" in bundle + assert bundle["benchmark"]["name"] == "toolsandbox" + assert bundle["benchmark"]["samples"] == 2 + assert bundle["benchmark"]["scores"]["similarity"]["value"] == pytest.approx(0.7) + assert bundle["config"]["framework"] == "toolsandbox" + assert bundle["config"]["runner"] == "docker" + assert bundle["config"]["model"] == "my-model" + assert bundle["_container_exit_code"] == 0 + + +# --------------------------------------------------------------------------- +# seed/verify raise NotImplementedError +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_seed_raises() -> None: + env = ToolSandboxEnvironment() + with pytest.raises(NotImplementedError): + await env.seed(0) + + +@pytest.mark.asyncio +async def test_verify_raises() -> None: + env = ToolSandboxEnvironment() + with pytest.raises(NotImplementedError): + await env.verify("response", "expected")