From 43e77ac93813935e401e7d562e212233293124a9 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Mon, 18 May 2026 14:54:03 -0700
Subject: [PATCH 1/6] feat(benchmarks): add ToolSandbox multi-turn tool-use
 benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ToolSandbox (https://github.com/apple/ToolSandbox) evaluates stateful,
multi-turn tool-use with a user simulator — the standard seed/solve/verify
loop does not apply.  This integration runs the full benchmark inside a
dedicated Docker container via run_batch() and parses result_summary.json
into the standard NEL bundle format.

Key design choices:
- ToolSandboxEnvironment subclasses EvalEnvironment and overrides run_batch()
- Both agent and user simulator call the NVIDIA Inference API (no OpenAI key)
- docker/toolsandbox_entrypoint.py patches AGENT_TYPE_TO_FACTORY[Gorilla] and
  USER_TYPE_TO_FACTORY[GPT_4_o_2024_05_13] with NVIDIANIMAgent/NVIDIANIMUser,
  which use native function-calling format and read NVIDIA_BASE_URL from env
- URL normalization strips the endpoint suffix (/chat/completions,
  /completions or /responses) from NEL service URLs before passing them to
  the OpenAI SDK as base_url

Files added:
  src/nemo_evaluator/benchmarks/toolsandbox.py
  docker/Dockerfile.toolsandbox
  docker/toolsandbox_entrypoint.py
  examples/configs/toolsandbox.yaml
  tests/test_environments/test_toolsandbox.py (21 offline tests, all green)

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 docker/Dockerfile.toolsandbox                |  35 +++
 docker/toolsandbox_entrypoint.py             |  87 ++++++
 examples/configs/toolsandbox.yaml            |  43 +++
 src/nemo_evaluator/benchmarks/__init__.py    |   1 +
 src/nemo_evaluator/benchmarks/toolsandbox.py | 257 ++++++++++++++++++
 tests/test_environments/test_toolsandbox.py  | 268 +++++++++++++++++++
 6 files changed, 691 insertions(+)
 create mode 100644 docker/Dockerfile.toolsandbox
 create mode 100644 docker/toolsandbox_entrypoint.py
 create mode 100644 examples/configs/toolsandbox.yaml
 create mode 100644 src/nemo_evaluator/benchmarks/toolsandbox.py
 create mode 100644 tests/test_environments/test_toolsandbox.py
diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox
new file mode 100644
index 000000000..a96b69614
--- /dev/null
+++ b/docker/Dockerfile.toolsandbox
@@ -0,0 +1,35 @@
+# ToolSandbox evaluation container for NEL Next.
+#
+# Bundles Apple's ToolSandbox benchmark with a custom NVIDIA NIM agent/user
+# that routes model calls through any OpenAI-compatible endpoint.
+#
+# Build:
+#   docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+#
+# Pin to a specific commit for reproducible builds:
+#   docker build -f docker/Dockerfile.toolsandbox \
+#     --build-arg TOOLSANDBOX_REF=<commit-sha> \
+#     -t toolsandbox-nel:<commit-sha> .
+#
+# Env vars read at runtime (injected by ToolSandboxEnvironment.run_batch):
+#   NVIDIA_BASE_URL    – OpenAI-compatible base URL (required)
+#   NVIDIA_API_KEY     – API key (required)
+#   NVIDIA_AGENT_MODEL – Model ID for the agent under evaluation (required)
+#   NVIDIA_USER_MODEL  – Model ID for the user simulator (optional; entrypoint default: meta/llama-3.1-70b-instruct)
+
+FROM python:3.11-slim
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG TOOLSANDBOX_REF=main
+
+# ToolSandbox pins openai==1.17.0 and other specific versions — install in
+# a clean environment separate from any NEL dependencies.
+RUN pip install --no-cache-dir \
+    "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}"
+
+COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
+
+ENTRYPOINT ["python", "/opt/toolsandbox_entrypoint.py"]
diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py
new file mode 100644
index 000000000..c8eea9fd4
--- /dev/null
+++ b/docker/toolsandbox_entrypoint.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ToolSandbox NEL entrypoint.
+
+Patches the ToolSandbox agent/user factories to use NVIDIA-hosted models via
+any OpenAI-compatible endpoint, then delegates to the standard tool_sandbox CLI.
+
+Required environment variables:
+  NVIDIA_BASE_URL    – OpenAI-compatible endpoint base URL
+                       (e.g. https://integrate.api.nvidia.com/v1)
+  NVIDIA_API_KEY     – API key for both agent and user models
+  NVIDIA_AGENT_MODEL – Model name for the agent under evaluation
+
+Optional:
+  NVIDIA_USER_MODEL  – Model for user simulator
+                       (default: meta/llama-3.1-70b-instruct)
+
+CLI args (after patching) follow the standard tool_sandbox interface:
+  --agent Gorilla --user GPT_4_o_2024_05_13 [--scenarios ...] [--test_mode]
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+
+def _require_env(name: str) -> str:
+    val = os.environ.get(name, "").strip()
+    if not val:
+        raise RuntimeError(f"Required environment variable {name!r} is not set")
+    return val
+
+
+def _register_nvidia_roles() -> None:
+    """Replace Gorilla agent and GPT-4o user with NVIDIA NIM-backed classes.
+
+    We reuse existing RoleImplType enum keys so the CLI accepts
+    ``--agent Gorilla --user GPT_4_o_2024_05_13`` without modification.
+    """
+    from openai import OpenAI
+    from tool_sandbox.cli.utils import AGENT_TYPE_TO_FACTORY, RoleImplType, USER_TYPE_TO_FACTORY
+    from tool_sandbox.roles.openai_api_agent import OpenAIAPIAgent
+    from tool_sandbox.roles.openai_api_user import OpenAIAPIUser
+
+    base_url = _require_env("NVIDIA_BASE_URL")
+    api_key = _require_env("NVIDIA_API_KEY")
+    agent_model = _require_env("NVIDIA_AGENT_MODEL")
+    user_model = os.environ.get("NVIDIA_USER_MODEL", "meta/llama-3.1-70b-instruct")
+
+    def _client() -> OpenAI:
+        return OpenAI(base_url=base_url, api_key=api_key)
+
+    class NVIDIANIMAgent(OpenAIAPIAgent):
+        model_name: str = agent_model
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.openai_client = _client()
+
+    class NVIDIANIMUser(OpenAIAPIUser):
+        model_name: str = user_model
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.openai_client = _client()
+
+    AGENT_TYPE_TO_FACTORY[RoleImplType.Gorilla] = NVIDIANIMAgent
+    USER_TYPE_TO_FACTORY[RoleImplType.GPT_4_o_2024_05_13] = NVIDIANIMUser
+
+
+if __name__ == "__main__":
+    _register_nvidia_roles()
+    from tool_sandbox.cli import main
+
+    sys.exit(main())
diff --git a/examples/configs/toolsandbox.yaml b/examples/configs/toolsandbox.yaml
new file mode 100644
index 000000000..12aa87fe2
--- /dev/null
+++ b/examples/configs/toolsandbox.yaml
@@ -0,0 +1,43 @@
+# Flavor: ToolSandbox — stateful multi-turn tool-use benchmark
+#
+# Evaluates an LLM's ability to use tools across stateful, multi-turn
+# conversations (Apple's ToolSandbox: https://github.com/apple/ToolSandbox).
+# Both the agent under evaluation AND the user simulator call the NVIDIA
+# Inference API — no OpenAI key required.
+#
+# Prerequisites:
+#   - NVIDIA_API_KEY set in environment
+#   - Docker running locally
+#   - toolsandbox-nel image built (once):
+#       docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+#
+# Run: nel eval run examples/configs/toolsandbox.yaml
+
+services:
+  nemotron:
+    type: api
+    url: https://integrate.api.nvidia.com/v1/chat/completions
+    protocol: chat_completions
+    model: nvidia/nemotron-3-super-120b-a12b
+    api_key: ${NVIDIA_API_KEY}
+
+benchmarks:
+  - name: toolsandbox
+    params:
+      image: toolsandbox-nel:latest
+      # Model used as user simulator (must be available on the same API)
+      user_model: meta/llama-3.1-70b-instruct
+      # Number of scenarios to run in parallel inside the container
+      parallel: 4
+      # Set to true to run only a small predefined subset for quick validation
+      test_mode: false
+      # Specific scenarios to run — omit or set [] to run the full suite (ignored when test_mode is true)
+      # scenarios: [wifi_off, cellular_off, make_call]
+    solver:
+      type: simple
+      service: nemotron
+    timeout: 7200.0
+
+output:
+  dir: ./results/toolsandbox
+  report: [markdown, json]
diff --git a/src/nemo_evaluator/benchmarks/__init__.py b/src/nemo_evaluator/benchmarks/__init__.py
index 3b2f9cd4b..de2b3879b 100644
--- a/src/nemo_evaluator/benchmarks/__init__.py
+++ b/src/nemo_evaluator/benchmarks/__init__.py
@@ -33,5 +33,6 @@
 import nemo_evaluator.benchmarks.simpleqa  # noqa: F401
 import nemo_evaluator.benchmarks.terminal_bench_hard  # noqa: F401
 import nemo_evaluator.benchmarks.terminal_bench_v1  # noqa: F401
+import nemo_evaluator.benchmarks.toolsandbox  # noqa: F401
 import nemo_evaluator.benchmarks.triviaqa  # noqa: F401
 import nemo_evaluator.benchmarks.xstest  # noqa: F401
diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py
new file mode 100644
index 000000000..53c147cee
--- /dev/null
+++ b/src/nemo_evaluator/benchmarks/toolsandbox.py
@@ -0,0 +1,257 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ToolSandbox benchmark — Apple's stateful multi-turn tool-use evaluation.
+
+Registers ``toolsandbox`` as a built-in benchmark.  Bypasses the standard
+seed/solve/verify loop via ``run_batch()``, which spawns a pre-built Docker
+image containing ToolSandbox and parses the resulting ``result_summary.json``.
+
+Config usage::
+
+    benchmarks:
+      - name: toolsandbox
+        params:
+          image: toolsandbox-nel:latest           # pre-built Docker image
+          user_model: meta/llama-3.1-70b-instruct # user-simulator model
+          parallel: 4                             # concurrent scenarios
+          test_mode: false                        # true = small subset only
+          scenarios: []                           # [] = all scenarios
+        solver:
+          type: simple
+          service: my_model
+        timeout: 7200.0
+
+Build the image once before running::
+
+    docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+
+The agent under evaluation is taken from the ``solver.service`` entry.
+The user simulator calls the same NVIDIA API base URL with ``user_model``.
+Both use the configured ``api_key``, falling back to ``NVIDIA_API_KEY`` in the environment.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
+from nemo_evaluator.environments.registry import register
+
+if TYPE_CHECKING:
+    from nemo_evaluator.sandbox.base import Sandbox
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_IMAGE = "toolsandbox-nel:latest"
+_DEFAULT_USER_MODEL = "meta/llama-3.1-70b-instruct"
+_DEFAULT_BASE_URL = "https://integrate.api.nvidia.com/v1"
+_CONTAINER_OUTPUT = "/output"
+
+# ToolSandbox uses --agent Gorilla / --user GPT_4_o_2024_05_13 as the CLI
+# selectors; the entrypoint script patches those factory entries to point at
+# NVIDIANIMAgent / NVIDIANIMUser backed by NVIDIA_BASE_URL.
+_CLI_AGENT = "Gorilla"
+_CLI_USER = "GPT_4_o_2024_05_13"
+
+
+def _to_openai_base_url(url: str) -> str:
+    """Normalize a NEL service URL to the OpenAI SDK base_url format.
+
+    Trailing slashes are ignored and one endpoint suffix is removed, e.g.
+    https://host/v1/chat/completions/ -> https://host/v1 (the base ToolSandbox expects).
+    """
+    base = url.rstrip("/")  # strip first so ".../chat/completions/" still matches a suffix
+    suffixes = ("/chat/completions", "/completions", "/responses")  # longest variant first
+    matched = next((s for s in suffixes if base.endswith(s)), "")
+    return base[: len(base) - len(matched)].rstrip("/")
+
+
+@register("toolsandbox")
+class ToolSandboxEnvironment(EvalEnvironment):
+    """Runs ToolSandbox in a Docker container and parses aggregate metrics.
+
+    The entire scenario suite executes as a single batch inside the container.
+    ``seed()`` and ``verify()`` are not used.
+    """
+
+    def __init__(
+        self,
+        image: str = _DEFAULT_IMAGE,
+        user_model: str = _DEFAULT_USER_MODEL,
+        scenarios: list[str] | None = None,
+        parallel: int = 4,
+        timeout: float = 7200.0,
+        test_mode: bool = False,
+    ) -> None:
+        super().__init__()
+        self._image = image
+        self._user_model = user_model
+        self._scenarios: list[str] = scenarios or []
+        self._parallel = parallel
+        self._timeout = timeout
+        self._test_mode = test_mode
+
+    # ------------------------------------------------------------------
+    # EvalEnvironment interface
+    # ------------------------------------------------------------------
+
+    async def dataset_size(self) -> int:
+        return 0
+
+    async def seed(self, idx: int) -> SeedResult:
+        raise NotImplementedError("ToolSandboxEnvironment uses run_batch()")
+
+    async def verify(
+        self,
+        response: str,
+        expected: str,
+        sandbox: "Sandbox | None" = None,
+        **metadata: Any,
+    ) -> VerifyResult:
+        raise NotImplementedError("ToolSandboxEnvironment uses run_batch()")
+
+    # ------------------------------------------------------------------
+    # Batch execution
+    # ------------------------------------------------------------------
+
+    async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = None) -> dict[str, Any]:
+        config = config or {}
+        model_url = config.get("base_url", "") or _DEFAULT_BASE_URL
+        model_id = config.get("model", "")
+        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY", "")
+
+        base_url = _to_openai_base_url(model_url)
+
+        with tempfile.TemporaryDirectory(prefix="nel_toolsandbox_") as tmpdir:
+            output_dir = Path(tmpdir) / "output"
+            output_dir.mkdir()
+
+            cmd = self._build_docker_cmd(output_dir, base_url, model_id, api_key)
+            logger.info("Launching ToolSandbox: %s", " ".join(cmd[:10]) + " ...")
+
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            try:
+                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=self._timeout)
+            except asyncio.TimeoutError:
+                proc.kill()
+                await proc.wait()
+                logger.error("ToolSandbox timed out after %.0fs", self._timeout)
+                stderr = b"timeout"
+
+            rc = proc.returncode or 0
+            if rc != 0:
+                logger.error(
+                    "ToolSandbox container exited %d:\n%s",
+                    rc,
+                    (stderr or b"").decode(errors="replace")[:2000],
+                )
+
+            return self._parse_results(output_dir, rc, model_id)
+
+    # ------------------------------------------------------------------
+    # Docker command builder
+    # ------------------------------------------------------------------
+
+    def _build_docker_cmd(
+        self,
+        output_dir: Path,
+        base_url: str,
+        model_id: str,
+        api_key: str,
+    ) -> list[str]:
+        cmd = [
+            "docker", "run", "--rm",
+            "-v", f"{output_dir}:{_CONTAINER_OUTPUT}",
+            "-e", f"NVIDIA_BASE_URL={base_url}",
+            "-e", f"NVIDIA_AGENT_MODEL={model_id}",
+            "-e", f"NVIDIA_USER_MODEL={self._user_model}",
+        ]
+        if api_key:
+            cmd.extend(["-e", f"NVIDIA_API_KEY={api_key}"])
+
+        cmd.append(self._image)
+
+        cmd.extend(["--agent", _CLI_AGENT, "--user", _CLI_USER])
+        cmd.extend(["--output_dir", _CONTAINER_OUTPUT])
+        cmd.extend(["--parallel", str(self._parallel)])
+
+        if self._test_mode:
+            cmd.append("--test_mode")
+        elif self._scenarios:
+            cmd.extend(["--scenarios"] + list(self._scenarios))
+
+        return cmd
+
+    # ------------------------------------------------------------------
+    # Results parsing
+    # ------------------------------------------------------------------
+
+    def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dict[str, Any]:
+        """Build the result bundle (scores, run config, exit code) from the summary under output_dir."""
+        summary = self._load_result_summary(output_dir)
+        return {
+            "benchmark": {
+                "name": self.name,
+                # "per_category" may be missing or null; both count as zero categories.
+                "samples": summary.get("num_scenarios", len(summary.get("per_category") or {})),
+                "scores": self._extract_scores(summary),
+            },
+            "config": {
+                "benchmark": self.name,
+                "image": self._image,
+                "model": model_id,
+                "user_model": self._user_model,
+                "framework": "toolsandbox",
+                "scenarios": self._scenarios or "all",
+                "test_mode": self._test_mode,
+            },
+            "_container_exit_code": exit_code,
+        }
+
+    def _load_result_summary(self, output_dir: Path) -> dict[str, Any]:
+        for candidate in sorted(output_dir.rglob("result_summary.json")):
+            try:
+                return json.loads(candidate.read_text(encoding="utf-8"))
+            except (json.JSONDecodeError, OSError) as exc:
+                logger.warning("Could not parse %s: %s", candidate, exc)
+        logger.warning("No result_summary.json found in %s", output_dir)
+        return {}
+
+    @staticmethod
+    def _extract_scores(summary: dict[str, Any]) -> dict[str, Any]:
+        scores: dict[str, Any] = {}
+
+        for metric in ("similarity", "turn_count"):
+            if metric in summary:
+                scores[metric] = {"value": round(float(summary[metric]), 4)}
+
+        per_category = summary.get("per_category") or {}
+        for cat_name, cat_data in per_category.items():
+            if isinstance(cat_data, dict) and "similarity" in cat_data:
+                scores[f"per_category/{cat_name}/similarity"] = {
+                    "value": round(float(cat_data["similarity"]), 4)
+                }
+
+        return scores
diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py
new file mode 100644
index 000000000..241152e19
--- /dev/null
+++ b/tests/test_environments/test_toolsandbox.py
@@ -0,0 +1,268 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Offline tests for ToolSandboxEnvironment (no Docker, no network)."""
+
+from __future__ import annotations
+
+import json
+import pytest
+
+from nemo_evaluator.benchmarks.toolsandbox import ToolSandboxEnvironment, _to_openai_base_url
+
+
+# ---------------------------------------------------------------------------
+# URL normalization
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "url, expected",
+    [
+        (
+            "https://integrate.api.nvidia.com/v1/chat/completions",
+            "https://integrate.api.nvidia.com/v1",
+        ),
+        (
+            "https://integrate.api.nvidia.com/v1/completions",
+            "https://integrate.api.nvidia.com/v1",
+        ),
+        (
+            "https://integrate.api.nvidia.com/v1/responses",
+            "https://integrate.api.nvidia.com/v1",
+        ),
+        (
+            "https://integrate.api.nvidia.com/v1",
+            "https://integrate.api.nvidia.com/v1",
+        ),
+        (
+            "http://localhost:8000/v1/chat/completions",
+            "http://localhost:8000/v1",
+        ),
+        (
+            "http://localhost:8000/v1",
+            "http://localhost:8000/v1",
+        ),
+    ],
+    ids=[
+        "strip_chat_completions",
+        "strip_completions",
+        "strip_responses",
+        "no_op",
+        "localhost_with_suffix",
+        "localhost_base",
+    ],
+)
+def test_to_openai_base_url(url: str, expected: str) -> None:
+    assert _to_openai_base_url(url) == expected
+
+
+# ---------------------------------------------------------------------------
+# ToolSandboxEnvironment construction and defaults
+# ---------------------------------------------------------------------------
+
+
+def test_default_construction() -> None:
+    env = ToolSandboxEnvironment()
+    assert env._image == "toolsandbox-nel:latest"
+    assert env._user_model == "meta/llama-3.1-70b-instruct"
+    assert env._scenarios == []
+    assert env._parallel == 4
+    assert not env._test_mode
+
+
+def test_custom_params() -> None:
+    env = ToolSandboxEnvironment(
+        image="toolsandbox-nel:v1.2",
+        user_model="meta/llama-3.1-8b-instruct",
+        scenarios=["wifi_off", "cellular_off"],
+        parallel=8,
+        test_mode=True,
+    )
+    assert env._image == "toolsandbox-nel:v1.2"
+    assert env._scenarios == ["wifi_off", "cellular_off"]
+    assert env._parallel == 8
+    assert env._test_mode
+
+
+# ---------------------------------------------------------------------------
+# Docker command construction
+# ---------------------------------------------------------------------------
+
+
+def test_docker_cmd_no_scenarios() -> None:
+    env = ToolSandboxEnvironment()
+    cmd = env._build_docker_cmd(
+        output_dir=__import__("pathlib").Path("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="nvidia/nemotron-3-super-120b-a12b",
+        api_key="test-key",
+    )
+    assert "docker" in cmd
+    assert "--agent" in cmd
+    assert "Gorilla" in cmd
+    assert "--user" in cmd
+    assert "GPT_4_o_2024_05_13" in cmd
+    assert "--scenarios" not in cmd
+    assert "--test_mode" not in cmd
+    assert "NVIDIA_AGENT_MODEL=nvidia/nemotron-3-super-120b-a12b" in " ".join(cmd)
+
+
+def test_docker_cmd_with_scenarios() -> None:
+    env = ToolSandboxEnvironment(scenarios=["wifi_off", "make_call"])
+    cmd = env._build_docker_cmd(
+        output_dir=__import__("pathlib").Path("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="key",
+    )
+    idx = cmd.index("--scenarios")
+    assert cmd[idx + 1] == "wifi_off"
+    assert cmd[idx + 2] == "make_call"
+
+
+def test_docker_cmd_test_mode() -> None:
+    env = ToolSandboxEnvironment(test_mode=True)
+    cmd = env._build_docker_cmd(
+        output_dir=__import__("pathlib").Path("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="key",
+    )
+    assert "--test_mode" in cmd
+    assert "--scenarios" not in cmd
+
+
+def test_docker_cmd_no_api_key() -> None:
+    env = ToolSandboxEnvironment()
+    cmd = env._build_docker_cmd(
+        output_dir=__import__("pathlib").Path("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="",
+    )
+    cmd_str = " ".join(cmd)
+    assert "NVIDIA_API_KEY" not in cmd_str
+
+
+# ---------------------------------------------------------------------------
+# Score extraction
+# ---------------------------------------------------------------------------
+
+
+def test_extract_scores_full() -> None:
+    summary = {
+        "similarity": 0.72,
+        "turn_count": 4.3,
+        "per_category": {
+            "single_tool_call": {"similarity": 0.91, "count": 45},
+            "multiple_tool_call": {"similarity": 0.61, "count": 20},
+        },
+    }
+    scores = ToolSandboxEnvironment._extract_scores(summary)
+
+    assert scores["similarity"]["value"] == pytest.approx(0.72, abs=1e-4)
+    assert scores["turn_count"]["value"] == pytest.approx(4.3, abs=1e-4)
+    assert "per_category/single_tool_call/similarity" in scores
+    assert scores["per_category/single_tool_call/similarity"]["value"] == pytest.approx(0.91, abs=1e-4)
+    assert "per_category/multiple_tool_call/similarity" in scores
+
+
+def test_extract_scores_empty_summary() -> None:
+    scores = ToolSandboxEnvironment._extract_scores({})
+    assert scores == {}
+
+
+def test_extract_scores_no_categories() -> None:
+    summary = {"similarity": 0.5}
+    scores = ToolSandboxEnvironment._extract_scores(summary)
+    assert scores == {"similarity": {"value": 0.5}}
+
+
+def test_extract_scores_category_missing_similarity() -> None:
+    summary = {
+        "similarity": 0.8,
+        "per_category": {"single_tool_call": {"count": 10}},
+    }
+    scores = ToolSandboxEnvironment._extract_scores(summary)
+    assert "per_category/single_tool_call/similarity" not in scores
+    assert scores["similarity"]["value"] == pytest.approx(0.8)
+
+
+# ---------------------------------------------------------------------------
+# Result summary loading (from temp directory)
+# ---------------------------------------------------------------------------
+
+
+def test_load_result_summary(tmp_path):
+    env = ToolSandboxEnvironment()
+
+    # Simulate ToolSandbox output structure
+    run_dir = tmp_path / "agent_Gorilla_user_GPT_4_o_20240513_12345"
+    run_dir.mkdir()
+    summary_data = {"similarity": 0.65, "turn_count": 3.1}
+    (run_dir / "result_summary.json").write_text(json.dumps(summary_data))
+
+    result = env._load_result_summary(tmp_path)
+    assert result == summary_data
+
+
+def test_load_result_summary_missing(tmp_path) -> None:
+    env = ToolSandboxEnvironment()
+    result = env._load_result_summary(tmp_path)
+    assert result == {}
+
+
+# ---------------------------------------------------------------------------
+# Bundle structure
+# ---------------------------------------------------------------------------
+
+
+def test_parse_results_bundle_keys(tmp_path) -> None:
+    env = ToolSandboxEnvironment()
+
+    run_dir = tmp_path / "run_1"
+    run_dir.mkdir()
+    (run_dir / "result_summary.json").write_text(
+        json.dumps({"similarity": 0.7, "turn_count": 5.0})
+    )
+
+    bundle = env._parse_results(tmp_path, exit_code=0, model_id="my-model")
+
+    assert "benchmark" in bundle
+    assert "config" in bundle
+    assert bundle["benchmark"]["name"] == "toolsandbox"
+    assert bundle["benchmark"]["scores"]["similarity"]["value"] == pytest.approx(0.7)
+    assert bundle["config"]["framework"] == "toolsandbox"
+    assert bundle["config"]["model"] == "my-model"
+    assert bundle["_container_exit_code"] == 0
+
+
+# ---------------------------------------------------------------------------
+# seed/verify raise NotImplementedError
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_seed_raises() -> None:
+    env = ToolSandboxEnvironment()
+    with pytest.raises(NotImplementedError):
+        await env.seed(0)
+
+
+@pytest.mark.asyncio
+async def test_verify_raises() -> None:
+    env = ToolSandboxEnvironment()
+    with pytest.raises(NotImplementedError):
+        await env.verify("response", "expected")

From e633f976b041fdef6915c63fa418b97fdf602dea Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Mon, 18 May 2026 15:06:08 -0700
Subject: [PATCH 2/6] feat(benchmarks): add subprocess/apptainer runners for
 SLURM

Adds two runner modes to ToolSandboxEnvironment:
- apptainer: runs toolsandbox-nel.sif via `apptainer run` (SLURM with
  apptainer, no Docker needed)
- subprocess: runs toolsandbox_entrypoint.py directly as a Python
  subprocess (eval container has ToolSandbox pre-installed in a venv,
  zero nesting)

Also adds Dockerfile.toolsandbox-combined (ToolSandbox in isolated
/opt/toolsandbox-venv alongside NEL Next) and a SLURM example config
that uses the subprocess runner.

24 offline tests, all green.

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 docker/Dockerfile.toolsandbox-combined       |  46 ++++++
 examples/configs/toolsandbox_slurm.yaml      |  56 +++++++
 src/nemo_evaluator/benchmarks/toolsandbox.py | 156 +++++++++++++------
 tests/test_environments/test_toolsandbox.py  | 104 +++++++++++--
 4 files changed, 302 insertions(+), 60 deletions(-)
 create mode 100644 docker/Dockerfile.toolsandbox-combined
 create mode 100644 examples/configs/toolsandbox_slurm.yaml

diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined
new file mode 100644
index 000000000..8c811e88e
--- /dev/null
+++ b/docker/Dockerfile.toolsandbox-combined
@@ -0,0 +1,46 @@
+# ToolSandbox + NEL Next combined evaluation container.
+#
+# Used for SLURM runs where nested Docker is unavailable.  ToolSandbox is
+# installed in an isolated venv (/opt/toolsandbox-venv) so its pinned
+# dependencies (openai==1.17.0, etc.) do not conflict with NEL Next.
+#
+# Build:
+#   docker build -f docker/Dockerfile.toolsandbox-combined \
+#     -t toolsandbox-nel-combined:latest .
+#
+# Convert to squashfs for SLURM (run on a login node with Docker):
+#   enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
+#     dockerd://toolsandbox-nel-combined:latest
+#
+# NEL Next config (subprocess runner):
+#   benchmarks:
+#     - name: toolsandbox
+#       params:
+#         runner: subprocess
+#         python_exe: /opt/toolsandbox-venv/bin/python
+#         entrypoint: /opt/toolsandbox_entrypoint.py
+#
+# Env vars read by the entrypoint at runtime:
+#   NVIDIA_BASE_URL    – OpenAI-compatible base URL (required)
+#   NVIDIA_API_KEY     – API key (required)
+#   NVIDIA_AGENT_MODEL – model ID for the agent under evaluation (required)
+#   NVIDIA_USER_MODEL  – model ID for user simulator (optional; default: meta/llama-3.1-70b-instruct)
+
+ARG BASE_IMAGE=nemo-evaluator
+
+FROM ${BASE_IMAGE}
+
+# Install Python 3.11 for the ToolSandbox venv (avoids openai version conflicts
+# with NEL Next which uses the system Python 3.12)
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends python3.11 python3.11-venv \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG TOOLSANDBOX_REF=main
+
+# Create an isolated venv for ToolSandbox with its pinned deps
+RUN python3.11 -m venv /opt/toolsandbox-venv \
+    && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \
+       "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}"
+
+COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml
new file mode 100644
index 000000000..cc73df779
--- /dev/null
+++ b/examples/configs/toolsandbox_slurm.yaml
@@ -0,0 +1,56 @@
+# Flavor: ToolSandbox on SLURM (subprocess runner — no nested Docker)
+#
+# Uses the subprocess runner so ToolSandbox (pre-installed in
+# /opt/toolsandbox-venv inside the eval container) runs directly without
+# a nested Docker call.  Suitable for any SLURM cluster.
+#
+# Prerequisites:
+#   - NVIDIA_API_KEY set in environment (or in cluster.container_env)
+#   - toolsandbox-nel-combined squashfs on shared storage:
+#       docker build -f docker/Dockerfile.toolsandbox-combined \
+#           -t toolsandbox-nel-combined:latest .
+#       enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
+#           dockerd://toolsandbox-nel-combined:latest
+#   - SSH access to the SLURM login node
+#
+# Dry-run: nel eval run examples/configs/toolsandbox_slurm.yaml --dry-run
+# Submit:  nel eval run examples/configs/toolsandbox_slurm.yaml --submit
+
+services:
+  nemotron:
+    type: api
+    url: https://integrate.api.nvidia.com/v1/chat/completions
+    protocol: chat_completions
+    model: nvidia/nemotron-3-super-120b-a12b
+    api_key: ${NVIDIA_API_KEY}
+
+benchmarks:
+  - name: toolsandbox
+    params:
+      runner: subprocess
+      python_exe: /opt/toolsandbox-venv/bin/python
+      entrypoint: /opt/toolsandbox_entrypoint.py
+      user_model: meta/llama-3.1-70b-instruct
+      parallel: 8
+      test_mode: false
+    solver:
+      type: simple
+      service: nemotron
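+    timeout: 14400.0  # NOTE(review): equals cluster.walltime (04:00:00); confirm it should be lower so the timeout can fire before SLURM ends the job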
+
+output:
+  dir: ./results/toolsandbox_slurm
+  report: [markdown, json]
+
+cluster:
+  type: slurm
+  hostname: ${SLURM_LOGIN_HOST}
+  account: ${SLURM_ACCOUNT}
+  walltime: "04:00:00"
+  eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh
+  container_env:
+    NVIDIA_API_KEY: ${NVIDIA_API_KEY}
+  node_pools:
+    default:
+      partition: cpu
+      nodes: 1
diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py
index 53c147cee..bc49b2448 100644
--- a/src/nemo_evaluator/benchmarks/toolsandbox.py
+++ b/src/nemo_evaluator/benchmarks/toolsandbox.py
@@ -15,31 +15,53 @@
 """ToolSandbox benchmark — Apple's stateful multi-turn tool-use evaluation.
 
 Registers ``toolsandbox`` as a built-in benchmark.  Bypasses the standard
-seed/solve/verify loop via ``run_batch()``, which spawns a pre-built Docker
-image containing ToolSandbox and parses the resulting ``result_summary.json``.
+seed/solve/verify loop via ``run_batch()``, which runs ToolSandbox in one of
+three modes and parses the resulting ``result_summary.json``.
+
+Runner modes
+------------
+docker (default)
+    Spawns a pre-built Docker image.  Requires Docker on the eval host.
+
+    Build the image once::
+
+        docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+
+apptainer
+    Same image as ``docker`` mode but executed via ``apptainer run``.
+    Use on SLURM clusters where Docker is unavailable.  ``image`` should be
+    a path to a ``.sif`` or ``.sqsh`` file on the shared filesystem.
+
+subprocess
+    Runs the ToolSandbox entrypoint directly as a Python subprocess — no
+    container needed.  Use when the eval container already has ToolSandbox
+    pre-installed (e.g. ``Dockerfile.toolsandbox-combined``).  Set
+    ``python_exe`` to the venv Python that has ToolSandbox, and
+    ``entrypoint`` to the patch script path.
 
 Config usage::
 
     benchmarks:
       - name: toolsandbox
         params:
-          image: toolsandbox-nel:latest           # pre-built Docker image
-          user_model: meta/llama-3.1-70b-instruct # user-simulator model
-          parallel: 4                             # concurrent scenarios
-          test_mode: false                        # true = small subset only
-          scenarios: []                           # [] = all scenarios
+          # --- runner selection ---
+          runner: docker                           # docker | apptainer | subprocess
+          image: toolsandbox-nel:latest            # docker image name / sif path
+          # --- subprocess-mode overrides ---
+          python_exe: /opt/toolsandbox-venv/bin/python
+          entrypoint: /opt/toolsandbox_entrypoint.py
+          # --- benchmark settings ---
+          user_model: meta/llama-3.1-70b-instruct  # user-simulator model
+          parallel: 4                              # concurrent scenarios
+          test_mode: false                         # true = small subset only
+          scenarios: []                            # [] = all scenarios
         solver:
           type: simple
           service: my_model
         timeout: 7200.0
 
-Build the image once before running::
-
-    docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
-
-The agent under evaluation is taken from the ``solver.service`` entry.
-The user simulator calls the same NVIDIA API base URL with ``user_model``.
-Both require ``NVIDIA_API_KEY`` in the environment.
+Both agent and user simulator call the NVIDIA Inference API — no OpenAI
+key required.  ``NVIDIA_API_KEY`` must be set in the environment.
 """
 
 from __future__ import annotations
@@ -48,9 +70,10 @@
 import json
 import logging
 import os
+import sys
 import tempfile
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal
 
 from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
 from nemo_evaluator.environments.registry import register
@@ -63,21 +86,15 @@
 _DEFAULT_IMAGE = "toolsandbox-nel:latest"
 _DEFAULT_USER_MODEL = "meta/llama-3.1-70b-instruct"
 _DEFAULT_BASE_URL = "https://integrate.api.nvidia.com/v1"
+_DEFAULT_ENTRYPOINT = "/opt/toolsandbox_entrypoint.py"
 _CONTAINER_OUTPUT = "/output"
 
-# ToolSandbox uses --agent Gorilla / --user GPT_4_o_2024_05_13 as the CLI
-# selectors; the entrypoint script patches those factory entries to point at
-# NVIDIANIMAgent / NVIDIANIMUser backed by NVIDIA_BASE_URL.
 _CLI_AGENT = "Gorilla"
 _CLI_USER = "GPT_4_o_2024_05_13"
 
 
 def _to_openai_base_url(url: str) -> str:
-    """Normalize a NEL service URL to the OpenAI SDK base_url format.
-
-    NEL service URLs include the full path (e.g. /v1/chat/completions).
-    ToolSandbox expects just the base (e.g. https://host/v1).
-    """
+    """Strip /chat/completions, /completions, /responses path suffix from NEL service URLs."""
     for suffix in ("/chat/completions", "/completions", "/responses"):
         if url.endswith(suffix):
             return url[: -len(suffix)]
@@ -86,15 +103,18 @@ def _to_openai_base_url(url: str) -> str:
 
 @register("toolsandbox")
 class ToolSandboxEnvironment(EvalEnvironment):
-    """Runs ToolSandbox in a Docker container and parses aggregate metrics.
+    """Runs ToolSandbox and parses aggregate metrics.
 
-    The entire scenario suite executes as a single batch inside the container.
+    The entire scenario suite executes as a single batch.
     ``seed()`` and ``verify()`` are not used.
     """
 
     def __init__(
         self,
+        runner: Literal["docker", "apptainer", "subprocess"] = "docker",
         image: str = _DEFAULT_IMAGE,
+        python_exe: str | None = None,
+        entrypoint: str = _DEFAULT_ENTRYPOINT,
         user_model: str = _DEFAULT_USER_MODEL,
         scenarios: list[str] | None = None,
         parallel: int = 4,
@@ -102,7 +122,10 @@ def __init__(
         test_mode: bool = False,
     ) -> None:
         super().__init__()
+        self._runner = runner
         self._image = image
+        self._python_exe = python_exe or sys.executable
+        self._entrypoint = entrypoint
         self._user_model = user_model
         self._scenarios: list[str] = scenarios or []
         self._parallel = parallel
@@ -144,13 +167,14 @@ async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = No
             output_dir = Path(tmpdir) / "output"
             output_dir.mkdir()
 
-            cmd = self._build_docker_cmd(output_dir, base_url, model_id, api_key)
-            logger.info("Launching ToolSandbox: %s", " ".join(cmd[:10]) + " ...")
+            cmd, env = self._build_cmd(output_dir, base_url, model_id, api_key)
+            logger.info("Launching ToolSandbox (%s): %s", self._runner, " ".join(cmd[:10]) + " ...")
 
             proc = await asyncio.create_subprocess_exec(
                 *cmd,
                 stdout=asyncio.subprocess.PIPE,
                 stderr=asyncio.subprocess.PIPE,
+                env=env,
             )
             try:
                 stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=self._timeout)
@@ -163,7 +187,7 @@ async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = No
             rc = proc.returncode or 0
             if rc != 0:
                 logger.error(
-                    "ToolSandbox container exited %d:\n%s",
+                    "ToolSandbox exited %d:\n%s",
                     rc,
                     (stderr or b"").decode(errors="replace")[:2000],
                 )
@@ -171,39 +195,82 @@ async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = No
             return self._parse_results(output_dir, rc, model_id)
 
     # ------------------------------------------------------------------
-    # Docker command builder
+    # Command builders
     # ------------------------------------------------------------------
 
-    def _build_docker_cmd(
+    def _build_cmd(
         self,
         output_dir: Path,
         base_url: str,
         model_id: str,
         api_key: str,
-    ) -> list[str]:
-        cmd = [
-            "docker", "run", "--rm",
-            "-v", f"{output_dir}:{_CONTAINER_OUTPUT}",
+    ) -> tuple[list[str], dict[str, str] | None]:
+        """Return (cmd, env) for the selected runner."""
+        if self._runner == "subprocess":
+            return self._build_subprocess_cmd(output_dir, base_url, model_id, api_key)
+        if self._runner == "apptainer":
+            return self._build_apptainer_cmd(output_dir, base_url, model_id, api_key), None
+        return self._build_docker_cmd(output_dir, base_url, model_id, api_key), None
+
+    def _toolsandbox_cli_args(self, output_dir_str: str) -> list[str]:
+        args = ["--agent", _CLI_AGENT, "--user", _CLI_USER]
+        args.extend(["--output_dir", output_dir_str])
+        args.extend(["--parallel", str(self._parallel)])
+        if self._test_mode:
+            args.append("--test_mode")
+        elif self._scenarios:
+            args.extend(["--scenarios"] + list(self._scenarios))
+        return args
+
+    def _container_env_flags(self, base_url: str, model_id: str, api_key: str) -> list[str]:
+        flags = [
             "-e", f"NVIDIA_BASE_URL={base_url}",
             "-e", f"NVIDIA_AGENT_MODEL={model_id}",
             "-e", f"NVIDIA_USER_MODEL={self._user_model}",
         ]
         if api_key:
-            cmd.extend(["-e", f"NVIDIA_API_KEY={api_key}"])
+            flags.extend(["-e", f"NVIDIA_API_KEY={api_key}"])
+        return flags
 
+    def _build_docker_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]:
+        cmd = ["docker", "run", "--rm", "-v", f"{output_dir}:{_CONTAINER_OUTPUT}"]
+        cmd.extend(self._container_env_flags(base_url, model_id, api_key))
         cmd.append(self._image)
+        cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT))
+        return cmd
 
-        cmd.extend(["--agent", _CLI_AGENT, "--user", _CLI_USER])
-        cmd.extend(["--output_dir", _CONTAINER_OUTPUT])
-        cmd.extend(["--parallel", str(self._parallel)])
-
-        if self._test_mode:
-            cmd.append("--test_mode")
-        elif self._scenarios:
-            cmd.extend(["--scenarios"] + list(self._scenarios))
+    def _build_apptainer_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]:
+        env_flags: list[str] = []
+        for flag in self._container_env_flags(base_url, model_id, api_key):
+            if flag == "-e":
+                continue
+            env_flags.extend(["--env", flag])
 
+        cmd = [
+            "apptainer", "run", "--bind", f"{output_dir}:{_CONTAINER_OUTPUT}",
+            *env_flags,
+            self._image,
+        ]
+        cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT))
         return cmd
 
+    def _build_subprocess_cmd(
+        self, output_dir: Path, base_url: str, model_id: str, api_key: str
+    ) -> tuple[list[str], dict[str, str]]:
+        # API settings reach the entrypoint through the child's environment: inherited os.environ plus the overrides below
+        env = {
+            **os.environ,
+            "NVIDIA_BASE_URL": base_url,
+            "NVIDIA_AGENT_MODEL": model_id,
+            "NVIDIA_USER_MODEL": self._user_model,
+        }
+        if api_key:
+            env["NVIDIA_API_KEY"] = api_key
+
+        cmd = [self._python_exe, self._entrypoint]
+        cmd.extend(self._toolsandbox_cli_args(str(output_dir)))
+        return cmd, env
+
     # ------------------------------------------------------------------
     # Results parsing
     # ------------------------------------------------------------------
@@ -220,7 +287,8 @@ def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dic
             },
             "config": {
                 "benchmark": self.name,
-                "image": self._image,
+                "runner": self._runner,
+                "image": self._image if self._runner != "subprocess" else None,
                 "model": model_id,
                 "user_model": self._user_model,
                 "framework": "toolsandbox",
diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py
index 241152e19..9f2006734 100644
--- a/tests/test_environments/test_toolsandbox.py
+++ b/tests/test_environments/test_toolsandbox.py
@@ -75,6 +75,7 @@ def test_to_openai_base_url(url: str, expected: str) -> None:
 
 def test_default_construction() -> None:
     env = ToolSandboxEnvironment()
+    assert env._runner == "docker"
     assert env._image == "toolsandbox-nel:latest"
     assert env._user_model == "meta/llama-3.1-70b-instruct"
     assert env._scenarios == []
@@ -84,13 +85,16 @@ def test_default_construction() -> None:
 
 def test_custom_params() -> None:
     env = ToolSandboxEnvironment(
+        runner="subprocess",
         image="toolsandbox-nel:v1.2",
+        python_exe="/opt/toolsandbox-venv/bin/python",
         user_model="meta/llama-3.1-8b-instruct",
         scenarios=["wifi_off", "cellular_off"],
         parallel=8,
         test_mode=True,
     )
-    assert env._image == "toolsandbox-nel:v1.2"
+    assert env._runner == "subprocess"
+    assert env._python_exe == "/opt/toolsandbox-venv/bin/python"
     assert env._scenarios == ["wifi_off", "cellular_off"]
     assert env._parallel == 8
     assert env._test_mode
@@ -100,16 +104,18 @@ def test_custom_params() -> None:
 # Docker command construction
 # ---------------------------------------------------------------------------
 
+_P = __import__("pathlib").Path
+
 
 def test_docker_cmd_no_scenarios() -> None:
-    env = ToolSandboxEnvironment()
-    cmd = env._build_docker_cmd(
-        output_dir=__import__("pathlib").Path("/tmp/output"),
+    env = ToolSandboxEnvironment(runner="docker")
+    cmd, _ = env._build_cmd(
+        output_dir=_P("/tmp/output"),
         base_url="https://integrate.api.nvidia.com/v1",
         model_id="nvidia/nemotron-3-super-120b-a12b",
         api_key="test-key",
     )
-    assert "docker" in cmd
+    assert cmd[0] == "docker"
     assert "--agent" in cmd
     assert "Gorilla" in cmd
     assert "--user" in cmd
@@ -120,9 +126,9 @@ def test_docker_cmd_no_scenarios() -> None:
 
 
 def test_docker_cmd_with_scenarios() -> None:
-    env = ToolSandboxEnvironment(scenarios=["wifi_off", "make_call"])
-    cmd = env._build_docker_cmd(
-        output_dir=__import__("pathlib").Path("/tmp/output"),
+    env = ToolSandboxEnvironment(runner="docker", scenarios=["wifi_off", "make_call"])
+    cmd, _ = env._build_cmd(
+        output_dir=_P("/tmp/output"),
         base_url="https://integrate.api.nvidia.com/v1",
         model_id="test-model",
         api_key="key",
@@ -133,9 +139,9 @@ def test_docker_cmd_with_scenarios() -> None:
 
 
 def test_docker_cmd_test_mode() -> None:
-    env = ToolSandboxEnvironment(test_mode=True)
-    cmd = env._build_docker_cmd(
-        output_dir=__import__("pathlib").Path("/tmp/output"),
+    env = ToolSandboxEnvironment(runner="docker", test_mode=True)
+    cmd, _ = env._build_cmd(
+        output_dir=_P("/tmp/output"),
         base_url="https://integrate.api.nvidia.com/v1",
         model_id="test-model",
         api_key="key",
@@ -145,15 +151,80 @@ def test_docker_cmd_test_mode() -> None:
 
 
 def test_docker_cmd_no_api_key() -> None:
-    env = ToolSandboxEnvironment()
-    cmd = env._build_docker_cmd(
-        output_dir=__import__("pathlib").Path("/tmp/output"),
+    env = ToolSandboxEnvironment(runner="docker")
+    cmd, _ = env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="",
+    )
+    assert "NVIDIA_API_KEY" not in " ".join(cmd)
+
+
+# ---------------------------------------------------------------------------
+# Apptainer command construction
+# ---------------------------------------------------------------------------
+
+
+def test_apptainer_cmd_basics() -> None:
+    env = ToolSandboxEnvironment(runner="apptainer", image="/shared/nel/toolsandbox.sif")
+    cmd, _ = env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="key",
+    )
+    assert cmd[0] == "apptainer"
+    assert "run" in cmd
+    assert "--bind" in cmd
+    assert "/shared/nel/toolsandbox.sif" in cmd
+    assert "--agent" in cmd
+
+
+# ---------------------------------------------------------------------------
+# Subprocess command construction
+# ---------------------------------------------------------------------------
+
+
+def test_subprocess_cmd_basics() -> None:
+    env = ToolSandboxEnvironment(
+        runner="subprocess",
+        python_exe="/opt/toolsandbox-venv/bin/python",
+        entrypoint="/opt/toolsandbox_entrypoint.py",
+    )
+    cmd, env_vars = env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="test-key",
+    )
+    assert cmd[0] == "/opt/toolsandbox-venv/bin/python"
+    assert cmd[1] == "/opt/toolsandbox_entrypoint.py"
+    assert "--agent" in cmd
+    assert "Gorilla" in cmd
+    # API config passed via environment, not CLI flags
+    assert env_vars["NVIDIA_BASE_URL"] == "https://integrate.api.nvidia.com/v1"
+    assert env_vars["NVIDIA_AGENT_MODEL"] == "test-model"
+    assert env_vars["NVIDIA_API_KEY"] == "test-key"
+    # output_dir is passed as the host path; the container mount point /output must not be an argument
+    assert "/tmp/output" in " ".join(cmd)
+    assert "/output" not in cmd
+
+
+def test_subprocess_cmd_no_api_key_does_not_override() -> None:
+    """When api_key='', we must not overwrite any existing env var with empty string."""
+    import os as _os
+    env_env = ToolSandboxEnvironment(runner="subprocess")
+    _, env_vars = env_env._build_cmd(
+        output_dir=_P("/tmp/output"),
         base_url="https://integrate.api.nvidia.com/v1",
         model_id="test-model",
         api_key="",
     )
-    cmd_str = " ".join(cmd)
-    assert "NVIDIA_API_KEY" not in cmd_str
+    # An empty api_key must leave NVIDIA_API_KEY exactly as inherited from
+    # os.environ: unchanged if it was set, still absent if it was not.
+    original = _os.environ.get("NVIDIA_API_KEY", "")
+    assert env_vars.get("NVIDIA_API_KEY", "") == original
 
 
 # ---------------------------------------------------------------------------
@@ -245,6 +316,7 @@ def test_parse_results_bundle_keys(tmp_path) -> None:
     assert bundle["benchmark"]["name"] == "toolsandbox"
     assert bundle["benchmark"]["scores"]["similarity"]["value"] == pytest.approx(0.7)
     assert bundle["config"]["framework"] == "toolsandbox"
+    assert bundle["config"]["runner"] == "docker"
     assert bundle["config"]["model"] == "my-model"
     assert bundle["_container_exit_code"] == 0
 

From 762f6d5a73ae4921d2f933a726473ad09fef28f4 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Mon, 18 May 2026 15:10:10 -0700
Subject: [PATCH 3/6] fix(toolsandbox): set OPENAI_API_KEY placeholder before
 parent __init__
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to build a client that
NVIDIANIMAgent immediately replaces — without the placeholder it raises
even though we never use the parent's client.

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 docker/toolsandbox_entrypoint.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py
index c8eea9fd4..44b25b1a6 100644
--- a/docker/toolsandbox_entrypoint.py
+++ b/docker/toolsandbox_entrypoint.py
@@ -59,6 +59,11 @@ def _register_nvidia_roles() -> None:
     agent_model = _require_env("NVIDIA_AGENT_MODEL")
     user_model = os.environ.get("NVIDIA_USER_MODEL", "meta/llama-3.1-70b-instruct")
 
+    # OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to create a temporary
+    # client that NVIDIANIMAgent immediately replaces.  Set a placeholder so
+    # the parent __init__ doesn't raise when the env var is absent.
+    os.environ.setdefault("OPENAI_API_KEY", api_key or "not-used")
+
     def _client() -> OpenAI:
         return OpenAI(base_url=base_url, api_key=api_key)
 

From 955baaaca65cd0be823eacd1f05b04c16ce3eba9 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Mon, 18 May 2026 15:10:51 -0700
Subject: [PATCH 4/6] fix(toolsandbox): pin httpx<0.28 to fix openai==1.17.0
 proxies compat

openai==1.17.0 (pinned by ToolSandbox) passes proxies= to httpx which
removed that argument in 0.28.0, causing TypeError at runtime.

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 docker/Dockerfile.toolsandbox          | 3 ++-
 docker/Dockerfile.toolsandbox-combined | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox
index a96b69614..a86c176af 100644
--- a/docker/Dockerfile.toolsandbox
+++ b/docker/Dockerfile.toolsandbox
@@ -28,7 +28,8 @@ ARG TOOLSANDBOX_REF=main
 # ToolSandbox pins openai==1.17.0 and other specific versions — install in
 # a clean environment separate from any NEL dependencies.
 RUN pip install --no-cache-dir \
-    "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}"
+    "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
+    "httpx<0.28.0"
 
 COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
 
diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined
index 8c811e88e..ab388fb70 100644
--- a/docker/Dockerfile.toolsandbox-combined
+++ b/docker/Dockerfile.toolsandbox-combined
@@ -41,6 +41,7 @@ ARG TOOLSANDBOX_REF=main
 # Create an isolated venv for ToolSandbox with its pinned deps
 RUN python3.11 -m venv /opt/toolsandbox-venv \
     && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \
-       "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}"
+       "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
+       "httpx<0.28.0"
 
 COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py

From e62aed8e2728c25ca44081eba9dec47a1eec895c Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Mon, 18 May 2026 15:14:50 -0700
Subject: [PATCH 5/6] fix(toolsandbox): correct result_summary.json parsing
 from smoke test

Smoke test revealed the actual output format uses category_aggregated_results
instead of a top-level similarity key.  Overall score comes from
ALL_CATEGORIES; per-category breakdown skips it to avoid duplication.
Sample count reads from per_scenario_results length.

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 src/nemo_evaluator/benchmarks/toolsandbox.py | 27 +++++++++++---
 tests/test_environments/test_toolsandbox.py  | 39 +++++++++++++-------
 2 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py
index bc49b2448..96dc67902 100644
--- a/src/nemo_evaluator/benchmarks/toolsandbox.py
+++ b/src/nemo_evaluator/benchmarks/toolsandbox.py
@@ -282,7 +282,7 @@ def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dic
         return {
             "benchmark": {
                 "name": self.name,
-                "samples": summary.get("num_scenarios", len(summary.get("per_category", {}))),
+                "samples": len(summary.get("per_scenario_results", [])),
                 "scores": scores,
             },
             "config": {
@@ -309,14 +309,29 @@ def _load_result_summary(self, output_dir: Path) -> dict[str, Any]:
 
     @staticmethod
     def _extract_scores(summary: dict[str, Any]) -> dict[str, Any]:
+        """Extract scores from ToolSandbox result_summary.json.
+
+        Real format (confirmed from smoke test):
+          category_aggregated_results:
+            ALL_CATEGORIES: {similarity: float, turn_count: float}
+            STATE_DEPENDENCY: {similarity: float, turn_count: float}
+            ...
+        """
         scores: dict[str, Any] = {}
 
-        for metric in ("similarity", "turn_count"):
-            if metric in summary:
-                scores[metric] = {"value": round(float(summary[metric]), 4)}
+        cat_results: dict[str, Any] = summary.get("category_aggregated_results") or {}
 
-        per_category = summary.get("per_category") or {}
-        for cat_name, cat_data in per_category.items():
+        # Overall score comes from the ALL_CATEGORIES aggregate
+        all_cat = cat_results.get("ALL_CATEGORIES") or {}
+        if "similarity" in all_cat:
+            scores["similarity"] = {"value": round(float(all_cat["similarity"]), 4)}
+        if "turn_count" in all_cat:
+            scores["turn_count"] = {"value": round(float(all_cat["turn_count"]), 2)}
+
+        # Per-category breakdown (skip ALL_CATEGORIES to avoid duplication)
+        for cat_name, cat_data in cat_results.items():
+            if cat_name == "ALL_CATEGORIES":
+                continue
             if isinstance(cat_data, dict) and "similarity" in cat_data:
                 scores[f"per_category/{cat_name}/similarity"] = {
                     "value": round(float(cat_data["similarity"]), 4)
diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py
index 9f2006734..642a97d38 100644
--- a/tests/test_environments/test_toolsandbox.py
+++ b/tests/test_environments/test_toolsandbox.py
@@ -233,21 +233,24 @@ def test_subprocess_cmd_no_api_key_does_not_override() -> None:
 
 
 def test_extract_scores_full() -> None:
+    # Real format confirmed by smoke test
     summary = {
-        "similarity": 0.72,
-        "turn_count": 4.3,
-        "per_category": {
-            "single_tool_call": {"similarity": 0.91, "count": 45},
-            "multiple_tool_call": {"similarity": 0.61, "count": 20},
+        "per_scenario_results": [],
+        "category_aggregated_results": {
+            "ALL_CATEGORIES": {"similarity": 0.72, "turn_count": 4.3},
+            "STATE_DEPENDENCY": {"similarity": 0.91, "turn_count": 3.1},
+            "MULTIPLE_TOOL_CALL": {"similarity": 0.61, "turn_count": 5.0},
         },
     }
     scores = ToolSandboxEnvironment._extract_scores(summary)
 
     assert scores["similarity"]["value"] == pytest.approx(0.72, abs=1e-4)
-    assert scores["turn_count"]["value"] == pytest.approx(4.3, abs=1e-4)
-    assert "per_category/single_tool_call/similarity" in scores
-    assert scores["per_category/single_tool_call/similarity"]["value"] == pytest.approx(0.91, abs=1e-4)
-    assert "per_category/multiple_tool_call/similarity" in scores
+    assert scores["turn_count"]["value"] == pytest.approx(4.3, abs=1e-2)
+    assert "per_category/STATE_DEPENDENCY/similarity" in scores
+    assert scores["per_category/STATE_DEPENDENCY/similarity"]["value"] == pytest.approx(0.91, abs=1e-4)
+    assert "per_category/MULTIPLE_TOOL_CALL/similarity" in scores
+    # ALL_CATEGORIES is not duplicated as a per_category entry
+    assert "per_category/ALL_CATEGORIES/similarity" not in scores
 
 
 def test_extract_scores_empty_summary() -> None:
@@ -256,18 +259,20 @@ def test_extract_scores_empty_summary() -> None:
 
 
 def test_extract_scores_no_categories() -> None:
-    summary = {"similarity": 0.5}
+    summary = {"category_aggregated_results": {"ALL_CATEGORIES": {"similarity": 0.5}}}
     scores = ToolSandboxEnvironment._extract_scores(summary)
     assert scores == {"similarity": {"value": 0.5}}
 
 
 def test_extract_scores_category_missing_similarity() -> None:
     summary = {
-        "similarity": 0.8,
-        "per_category": {"single_tool_call": {"count": 10}},
+        "category_aggregated_results": {
+            "ALL_CATEGORIES": {"similarity": 0.8, "turn_count": 3.0},
+            "STATE_DEPENDENCY": {"turn_count": 3.0},  # no similarity
+        }
     }
     scores = ToolSandboxEnvironment._extract_scores(summary)
-    assert "per_category/single_tool_call/similarity" not in scores
+    assert "per_category/STATE_DEPENDENCY/similarity" not in scores
     assert scores["similarity"]["value"] == pytest.approx(0.8)
 
 
@@ -306,7 +311,12 @@ def test_parse_results_bundle_keys(tmp_path) -> None:
     run_dir = tmp_path / "run_1"
     run_dir.mkdir()
     (run_dir / "result_summary.json").write_text(
-        json.dumps({"similarity": 0.7, "turn_count": 5.0})
+        json.dumps({
+            "per_scenario_results": [{"name": "s1"}, {"name": "s2"}],
+            "category_aggregated_results": {
+                "ALL_CATEGORIES": {"similarity": 0.7, "turn_count": 5.0}
+            },
+        })
     )
 
     bundle = env._parse_results(tmp_path, exit_code=0, model_id="my-model")
@@ -314,6 +324,7 @@ def test_parse_results_bundle_keys(tmp_path) -> None:
     assert "benchmark" in bundle
     assert "config" in bundle
     assert bundle["benchmark"]["name"] == "toolsandbox"
+    assert bundle["benchmark"]["samples"] == 2
     assert bundle["benchmark"]["scores"]["similarity"]["value"] == pytest.approx(0.7)
     assert bundle["config"]["framework"] == "toolsandbox"
     assert bundle["config"]["runner"] == "docker"

From 6b3b416abc9f7980b9b8dd26e38ca27e069ee197 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Mon, 18 May 2026 15:23:22 -0700
Subject: [PATCH 6/6] chore(toolsandbox): use inference-api.nvidia.com + gpt-4o
 in example configs

Smoke test confirmed: inference-api.nvidia.com with azure/openai/gpt-4o
runs 3/3 scenarios with similarity=0.952, no errors, no rate limits.

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 examples/configs/toolsandbox.yaml       | 10 +++++-----
 examples/configs/toolsandbox_slurm.yaml |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/configs/toolsandbox.yaml b/examples/configs/toolsandbox.yaml
index 12aa87fe2..6e5db70ec 100644
--- a/examples/configs/toolsandbox.yaml
+++ b/examples/configs/toolsandbox.yaml
@@ -6,7 +6,7 @@
 # Inference API — no OpenAI key required.
 #
 # Prerequisites:
-#   - NVIDIA_API_KEY set in environment
+#   - INFERENCE_API_KEY set in environment
 #   - Docker running locally
 #   - toolsandbox-nel image built (once):
 #       docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
@@ -16,17 +16,17 @@
 services:
   nemotron:
     type: api
-    url: https://integrate.api.nvidia.com/v1/chat/completions
+    url: https://inference-api.nvidia.com/v1/chat/completions
     protocol: chat_completions
-    model: nvidia/nemotron-3-super-120b-a12b
-    api_key: ${NVIDIA_API_KEY}
+    model: azure/openai/gpt-4o
+    api_key: ${INFERENCE_API_KEY}
 
 benchmarks:
   - name: toolsandbox
     params:
       image: toolsandbox-nel:latest
       # Model used as user simulator (must be available on the same API)
-      user_model: meta/llama-3.1-70b-instruct
+      user_model: azure/openai/gpt-4o
       # Number of scenarios to run in parallel inside the container
       parallel: 4
       # Set to true to run only a small predefined subset for quick validation
diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml
index cc73df779..abc9842db 100644
--- a/examples/configs/toolsandbox_slurm.yaml
+++ b/examples/configs/toolsandbox_slurm.yaml
@@ -19,10 +19,10 @@
 services:
   nemotron:
     type: api
-    url: https://integrate.api.nvidia.com/v1/chat/completions
+    url: https://inference-api.nvidia.com/v1/chat/completions
     protocol: chat_completions
-    model: nvidia/nemotron-3-super-120b-a12b
-    api_key: ${NVIDIA_API_KEY}
+    model: azure/openai/gpt-4o
+    api_key: ${INFERENCE_API_KEY}
 
 benchmarks:
   - name: toolsandbox
@@ -49,7 +49,7 @@ cluster:
   walltime: "04:00:00"
   eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh
   container_env:
-    NVIDIA_API_KEY: ${NVIDIA_API_KEY}
+    INFERENCE_API_KEY: ${INFERENCE_API_KEY}
   node_pools:
     default:
       partition: cpu