NVIDIA-NeMo · wprazuch · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox
@@ -0,0 +1,36 @@
+# ToolSandbox evaluation container for NEL Next.
+#
+# Bundles Apple's ToolSandbox benchmark with a custom NVIDIA NIM agent/user
+# that routes model calls through any OpenAI-compatible endpoint.
+#
+# Build:
+#   docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+#
+# Pin to a specific commit for reproducible builds:
+#   docker build -f docker/Dockerfile.toolsandbox \
+#     --build-arg TOOLSANDBOX_REF=<commit-sha> \
+#     -t toolsandbox-nel:<commit-sha> .
+#
+# Env vars read at runtime (injected by ToolSandboxEnvironment.run_batch):
+#   NVIDIA_BASE_URL    – OpenAI-compatible base URL (required)
+#   NVIDIA_API_KEY     – API key (required)
+#   NVIDIA_AGENT_MODEL – Model ID for the agent under evaluation (required)
+#   NVIDIA_USER_MODEL  – Model ID for the user simulator (optional; default: meta/llama-3.1-70b-instruct)
+
+FROM python:3.11-slim
+
+# git is needed so pip can fetch the git+https requirement below.  The apt
+# package lists are removed in the same layer so they are not kept in the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Git ref (branch, tag, or commit SHA) of apple/ToolSandbox to install;
+# override with --build-arg TOOLSANDBOX_REF=... for reproducible builds.
+ARG TOOLSANDBOX_REF=main
+
+# ToolSandbox pins openai==1.17.0 and other specific versions — install in
+# a clean environment separate from any NEL dependencies.
+# NOTE(review): the httpx<0.28.0 cap is presumably required for compatibility
+# with the pinned openai client — confirm before relaxing it.
+RUN pip install --no-cache-dir \
+    "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
+    "httpx<0.28.0"
+
+# The source path is relative to the build context, so build from the
+# repository root (as in the "Build:" command in the header).
+COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
+
+# Exec-form entrypoint: arguments given to `docker run <image> ...` are
+# appended to this command and reach the entrypoint script as CLI args.
+ENTRYPOINT ["python", "/opt/toolsandbox_entrypoint.py"]
diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined
@@ -0,0 +1,47 @@
+# ToolSandbox + NEL Next combined evaluation container.
+#
+# Used for SLURM runs where nested Docker is unavailable.  ToolSandbox is
+# installed in an isolated venv (/opt/toolsandbox-venv) so its pinned
+# dependencies (openai==1.17.0, etc.) do not conflict with NEL Next.
+#
+# Build:
+#   docker build -f docker/Dockerfile.toolsandbox-combined \
+#     -t toolsandbox-nel-combined:latest .
+#
+# Convert to squashfs for SLURM (run on a login node with Docker):
+#   enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
+#     dockerd://toolsandbox-nel-combined:latest
+#
+# NEL Next config (subprocess runner):
+#   benchmarks:
+#     - name: toolsandbox
+#       params:
+#         runner: subprocess
+#         python_exe: /opt/toolsandbox-venv/bin/python
+#         entrypoint: /opt/toolsandbox_entrypoint.py
+#
+# Env vars read at runtime:
+#   NVIDIA_BASE_URL    – OpenAI-compatible base URL (required)
+#   NVIDIA_API_KEY     – API key (required)
+#   NVIDIA_AGENT_MODEL – model ID for the agent under evaluation (required)
+#   NVIDIA_USER_MODEL  – model ID for the user simulator (optional; default: meta/llama-3.1-70b-instruct)
+
+# Base image to extend; override with --build-arg BASE_IMAGE=<image>.
+ARG BASE_IMAGE=nemo-evaluator
+
+FROM ${BASE_IMAGE}
+
+# Install Python 3.11 for the ToolSandbox venv (avoids openai version conflicts
+# with NEL Next which uses the system Python 3.12).  git is installed as well
+# because pip needs it to fetch the git+https requirement below; this matches
+# docker/Dockerfile.toolsandbox and is a no-op when the base image already
+# ships git.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git python3.11 python3.11-venv \
+    && rm -rf /var/lib/apt/lists/*
+
+# Git ref (branch, tag, or commit SHA) of apple/ToolSandbox to install;
+# override with --build-arg TOOLSANDBOX_REF=... for reproducible builds.
+ARG TOOLSANDBOX_REF=main
+
+# Create an isolated venv for ToolSandbox with its pinned deps
+RUN python3.11 -m venv /opt/toolsandbox-venv \
+    && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \
+       "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
+       "httpx<0.28.0"
+
+# No ENTRYPOINT is set here: the subprocess runner is configured to launch
+# this script with /opt/toolsandbox-venv/bin/python (see the config snippet
+# in the header of this file).
+COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ToolSandbox NEL entrypoint.
+
+Patches the ToolSandbox agent/user factories to use NVIDIA-hosted models via
+any OpenAI-compatible endpoint, then delegates to the standard tool_sandbox CLI.
+
+Required environment variables:
+  NVIDIA_BASE_URL    – OpenAI-compatible endpoint base URL
+                       (e.g. https://integrate.api.nvidia.com/v1)
+  NVIDIA_API_KEY     – API key for both agent and user models
+  NVIDIA_AGENT_MODEL – Model name for the agent under evaluation
+
+Optional:
+  NVIDIA_USER_MODEL  – Model for user simulator
+                       (default: meta/llama-3.1-70b-instruct)
+
+CLI args (after patching) follow the standard tool_sandbox interface:
+  --agent Gorilla --user GPT_4_o_2024_05_13 [--scenarios ...] [--test_mode]
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+
+def _require_env(name: str) -> str:
+    """Return the whitespace-stripped value of environment variable *name*.
+
+    Raises:
+        RuntimeError: If the variable is unset or blank after stripping.
+    """
+    if value := os.environ.get(name, "").strip():
+        return value
+    raise RuntimeError(f"Required environment variable {name!r} is not set")
+
+
+def _register_nvidia_roles() -> None:
+    """Replace Gorilla agent and GPT-4o user with NVIDIA NIM-backed classes.
+
+    We reuse existing RoleImplType enum keys so the CLI accepts
+    ``--agent Gorilla --user GPT_4_o_2024_05_13`` without modification.
+
+    Reads ``NVIDIA_BASE_URL``, ``NVIDIA_API_KEY`` and ``NVIDIA_AGENT_MODEL``
+    (all required) plus the optional ``NVIDIA_USER_MODEL``.  As a side effect,
+    ``OPENAI_API_KEY`` is set to the NVIDIA key when it is not already present
+    in the environment.
+
+    Raises:
+        RuntimeError: If a required environment variable is unset or blank.
+    """
+    from openai import OpenAI
+    from tool_sandbox.cli.utils import AGENT_TYPE_TO_FACTORY, RoleImplType, USER_TYPE_TO_FACTORY
+    from tool_sandbox.roles.openai_api_agent import OpenAIAPIAgent
+    from tool_sandbox.roles.openai_api_user import OpenAIAPIUser
+
+    base_url = _require_env("NVIDIA_BASE_URL")
+    api_key = _require_env("NVIDIA_API_KEY")
+    agent_model = _require_env("NVIDIA_AGENT_MODEL")
+    # Treat a blank NVIDIA_USER_MODEL like an unset one (the same rule
+    # _require_env applies) so an empty value injected by a launcher falls back
+    # to the default instead of being used as the model name.
+    user_model = os.environ.get("NVIDIA_USER_MODEL", "").strip() or "meta/llama-3.1-70b-instruct"
+
+    # OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to create a temporary
+    # client that the subclasses below immediately replace.  Make sure the
+    # variable exists so the parent __init__ doesn't raise when it is absent.
+    # api_key is guaranteed non-empty by _require_env, so no further fallback
+    # is needed; an OPENAI_API_KEY that is already set is left untouched.
+    os.environ.setdefault("OPENAI_API_KEY", api_key)
+
+    def _client() -> OpenAI:
+        # Fresh client bound to the NVIDIA endpoint and key for each role instance.
+        return OpenAI(base_url=base_url, api_key=api_key)
+
+    class NVIDIANIMAgent(OpenAIAPIAgent):
+        # Model queried for the agent under evaluation.
+        model_name: str = agent_model
+
+        def __init__(self) -> None:
+            super().__init__()
+            # Swap the client created by the parent for the NVIDIA-backed one.
+            self.openai_client = _client()
+
+    class NVIDIANIMUser(OpenAIAPIUser):
+        # Model queried for the user simulator.
+        model_name: str = user_model
+
+        def __init__(self) -> None:
+            super().__init__()
+            # Swap the client created by the parent for the NVIDIA-backed one.
+            self.openai_client = _client()
+
+    # Overwrite the factories behind the existing enum keys so the unmodified
+    # CLI builds the NVIDIA-backed roles for these --agent/--user values.
+    AGENT_TYPE_TO_FACTORY[RoleImplType.Gorilla] = NVIDIANIMAgent
+    USER_TYPE_TO_FACTORY[RoleImplType.GPT_4_o_2024_05_13] = NVIDIANIMUser
+
+
+if __name__ == "__main__":
+    # Patch the role factories first, then hand control to the stock CLI and
+    # propagate its return value as the process exit status.
+    _register_nvidia_roles()
+    from tool_sandbox.cli import main as run_cli
+
+    exit_status = run_cli()
+    sys.exit(exit_status)
diff --git a/examples/configs/toolsandbox.yaml b/examples/configs/toolsandbox.yaml
@@ -0,0 +1,43 @@
+# Flavor: ToolSandbox — stateful multi-turn tool-use benchmark
+#
+# Evaluates an LLM's ability to use tools across stateful, multi-turn
+# conversations (Apple's ToolSandbox: https://github.com/apple/ToolSandbox).
+# Both the agent under evaluation AND the user simulator call the NVIDIA
+# Inference API — no OpenAI key required.
+#
+# Prerequisites:
+#   - INFERENCE_API_KEY set in environment
+#   - Docker running locally
+#   - toolsandbox-nel image built (once):
+#       docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+#
+# Run: nel eval run examples/configs/toolsandbox.yaml
+
+# "nemotron" is the service label referenced by solver.service below; the
+# model itself is selected by the "model" field.
+services:
+  nemotron:
+    type: api
+    url: https://inference-api.nvidia.com/v1/chat/completions
+    protocol: chat_completions
+    model: azure/openai/gpt-4o
+    # Interpolated from the INFERENCE_API_KEY environment variable (see Prerequisites)
+    api_key: ${INFERENCE_API_KEY}
+
+benchmarks:
+  - name: toolsandbox
+    params:
+      # Image tag produced by the docker build command listed under Prerequisites
+      image: toolsandbox-nel:latest
+      # Model used as user simulator (must be available on the same API)
+      user_model: azure/openai/gpt-4o
+      # Number of scenarios to run in parallel inside the container
+      parallel: 4
+      # Set to true to run only a small predefined subset for quick validation
+      test_mode: false
+      # Specific scenarios to run — omit or set [] to run the full suite
+      # scenarios: [wifi_off, cellular_off, make_call]
+    solver:
+      type: simple
+      service: nemotron
+    # NOTE(review): presumably seconds (7200 s = 2 h) — confirm against the runner
+    timeout: 7200.0
+
+output:
+  dir: ./results/toolsandbox
+  report: [markdown, json]
diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml
@@ -0,0 +1,56 @@
+# Flavor: ToolSandbox on SLURM (subprocess runner — no nested Docker)
+#
+# Uses the subprocess runner so ToolSandbox (pre-installed in
+# /opt/toolsandbox-venv inside the eval container) runs directly without
+# a nested Docker call.  Suitable for any SLURM cluster.
+#
+# Prerequisites:
+#   - INFERENCE_API_KEY set in environment (or in cluster.container_env)
+#   - toolsandbox-nel-combined squashfs on shared storage:
+#       docker build -f docker/Dockerfile.toolsandbox-combined \
+#           -t toolsandbox-nel-combined:latest .
+#       enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
+#           dockerd://toolsandbox-nel-combined:latest
+#   - SSH access to the SLURM login node
+#
+# Dry-run: nel eval run examples/configs/toolsandbox_slurm.yaml --dry-run
+# Submit:  nel eval run examples/configs/toolsandbox_slurm.yaml --submit
+
+# "nemotron" is the service label referenced by solver.service below; the
+# model itself is selected by the "model" field.
+services:
+  nemotron:
+    type: api
+    url: https://inference-api.nvidia.com/v1/chat/completions
+    protocol: chat_completions
+    model: azure/openai/gpt-4o
+    # Interpolated from the INFERENCE_API_KEY environment variable
+    api_key: ${INFERENCE_API_KEY}
+
+benchmarks:
+  - name: toolsandbox
+    params:
+      # Run ToolSandbox as a subprocess instead of a nested Docker container
+      runner: subprocess
+      # Interpreter of the isolated venv and the entrypoint script, both
+      # created by docker/Dockerfile.toolsandbox-combined
+      python_exe: /opt/toolsandbox-venv/bin/python
+      entrypoint: /opt/toolsandbox_entrypoint.py
+      user_model: meta/llama-3.1-70b-instruct
+      parallel: 8
+      test_mode: false
+    solver:
+      type: simple
+      service: nemotron
+    # NOTE(review): presumably seconds (14400 s = 4 h), i.e. equal to
+    # cluster.walltime below — confirm the job cannot be killed by SLURM
+    # before this timeout fires.
+    timeout: 14400.0
+
+output:
+  dir: ./results/toolsandbox_slurm
+  report: [markdown, json]
+
+cluster:
+  type: slurm
+  hostname: ${SLURM_LOGIN_HOST}
+  account: ${SLURM_ACCOUNT}
+  walltime: "04:00:00"
+  # Squashfs produced by the enroot import command listed under Prerequisites
+  eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh
+  # Presumably exported inside the job container; forwards the API key used
+  # by services.nemotron.api_key — verify against the SLURM executor.
+  container_env:
+    INFERENCE_API_KEY: ${INFERENCE_API_KEY}
+  node_pools:
+    default:
+      partition: cpu
+      nodes: 1
diff --git a/src/nemo_evaluator/benchmarks/__init__.py b/src/nemo_evaluator/benchmarks/__init__.py
@@ -33,5 +33,6 @@
 import nemo_evaluator.benchmarks.simpleqa  # noqa: F401
 import nemo_evaluator.benchmarks.terminal_bench_hard  # noqa: F401
 import nemo_evaluator.benchmarks.terminal_bench_v1  # noqa: F401
+import nemo_evaluator.benchmarks.toolsandbox  # noqa: F401
 import nemo_evaluator.benchmarks.triviaqa  # noqa: F401
 import nemo_evaluator.benchmarks.xstest  # noqa: F401