Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions docker/Dockerfile.toolsandbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# ToolSandbox evaluation container for NEL Next.
#
# Bundles Apple's ToolSandbox benchmark with a custom NVIDIA NIM agent/user
# that routes model calls through any OpenAI-compatible endpoint. The
# container's entrypoint is docker/toolsandbox_entrypoint.py, so any arguments
# passed to `docker run` are forwarded to that script.
#
# Build:
#   docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
#
# Pin to a specific commit for reproducible builds:
#   docker build -f docker/Dockerfile.toolsandbox \
#     --build-arg TOOLSANDBOX_REF=<commit-sha> \
#     -t toolsandbox-nel:<commit-sha> .
#
# Env vars read by the entrypoint at runtime (injected by
# ToolSandboxEnvironment.run_batch):
#   NVIDIA_BASE_URL    – OpenAI-compatible base URL (required)
#   NVIDIA_API_KEY     – API key (required)
#   NVIDIA_AGENT_MODEL – Model ID for the agent under evaluation (required)
#   NVIDIA_USER_MODEL  – Model ID for the user simulator (optional; the
#                        entrypoint defaults to meta/llama-3.1-70b-instruct)

FROM python:3.11-slim

# git is required because ToolSandbox is installed below from a git+https URL.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update \
&& apt-get install -y --no-install-recommends git \
&& rm -rf /var/lib/apt/lists/*

# Git ref (branch, tag, or commit SHA) of apple/ToolSandbox to install.
# Defaults to the moving `main` branch; override it for reproducible builds.
ARG TOOLSANDBOX_REF=main

# ToolSandbox pins openai==1.17.0 and other specific versions — install in
# a clean environment separate from any NEL dependencies.
# NOTE(review): the httpx<0.28.0 cap is presumably needed for compatibility
# with the pinned openai client — confirm before relaxing it.
RUN pip install --no-cache-dir \
"tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
"httpx<0.28.0"

# The entrypoint patches ToolSandbox's role factories and then runs its CLI.
COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py

ENTRYPOINT ["python", "/opt/toolsandbox_entrypoint.py"]
47 changes: 47 additions & 0 deletions docker/Dockerfile.toolsandbox-combined
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# ToolSandbox + NEL Next combined evaluation container.
#
# Used for SLURM runs where nested Docker is unavailable. ToolSandbox is
# installed in an isolated venv (/opt/toolsandbox-venv) so its pinned
# dependencies (openai==1.17.0, etc.) do not conflict with NEL Next.
#
# Build:
#   docker build -f docker/Dockerfile.toolsandbox-combined \
#     -t toolsandbox-nel-combined:latest .
#
# Build on a different base image or pin ToolSandbox to a commit:
#   docker build -f docker/Dockerfile.toolsandbox-combined \
#     --build-arg BASE_IMAGE=<image> \
#     --build-arg TOOLSANDBOX_REF=<commit-sha> \
#     -t toolsandbox-nel-combined:<commit-sha> .
#
# Convert to squashfs for SLURM (run on a login node with Docker):
#   enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
#     dockerd://toolsandbox-nel-combined:latest
#
# NEL Next config (subprocess runner):
#   benchmarks:
#     - name: toolsandbox
#       params:
#         runner: subprocess
#         python_exe: /opt/toolsandbox-venv/bin/python
#         entrypoint: /opt/toolsandbox_entrypoint.py
#
# Env vars read by the entrypoint at runtime:
#   NVIDIA_BASE_URL    – OpenAI-compatible base URL (required)
#   NVIDIA_API_KEY     – API key (required)
#   NVIDIA_AGENT_MODEL – model ID for the agent under evaluation (required)
#   NVIDIA_USER_MODEL  – model ID for user simulator (optional; the
#                        entrypoint defaults to meta/llama-3.1-70b-instruct)

# Declared before FROM so the base image can be overridden at build time.
ARG BASE_IMAGE=nemo-evaluator

FROM ${BASE_IMAGE}

# Install Python 3.11 for the ToolSandbox venv (avoids openai version conflicts
# with NEL Next which uses the system Python 3.12). git is installed as well
# because pip needs it to fetch ToolSandbox from the git+https URL below; the
# standalone Dockerfile.toolsandbox installs it for the same reason.
# NOTE(review): assumes the base image runs as root and that its apt sources
# provide python3.11 — confirm against the actual BASE_IMAGE.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git python3.11 python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Git ref (branch, tag, or commit SHA) of apple/ToolSandbox to install.
# Defaults to the moving `main` branch; override it for reproducible builds.
ARG TOOLSANDBOX_REF=main

# Create an isolated venv for ToolSandbox with its pinned deps
RUN python3.11 -m venv /opt/toolsandbox-venv \
    && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \
        "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
        "httpx<0.28.0"

# No ENTRYPOINT is set here: the NEL subprocess runner invokes this script
# with the venv interpreter (see the config snippet in the header).
COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
92 changes: 92 additions & 0 deletions docker/toolsandbox_entrypoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ToolSandbox NEL entrypoint.

Patches the ToolSandbox agent/user factories to use NVIDIA-hosted models via
any OpenAI-compatible endpoint, then delegates to the standard tool_sandbox CLI.

Required environment variables:
NVIDIA_BASE_URL – OpenAI-compatible endpoint base URL
(e.g. https://integrate.api.nvidia.com/v1)
NVIDIA_API_KEY – API key for both agent and user models
NVIDIA_AGENT_MODEL – Model name for the agent under evaluation

Optional:
NVIDIA_USER_MODEL – Model for user simulator
(default: meta/llama-3.1-70b-instruct)

CLI args (after patching) follow the standard tool_sandbox interface:
--agent Gorilla --user GPT_4_o_2024_05_13 [--scenarios ...] [--test_mode]
"""
from __future__ import annotations

import os
import sys


def _require_env(name: str) -> str:
val = os.environ.get(name, "").strip()
if not val:
raise RuntimeError(f"Required environment variable {name!r} is not set")
return val


def _register_nvidia_roles() -> None:
    """Replace Gorilla agent and GPT-4o user with NVIDIA NIM-backed classes.

    We reuse existing RoleImplType enum keys so the CLI accepts
    ``--agent Gorilla --user GPT_4_o_2024_05_13`` without modification.

    Environment variables read:
        NVIDIA_BASE_URL, NVIDIA_API_KEY, NVIDIA_AGENT_MODEL: required.
        NVIDIA_USER_MODEL: optional; when unset or blank the user simulator
            falls back to ``meta/llama-3.1-70b-instruct``.

    Side effects:
        Sets ``OPENAI_API_KEY`` in ``os.environ`` if it is not already present,
        and overwrites the ``Gorilla`` / ``GPT_4_o_2024_05_13`` entries of the
        ToolSandbox agent and user factory maps.

    Raises:
        RuntimeError: if any required environment variable is unset or blank.
    """
    # Imported lazily so the third-party packages are only needed when the
    # roles are actually registered.
    from openai import OpenAI
    from tool_sandbox.cli.utils import AGENT_TYPE_TO_FACTORY, RoleImplType, USER_TYPE_TO_FACTORY
    from tool_sandbox.roles.openai_api_agent import OpenAIAPIAgent
    from tool_sandbox.roles.openai_api_user import OpenAIAPIUser

    base_url = _require_env("NVIDIA_BASE_URL")
    api_key = _require_env("NVIDIA_API_KEY")
    agent_model = _require_env("NVIDIA_AGENT_MODEL")
    # A runner may inject NVIDIA_USER_MODEL as an empty string; treat that the
    # same as "unset" so the model name is never blank.
    user_model = (
        os.environ.get("NVIDIA_USER_MODEL", "").strip()
        or "meta/llama-3.1-70b-instruct"
    )

    # OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to create a temporary
    # client that NVIDIANIMAgent/NVIDIANIMUser immediately replace. Make sure
    # the variable exists so the parent __init__ doesn't raise when it is
    # absent. api_key is guaranteed non-empty by _require_env.
    os.environ.setdefault("OPENAI_API_KEY", api_key)

    def _client() -> OpenAI:
        # Fresh client per role instance, pointed at the configured endpoint.
        return OpenAI(base_url=base_url, api_key=api_key)

    class NVIDIANIMAgent(OpenAIAPIAgent):
        # Model under evaluation, taken from NVIDIA_AGENT_MODEL.
        model_name: str = agent_model

        def __init__(self) -> None:
            super().__init__()
            # Swap the parent's client for one that targets NVIDIA_BASE_URL.
            self.openai_client = _client()

    class NVIDIANIMUser(OpenAIAPIUser):
        # User-simulator model, taken from NVIDIA_USER_MODEL or the default.
        model_name: str = user_model

        def __init__(self) -> None:
            super().__init__()
            # Swap the parent's client for one that targets NVIDIA_BASE_URL.
            self.openai_client = _client()

    AGENT_TYPE_TO_FACTORY[RoleImplType.Gorilla] = NVIDIANIMAgent
    USER_TYPE_TO_FACTORY[RoleImplType.GPT_4_o_2024_05_13] = NVIDIANIMUser


if __name__ == "__main__":
    # Install the NVIDIA-backed role factories before handing control to the
    # ToolSandbox CLI, so ``--agent Gorilla --user GPT_4_o_2024_05_13`` pick
    # them up.
    _register_nvidia_roles()

    from tool_sandbox.cli import main

    # Use the CLI's return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
43 changes: 43 additions & 0 deletions examples/configs/toolsandbox.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Flavor: ToolSandbox — stateful multi-turn tool-use benchmark
#
# Evaluates an LLM's ability to use tools across stateful, multi-turn
# conversations (Apple's ToolSandbox: https://github.com/apple/ToolSandbox).
# Both the agent under evaluation AND the user simulator call the NVIDIA
# Inference API — no OpenAI key required.
#
# Prerequisites:
# - INFERENCE_API_KEY set in environment
# - Docker running locally
# - toolsandbox-nel image built (once):
# docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
#
# Run: nel eval run examples/configs/toolsandbox.yaml

services:
nemotron:
type: api
url: https://inference-api.nvidia.com/v1/chat/completions
protocol: chat_completions
model: azure/openai/gpt-4o
api_key: ${INFERENCE_API_KEY}

benchmarks:
- name: toolsandbox
params:
image: toolsandbox-nel:latest
# Model used as user simulator (must be available on the same API)
user_model: azure/openai/gpt-4o
# Number of scenarios to run in parallel inside the container
parallel: 4
# Set to true to run only a small predefined subset for quick validation
test_mode: false
# Specific scenarios to run — omit or set [] to run the full suite
# scenarios: [wifi_off, cellular_off, make_call]
solver:
type: simple
service: nemotron
timeout: 7200.0

output:
dir: ./results/toolsandbox
report: [markdown, json]
56 changes: 56 additions & 0 deletions examples/configs/toolsandbox_slurm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Flavor: ToolSandbox on SLURM (subprocess runner — no nested Docker)
#
# Uses the subprocess runner so ToolSandbox (pre-installed in
# /opt/toolsandbox-venv inside the eval container) runs directly without
# a nested Docker call. Suitable for any SLURM cluster.
#
# Prerequisites:
# - INFERENCE_API_KEY set in environment (or in cluster.container_env)
# - toolsandbox-nel-combined squashfs on shared storage:
# docker build -f docker/Dockerfile.toolsandbox-combined \
# -t toolsandbox-nel-combined:latest .
# enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
# dockerd://toolsandbox-nel-combined:latest
# - SSH access to the SLURM login node
#
# Dry-run: nel eval run examples/configs/toolsandbox_slurm.yaml --dry-run
# Submit: nel eval run examples/configs/toolsandbox_slurm.yaml --submit

services:
nemotron:
type: api
url: https://inference-api.nvidia.com/v1/chat/completions
protocol: chat_completions
model: azure/openai/gpt-4o
api_key: ${INFERENCE_API_KEY}

benchmarks:
- name: toolsandbox
params:
runner: subprocess
python_exe: /opt/toolsandbox-venv/bin/python
entrypoint: /opt/toolsandbox_entrypoint.py
user_model: meta/llama-3.1-70b-instruct
parallel: 8
test_mode: false
solver:
type: simple
service: nemotron
timeout: 14400.0

output:
dir: ./results/toolsandbox_slurm
report: [markdown, json]

cluster:
type: slurm
hostname: ${SLURM_LOGIN_HOST}
account: ${SLURM_ACCOUNT}
walltime: "04:00:00"
eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh
container_env:
INFERENCE_API_KEY: ${INFERENCE_API_KEY}
node_pools:
default:
partition: cpu
nodes: 1
1 change: 1 addition & 0 deletions src/nemo_evaluator/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,6 @@
import nemo_evaluator.benchmarks.simpleqa # noqa: F401
import nemo_evaluator.benchmarks.terminal_bench_hard # noqa: F401
import nemo_evaluator.benchmarks.terminal_bench_v1 # noqa: F401
import nemo_evaluator.benchmarks.toolsandbox # noqa: F401
import nemo_evaluator.benchmarks.triviaqa # noqa: F401
import nemo_evaluator.benchmarks.xstest # noqa: F401
Loading
Loading