diff --git a/docker/Dockerfile.toolsandbox b/docker/Dockerfile.toolsandbox
new file mode 100644
index 000000000..a86c176af
--- /dev/null
+++ b/docker/Dockerfile.toolsandbox
@@ -0,0 +1,36 @@
+# ToolSandbox evaluation container for NEL Next.
+#
+# Bundles Apple's ToolSandbox benchmark with a custom NVIDIA NIM agent/user
+# that routes model calls through any OpenAI-compatible endpoint.
+#
+# Build:
+#   docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+#
+# Pin to a specific commit for reproducible builds:
+#   docker build -f docker/Dockerfile.toolsandbox \
+#     --build-arg TOOLSANDBOX_REF=<commit-sha> \
+#     -t toolsandbox-nel:<commit-sha> .
+#
+# Required env vars at runtime (injected by ToolSandboxEnvironment.run_batch):
+#   NVIDIA_BASE_URL    – OpenAI-compatible base URL
+#   NVIDIA_API_KEY     – API key
+#   NVIDIA_AGENT_MODEL – Model ID for the agent under evaluation
+#   NVIDIA_USER_MODEL  – Model ID for the user simulator (optional; the entrypoint defaults to meta/llama-3.1-70b-instruct)
+
+FROM python:3.11-slim
+# git is needed because pip installs ToolSandbox straight from its GitHub repository (see below).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+# Branch, tag or commit of apple/ToolSandbox to install (override with --build-arg).
+ARG TOOLSANDBOX_REF=main
+
+# ToolSandbox pins openai==1.17.0 and other specific versions — install in
+# a clean environment separate from any NEL dependencies.
+RUN pip install --no-cache-dir \
+    "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
+    "httpx<0.28.0"
+
+COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
+# Exec-form entrypoint: arguments given after the image name are appended as CLI args.
+ENTRYPOINT ["python", "/opt/toolsandbox_entrypoint.py"]
diff --git a/docker/Dockerfile.toolsandbox-combined b/docker/Dockerfile.toolsandbox-combined
new file mode 100644
index 000000000..ab388fb70
--- /dev/null
+++ b/docker/Dockerfile.toolsandbox-combined
@@ -0,0 +1,47 @@
+# ToolSandbox + NEL Next combined evaluation container.
+#
+# Used for SLURM runs where nested Docker is unavailable.  ToolSandbox is
+# installed in an isolated venv (/opt/toolsandbox-venv) so its pinned
+# dependencies (openai==1.17.0, etc.) do not conflict with NEL Next.
+#
+# Build:
+#   docker build -f docker/Dockerfile.toolsandbox-combined \
+#     -t toolsandbox-nel-combined:latest .
+#
+# Convert to squashfs for SLURM (run on a login node with Docker):
+#   enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
+#     dockerd://toolsandbox-nel-combined:latest
+#
+# NEL Next config (subprocess runner):
+#   benchmarks:
+#     - name: toolsandbox
+#       params:
+#         runner: subprocess
+#         python_exe: /opt/toolsandbox-venv/bin/python
+#         entrypoint: /opt/toolsandbox_entrypoint.py
+#
+# Required env vars at runtime:
+#   NVIDIA_BASE_URL    – OpenAI-compatible base URL
+#   NVIDIA_API_KEY     – API key
+#   NVIDIA_AGENT_MODEL – model ID for the agent under evaluation
+#   NVIDIA_USER_MODEL  – model ID for the user simulator (optional; the entrypoint defaults to meta/llama-3.1-70b-instruct)
+
+ARG BASE_IMAGE=nemo-evaluator
+
+FROM ${BASE_IMAGE}
+
+# Install Python 3.11 for the ToolSandbox venv (avoids openai version conflicts with NEL Next, which uses the
+# system Python 3.12) plus git, which pip needs for the git+https install below and the base image may lack.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git python3.11 python3.11-venv \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG TOOLSANDBOX_REF=main
+
+# Create an isolated venv for ToolSandbox with its pinned deps
+RUN python3.11 -m venv /opt/toolsandbox-venv \
+    && /opt/toolsandbox-venv/bin/pip install --no-cache-dir \
+       "tool-sandbox[dev] @ git+https://github.com/apple/ToolSandbox.git@${TOOLSANDBOX_REF}" \
+       "httpx<0.28.0"
+
+COPY docker/toolsandbox_entrypoint.py /opt/toolsandbox_entrypoint.py
diff --git a/docker/toolsandbox_entrypoint.py b/docker/toolsandbox_entrypoint.py
new file mode 100644
index 000000000..44b25b1a6
--- /dev/null
+++ b/docker/toolsandbox_entrypoint.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ToolSandbox NEL entrypoint.
+
+Patches the ToolSandbox agent/user factories to use NVIDIA-hosted models via
+any OpenAI-compatible endpoint, then delegates to the standard tool_sandbox CLI.
+
+Required environment variables:
+  NVIDIA_BASE_URL    – OpenAI-compatible endpoint base URL
+                       (e.g. https://integrate.api.nvidia.com/v1)
+  NVIDIA_API_KEY     – API key for both agent and user models
+  NVIDIA_AGENT_MODEL – Model name for the agent under evaluation
+
+Optional:
+  NVIDIA_USER_MODEL  – Model for user simulator
+                       (default: meta/llama-3.1-70b-instruct)
+
+CLI args (after patching) follow the standard tool_sandbox interface:
+  --agent Gorilla --user GPT_4_o_2024_05_13 [--scenarios ...] [--test_mode]
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+
+def _require_env(name: str) -> str:
+    """Return env var *name*, stripped of whitespace; raise RuntimeError when it is unset or blank."""
+    if value := os.environ.get(name, "").strip():
+        return value
+    raise RuntimeError(f"Required environment variable {name!r} is not set")
+
+
+def _register_nvidia_roles() -> None:
+    """Replace Gorilla agent and GPT-4o user with NVIDIA NIM-backed classes.
+
+    We reuse existing RoleImplType enum keys so the CLI accepts ``--agent Gorilla --user GPT_4_o_2024_05_13``
+    without modification.  Raises RuntimeError if a required NVIDIA_* variable is unset or blank.
+    """
+    from openai import OpenAI
+    from tool_sandbox.cli.utils import AGENT_TYPE_TO_FACTORY, RoleImplType, USER_TYPE_TO_FACTORY
+    from tool_sandbox.roles.openai_api_agent import OpenAIAPIAgent
+    from tool_sandbox.roles.openai_api_user import OpenAIAPIUser
+
+    base_url = _require_env("NVIDIA_BASE_URL")
+    api_key = _require_env("NVIDIA_API_KEY")
+    agent_model = _require_env("NVIDIA_AGENT_MODEL")
+    user_model = os.environ.get("NVIDIA_USER_MODEL", "").strip() or "meta/llama-3.1-70b-instruct"
+
+    # OpenAIAPIAgent/User.__init__ reads OPENAI_API_KEY to create a temporary client that the subclasses below
+    # immediately replace.  Set a dummy placeholder (never the NVIDIA key, which must not be handed to a client
+    # built for another endpoint) so the parent __init__ doesn't raise when the env var is absent.
+    os.environ.setdefault("OPENAI_API_KEY", "not-used")
+
+    def _client() -> OpenAI:
+        return OpenAI(base_url=base_url, api_key=api_key)
+
+    class NVIDIANIMAgent(OpenAIAPIAgent):
+        model_name: str = agent_model
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.openai_client = _client()
+
+    class NVIDIANIMUser(OpenAIAPIUser):
+        model_name: str = user_model
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.openai_client = _client()
+
+    AGENT_TYPE_TO_FACTORY[RoleImplType.Gorilla] = NVIDIANIMAgent
+    USER_TYPE_TO_FACTORY[RoleImplType.GPT_4_o_2024_05_13] = NVIDIANIMUser
+
+
+if __name__ == "__main__":
+    _register_nvidia_roles()  # patch the role factories before handing control to the CLI
+    from tool_sandbox.cli import main
+
+    sys.exit(main())  # the CLI's return value becomes the process exit status
diff --git a/examples/configs/toolsandbox.yaml b/examples/configs/toolsandbox.yaml
new file mode 100644
index 000000000..6e5db70ec
--- /dev/null
+++ b/examples/configs/toolsandbox.yaml
@@ -0,0 +1,43 @@
+# Flavor: ToolSandbox — stateful multi-turn tool-use benchmark
+#
+# Evaluates an LLM's ability to use tools across stateful, multi-turn
+# conversations (Apple's ToolSandbox: https://github.com/apple/ToolSandbox).
+# Both the agent under evaluation AND the user simulator call the NVIDIA
+# Inference API — no OpenAI key required.
+#
+# Prerequisites:
+#   - INFERENCE_API_KEY set in environment
+#   - Docker running locally
+#   - toolsandbox-nel image built (once):
+#       docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+#
+# Run: nel eval run examples/configs/toolsandbox.yaml
+
+services:
+  nemotron:
+    type: api
+    url: https://inference-api.nvidia.com/v1/chat/completions
+    protocol: chat_completions
+    model: azure/openai/gpt-4o
+    api_key: ${INFERENCE_API_KEY}
+
+benchmarks:
+  - name: toolsandbox
+    params:
+      image: toolsandbox-nel:latest
+      # Model used as user simulator (must be available on the same API)
+      user_model: azure/openai/gpt-4o
+      # Number of scenarios to run in parallel inside the container
+      parallel: 4
+      # Set to true to run only a small predefined subset for quick validation
+      test_mode: false
+      # Specific scenarios to run — omit or set [] to run the full suite
+      # scenarios: [wifi_off, cellular_off, make_call]
+    solver:
+      type: simple
+      service: nemotron
+    timeout: 7200.0
+
+output:
+  dir: ./results/toolsandbox
+  report: [markdown, json]
diff --git a/examples/configs/toolsandbox_slurm.yaml b/examples/configs/toolsandbox_slurm.yaml
new file mode 100644
index 000000000..abc9842db
--- /dev/null
+++ b/examples/configs/toolsandbox_slurm.yaml
@@ -0,0 +1,56 @@
+# Flavor: ToolSandbox on SLURM (subprocess runner — no nested Docker)
+#
+# Uses the subprocess runner so ToolSandbox (pre-installed in
+# /opt/toolsandbox-venv inside the eval container) runs directly without
+# a nested Docker call.  Suitable for any SLURM cluster.
+#
+# Prerequisites:
+#   - INFERENCE_API_KEY set in environment (it is forwarded to the job via cluster.container_env)
+#   - toolsandbox-nel-combined squashfs on shared storage:
+#       docker build -f docker/Dockerfile.toolsandbox-combined \
+#           -t toolsandbox-nel-combined:latest .
+#       enroot import -o /shared/nel/toolsandbox-nel-combined.sqsh \
+#           dockerd://toolsandbox-nel-combined:latest
+#   - SSH access to the SLURM login node
+#
+# Dry-run: nel eval run examples/configs/toolsandbox_slurm.yaml --dry-run
+# Submit:  nel eval run examples/configs/toolsandbox_slurm.yaml --submit
+
+services:
+  nemotron:
+    type: api
+    url: https://inference-api.nvidia.com/v1/chat/completions
+    protocol: chat_completions
+    model: azure/openai/gpt-4o
+    api_key: ${INFERENCE_API_KEY}
+
+benchmarks:
+  - name: toolsandbox
+    params:
+      runner: subprocess
+      python_exe: /opt/toolsandbox-venv/bin/python
+      entrypoint: /opt/toolsandbox_entrypoint.py
+      user_model: meta/llama-3.1-70b-instruct
+      parallel: 8
+      test_mode: false
+    solver:
+      type: simple
+      service: nemotron
+    timeout: 14400.0
+
+output:
+  dir: ./results/toolsandbox_slurm
+  report: [markdown, json]
+
+cluster:
+  type: slurm
+  hostname: ${SLURM_LOGIN_HOST}
+  account: ${SLURM_ACCOUNT}
+  walltime: "04:00:00"  # NOTE(review): equal to the 14400 s benchmark timeout above, leaving no headroom — confirm
+  eval_image: ${SHARED_ROOT}/nel/toolsandbox-nel-combined.sqsh
+  container_env:
+    INFERENCE_API_KEY: ${INFERENCE_API_KEY}
+  node_pools:
+    default:
+      partition: cpu
+      nodes: 1
diff --git a/src/nemo_evaluator/benchmarks/__init__.py b/src/nemo_evaluator/benchmarks/__init__.py
index 3b2f9cd4b..de2b3879b 100644
--- a/src/nemo_evaluator/benchmarks/__init__.py
+++ b/src/nemo_evaluator/benchmarks/__init__.py
@@ -33,5 +33,6 @@
 import nemo_evaluator.benchmarks.simpleqa  # noqa: F401
 import nemo_evaluator.benchmarks.terminal_bench_hard  # noqa: F401
 import nemo_evaluator.benchmarks.terminal_bench_v1  # noqa: F401
+import nemo_evaluator.benchmarks.toolsandbox  # noqa: F401
 import nemo_evaluator.benchmarks.triviaqa  # noqa: F401
 import nemo_evaluator.benchmarks.xstest  # noqa: F401
diff --git a/src/nemo_evaluator/benchmarks/toolsandbox.py b/src/nemo_evaluator/benchmarks/toolsandbox.py
new file mode 100644
index 000000000..96dc67902
--- /dev/null
+++ b/src/nemo_evaluator/benchmarks/toolsandbox.py
@@ -0,0 +1,340 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ToolSandbox benchmark — Apple's stateful multi-turn tool-use evaluation.
+
+Registers ``toolsandbox`` as a built-in benchmark.  Bypasses the standard
+seed/solve/verify loop via ``run_batch()``, which runs ToolSandbox in one of
+three modes and parses the resulting ``result_summary.json``.
+
+Runner modes
+------------
+docker (default)
+    Spawns a pre-built Docker image.  Requires Docker on the eval host.
+
+    Build the image once::
+
+        docker build -f docker/Dockerfile.toolsandbox -t toolsandbox-nel:latest .
+
+apptainer
+    Same image as ``docker`` mode but executed via ``apptainer run``.
+    Use on SLURM clusters where Docker is unavailable.  ``image`` should be
+    a path to a ``.sif`` or ``.sqsh`` file on the shared filesystem.
+
+subprocess
+    Runs the ToolSandbox entrypoint directly as a Python subprocess — no
+    container needed.  Use when the eval container already has ToolSandbox
+    pre-installed (e.g. ``Dockerfile.toolsandbox-combined``).  Set
+    ``python_exe`` to the venv Python that has ToolSandbox, and
+    ``entrypoint`` to the patch script path.
+
+Config usage::
+
+    benchmarks:
+      - name: toolsandbox
+        params:
+          # --- runner selection ---
+          runner: docker                           # docker | apptainer | subprocess
+          image: toolsandbox-nel:latest            # docker image name / sif path
+          # --- subprocess-mode overrides ---
+          python_exe: /opt/toolsandbox-venv/bin/python
+          entrypoint: /opt/toolsandbox_entrypoint.py
+          # --- benchmark settings ---
+          user_model: meta/llama-3.1-70b-instruct  # user-simulator model
+          parallel: 4                              # concurrent scenarios
+          test_mode: false                         # true = small subset only
+          scenarios: []                            # [] = all scenarios
+        solver:
+          type: simple
+          service: my_model
+        timeout: 7200.0
+
+Both agent and user simulator call the same OpenAI-compatible endpoint — no OpenAI key required.  The API key is
+read from the run config (``api_key``), falling back to the ``NVIDIA_API_KEY`` environment variable.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import sys
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal
+
+from nemo_evaluator.environments.base import EvalEnvironment, SeedResult, VerifyResult
+from nemo_evaluator.environments.registry import register
+
+if TYPE_CHECKING:
+    from nemo_evaluator.sandbox.base import Sandbox
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_IMAGE = "toolsandbox-nel:latest"
+_DEFAULT_USER_MODEL = "meta/llama-3.1-70b-instruct"
+_DEFAULT_BASE_URL = "https://integrate.api.nvidia.com/v1"
+_DEFAULT_ENTRYPOINT = "/opt/toolsandbox_entrypoint.py"
+_CONTAINER_OUTPUT = "/output"
+
+_CLI_AGENT = "Gorilla"
+_CLI_USER = "GPT_4_o_2024_05_13"
+
+
+def _to_openai_base_url(url: str) -> str:
+    """Strip trailing slashes and a /chat/completions, /completions or /responses suffix from a NEL service URL."""
+    base = url.rstrip("/")  # normalise first so ".../chat/completions/" is recognised as well
+    # The first matching suffix wins; "" means the URL already is a base URL and nothing is cut off.
+    matched = next((s for s in ("/chat/completions", "/completions", "/responses") if base.endswith(s)), "")
+    return base[: len(base) - len(matched)].rstrip("/")
+
+
+@register("toolsandbox")
+class ToolSandboxEnvironment(EvalEnvironment):
+    """Runs ToolSandbox and parses aggregate metrics.
+
+    The entire scenario suite executes as a single batch.
+    ``seed()`` and ``verify()`` are not used.
+    """
+
+    def __init__(
+        self,
+        runner: Literal["docker", "apptainer", "subprocess"] = "docker",
+        image: str = _DEFAULT_IMAGE,
+        python_exe: str | None = None,
+        entrypoint: str = _DEFAULT_ENTRYPOINT,
+        user_model: str = _DEFAULT_USER_MODEL,
+        scenarios: list[str] | None = None,
+        parallel: int = 4,
+        timeout: float = 7200.0,
+        test_mode: bool = False,
+    ) -> None:
+        super().__init__()
+        if runner not in ("docker", "apptainer", "subprocess"):  # a typo must not silently fall back to docker
+            raise ValueError(f"Unknown ToolSandbox runner {runner!r}; expected docker, apptainer or subprocess")
+        self._runner = runner
+        self._image = image
+        self._python_exe = python_exe or sys.executable
+        self._entrypoint = entrypoint
+        self._user_model = user_model
+        self._scenarios: list[str] = scenarios or []
+        self._parallel = parallel
+        self._timeout = timeout
+        self._test_mode = test_mode
+
+    # ------------------------------------------------------------------
+    # EvalEnvironment interface
+    # ------------------------------------------------------------------
+
+    async def dataset_size(self) -> int:
+        return 0  # no per-sample dataset: everything runs through run_batch()
+
+    async def seed(self, idx: int) -> SeedResult:
+        raise NotImplementedError("ToolSandboxEnvironment uses run_batch()")
+
+    async def verify(
+        self,
+        response: str,
+        expected: str,
+        sandbox: "Sandbox | None" = None,
+        **metadata: Any,
+    ) -> VerifyResult:
+        raise NotImplementedError("ToolSandboxEnvironment uses run_batch()")
+
+    # ------------------------------------------------------------------
+    # Batch execution
+    # ------------------------------------------------------------------
+
+    async def run_batch(self, solver: Any = None, config: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Run the whole ToolSandbox suite once with the configured runner and return the parsed result dict."""
+        config = config or {}
+        model_url = config.get("base_url", "") or _DEFAULT_BASE_URL
+        model_id = config.get("model", "")
+        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY", "")
+
+        base_url = _to_openai_base_url(model_url)
+        # Containers may write root-owned files here; a failed cleanup must not discard the results (Python 3.10+).
+        with tempfile.TemporaryDirectory(prefix="nel_toolsandbox_", ignore_cleanup_errors=True) as tmpdir:
+            output_dir = Path(tmpdir) / "output"
+            output_dir.mkdir()
+
+            cmd, env = self._build_cmd(output_dir, base_url, model_id, api_key)
+            logger.info("Launching ToolSandbox (%s): %s", self._runner, " ".join(cmd[:10]) + " ...")
+
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+                env=env,
+            )
+            try:
+                _, stderr = await asyncio.wait_for(proc.communicate(), timeout=self._timeout)
+            except asyncio.TimeoutError:
+                proc.kill()
+                await proc.wait()
+                logger.error("ToolSandbox timed out after %.0fs", self._timeout)
+                stderr = b"timeout"
+
+            rc = proc.returncode or 0
+            if rc != 0:  # log the tail of stderr: a traceback ends with the actual error
+                logger.error("ToolSandbox exited %d:\n%s", rc, (stderr or b"").decode(errors="replace")[-2000:])
+
+            return self._parse_results(output_dir, rc, model_id)
+
+    # ------------------------------------------------------------------
+    # Command builders
+    # ------------------------------------------------------------------
+
+    def _build_cmd(
+        self,
+        output_dir: Path,
+        base_url: str,
+        model_id: str,
+        api_key: str,
+    ) -> tuple[list[str], dict[str, str] | None]:
+        """Return (cmd, env) for the selected runner."""
+        if self._runner == "subprocess":
+            return self._build_subprocess_cmd(output_dir, base_url, model_id, api_key)
+        if self._runner == "apptainer":
+            return self._build_apptainer_cmd(output_dir, base_url, model_id, api_key), None
+        return self._build_docker_cmd(output_dir, base_url, model_id, api_key), None
+
+    def _toolsandbox_cli_args(self, output_dir_str: str) -> list[str]:
+        """Build the tool_sandbox CLI arguments; ``test_mode`` wins over an explicit scenario list."""
+        args = ["--agent", _CLI_AGENT, "--user", _CLI_USER]
+        args += ["--output_dir", output_dir_str, "--parallel", str(self._parallel)]
+        if self._test_mode:
+            args.append("--test_mode")
+        elif self._scenarios:
+            args.extend(["--scenarios", *self._scenarios])
+        return args
+
+    def _container_env_flags(self, base_url: str, model_id: str, api_key: str) -> list[str]:
+        """Return docker-style ``-e NAME=value`` flag pairs; the API key pair is omitted when *api_key* is empty."""
+        flags = [
+            "-e", f"NVIDIA_BASE_URL={base_url}",
+            "-e", f"NVIDIA_AGENT_MODEL={model_id}",
+            "-e", f"NVIDIA_USER_MODEL={self._user_model}",
+        ]
+        if api_key:  # NOTE(review): the key lands on the runner's command line — confirm that is acceptable
+            flags.extend(["-e", f"NVIDIA_API_KEY={api_key}"])
+        return flags
+
+    def _build_docker_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]:
+        """Return the ``docker run`` command line, with *output_dir* mounted at the container output path."""
+        cmd = ["docker", "run", "--rm", "-v", f"{output_dir}:{_CONTAINER_OUTPUT}"]
+        cmd.extend(self._container_env_flags(base_url, model_id, api_key))
+        cmd.append(self._image)
+        cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT))
+        return cmd
+
+    def _build_apptainer_cmd(self, output_dir: Path, base_url: str, model_id: str, api_key: str) -> list[str]:
+        """Return the ``apptainer run`` command line; the docker-style env pairs are re-emitted as ``--env``."""
+        env_flags: list[str] = []
+        for assignment in self._container_env_flags(base_url, model_id, api_key)[1::2]:  # skip each "-e"
+            env_flags.extend(["--env", assignment])
+
+        cmd = [
+            "apptainer", "run", "--bind", f"{output_dir}:{_CONTAINER_OUTPUT}",
+            *env_flags,
+            self._image,
+        ]
+        cmd.extend(self._toolsandbox_cli_args(_CONTAINER_OUTPUT))
+        return cmd
+
+    def _build_subprocess_cmd(
+        self, output_dir: Path, base_url: str, model_id: str, api_key: str
+    ) -> tuple[list[str], dict[str, str]]:
+        """Return (cmd, env); settings travel in a copy of os.environ and an empty *api_key* leaves it untouched."""
+        env = {
+            **os.environ,
+            "NVIDIA_BASE_URL": base_url,
+            "NVIDIA_AGENT_MODEL": model_id,
+            "NVIDIA_USER_MODEL": self._user_model,
+        }
+        if api_key:
+            env["NVIDIA_API_KEY"] = api_key
+
+        cmd = [self._python_exe, self._entrypoint]
+        cmd.extend(self._toolsandbox_cli_args(str(output_dir)))
+        return cmd, env
+
+    # ------------------------------------------------------------------
+    # Results parsing
+    # ------------------------------------------------------------------
+
+    def _parse_results(self, output_dir: Path, exit_code: int, model_id: str) -> dict[str, Any]:
+        """Assemble the result dict (scores, run config, runner exit code) from the files under *output_dir*."""
+        summary = self._load_result_summary(output_dir)
+        scores = self._extract_scores(summary)
+
+        return {
+            "benchmark": {
+                "name": self.name,
+                "samples": len(summary.get("per_scenario_results") or []),
+                "scores": scores,
+            },
+            "config": {
+                "benchmark": self.name,
+                "runner": self._runner,
+                "image": self._image if self._runner != "subprocess" else None,
+                "model": model_id,
+                "user_model": self._user_model,
+                "framework": "toolsandbox",
+                "scenarios": self._scenarios or "all",
+                "test_mode": self._test_mode,
+            },
+            "_container_exit_code": exit_code,
+        }
+
+    def _load_result_summary(self, output_dir: Path) -> dict[str, Any]:
+        """Return the first parseable ``result_summary.json`` under *output_dir* (sorted path order), else ``{}``."""
+        for candidate in sorted(output_dir.rglob("result_summary.json")):
+            try:
+                return json.loads(candidate.read_text(encoding="utf-8"))
+            except (json.JSONDecodeError, OSError) as exc:
+                logger.warning("Could not parse %s: %s", candidate, exc)
+        logger.warning("No result_summary.json found in %s", output_dir)
+        return {}
+
+    @staticmethod
+    def _extract_scores(summary: dict[str, Any]) -> dict[str, Any]:
+        """Extract scores from ToolSandbox result_summary.json.
+
+        Real format (confirmed from smoke test):
+          category_aggregated_results:
+            ALL_CATEGORIES: {similarity: float, turn_count: float}
+            STATE_DEPENDENCY: {similarity: float, turn_count: float}
+            ...
+        """
+        scores: dict[str, Any] = {}
+
+        cat_results: dict[str, Any] = summary.get("category_aggregated_results") or {}
+
+        # Overall score comes from the ALL_CATEGORIES aggregate
+        all_cat = cat_results.get("ALL_CATEGORIES") or {}
+        if "similarity" in all_cat:
+            scores["similarity"] = {"value": round(float(all_cat["similarity"]), 4)}
+        if "turn_count" in all_cat:
+            scores["turn_count"] = {"value": round(float(all_cat["turn_count"]), 2)}
+
+        # Per-category breakdown (skip ALL_CATEGORIES to avoid duplication)
+        for cat_name, cat_data in cat_results.items():
+            if cat_name == "ALL_CATEGORIES":
+                continue
+            if isinstance(cat_data, dict) and "similarity" in cat_data:
+                scores[f"per_category/{cat_name}/similarity"] = {"value": round(float(cat_data["similarity"]), 4)}
+
+        return scores
diff --git a/tests/test_environments/test_toolsandbox.py b/tests/test_environments/test_toolsandbox.py
new file mode 100644
index 000000000..642a97d38
--- /dev/null
+++ b/tests/test_environments/test_toolsandbox.py
@@ -0,0 +1,351 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Offline tests for ToolSandboxEnvironment (no Docker, no network)."""
+
+from __future__ import annotations
+
+import json
+import pytest
+
+from nemo_evaluator.benchmarks.toolsandbox import ToolSandboxEnvironment, _to_openai_base_url
+
+
+# ---------------------------------------------------------------------------
+# URL normalization
+# ---------------------------------------------------------------------------
+
+
+# Each case carries its own id, so a failing run names the scenario directly.
+@pytest.mark.parametrize(
+    "url, expected",
+    [
+        pytest.param(
+            "https://integrate.api.nvidia.com/v1/chat/completions",
+            "https://integrate.api.nvidia.com/v1",
+            id="strip_chat_completions",
+        ),
+        pytest.param(
+            "https://integrate.api.nvidia.com/v1/completions",
+            "https://integrate.api.nvidia.com/v1",
+            id="strip_completions",
+        ),
+        pytest.param(
+            "https://integrate.api.nvidia.com/v1/responses",
+            "https://integrate.api.nvidia.com/v1",
+            id="strip_responses",
+        ),
+        pytest.param(
+            "https://integrate.api.nvidia.com/v1",
+            "https://integrate.api.nvidia.com/v1",
+            id="no_op",
+        ),
+        pytest.param(
+            "http://localhost:8000/v1/chat/completions",
+            "http://localhost:8000/v1",
+            id="localhost_with_suffix",
+        ),
+        pytest.param(
+            "http://localhost:8000/v1",
+            "http://localhost:8000/v1",
+            id="localhost_base",
+        ),
+    ],
+)
+def test_to_openai_base_url(url: str, expected: str) -> None:
+    """The endpoint suffix is removed and a bare base URL passes through unchanged."""
+    assert _to_openai_base_url(url) == expected
+
+
+# ---------------------------------------------------------------------------
+# ToolSandboxEnvironment construction and defaults
+# ---------------------------------------------------------------------------
+
+
+def test_default_construction() -> None:
+    """A bare constructor call selects the docker runner with the documented defaults."""
+    default_env = ToolSandboxEnvironment()
+    assert (default_env._runner, default_env._image) == ("docker", "toolsandbox-nel:latest")
+    assert default_env._user_model == "meta/llama-3.1-70b-instruct"
+    assert default_env._scenarios == []
+    assert default_env._parallel == 4
+    assert not default_env._test_mode
+
+
+def test_custom_params() -> None:
+    """Explicit constructor arguments are stored on the instance."""
+    custom = ToolSandboxEnvironment(
+        runner="subprocess",
+        image="toolsandbox-nel:v1.2",
+        python_exe="/opt/toolsandbox-venv/bin/python",
+        user_model="meta/llama-3.1-8b-instruct",
+        scenarios=["wifi_off", "cellular_off"],
+        parallel=8,
+        test_mode=True,
+    )
+    assert custom._runner == "subprocess"
+    assert custom._python_exe == "/opt/toolsandbox-venv/bin/python"
+    assert custom._scenarios == ["wifi_off", "cellular_off"]
+    assert custom._parallel == 8 and custom._test_mode
+
+
+# ---------------------------------------------------------------------------
+# Docker command construction
+# ---------------------------------------------------------------------------
+
+from pathlib import Path as _P  # noqa: E402  (kept next to the command-construction tests that use it)
+
+
+def test_docker_cmd_no_scenarios() -> None:
+    """Without scenarios or test_mode the docker command carries only the fixed role flags."""
+    docker_env = ToolSandboxEnvironment(runner="docker")
+    argv, _ = docker_env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="nvidia/nemotron-3-super-120b-a12b",
+        api_key="test-key",
+    )
+    assert argv[0] == "docker"
+    for expected_token in ("--agent", "Gorilla", "--user", "GPT_4_o_2024_05_13"):
+        assert expected_token in argv
+    for absent_flag in ("--scenarios", "--test_mode"):
+        assert absent_flag not in argv
+    # The agent model travels as an environment flag rather than as a CLI argument.
+    assert "NVIDIA_AGENT_MODEL=nvidia/nemotron-3-super-120b-a12b" in " ".join(argv)
+
+
+def test_docker_cmd_with_scenarios() -> None:
+    """Scenario names follow ``--scenarios`` in the order they were configured."""
+    docker_env = ToolSandboxEnvironment(runner="docker", scenarios=["wifi_off", "make_call"])
+    argv, _ = docker_env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="key",
+    )
+    first = argv.index("--scenarios") + 1
+    assert argv[first : first + 2] == ["wifi_off", "make_call"]
+
+
+def test_docker_cmd_test_mode() -> None:
+    """``test_mode`` adds ``--test_mode`` and no scenario list."""
+    docker_env = ToolSandboxEnvironment(runner="docker", test_mode=True)
+    argv, _ = docker_env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="key",
+    )
+    assert "--test_mode" in argv and "--scenarios" not in argv
+
+
+def test_docker_cmd_no_api_key() -> None:
+    """An empty API key produces no NVIDIA_API_KEY flag at all."""
+    argv, _ = ToolSandboxEnvironment(runner="docker")._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="",
+    )
+    assert not any("NVIDIA_API_KEY" in token for token in argv)
+
+
+# ---------------------------------------------------------------------------
+# Apptainer command construction
+# ---------------------------------------------------------------------------
+
+
+def test_apptainer_cmd_basics() -> None:
+    """The apptainer runner binds the output directory and runs the configured image file."""
+    sif_env = ToolSandboxEnvironment(runner="apptainer", image="/shared/nel/toolsandbox.sif")
+    argv, _ = sif_env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="key",
+    )
+    assert argv[0] == "apptainer"
+    assert "/shared/nel/toolsandbox.sif" in argv
+    # Sub-command, bind mount and the forwarded ToolSandbox arguments must all be present.
+    for expected_token in ("run", "--bind", "--agent"):
+        assert expected_token in argv
+
+# ---------------------------------------------------------------------------
+# Subprocess command construction
+# ---------------------------------------------------------------------------
+
+
+def test_subprocess_cmd_basics() -> None:
+    env = ToolSandboxEnvironment(
+        runner="subprocess",
+        python_exe="/opt/toolsandbox-venv/bin/python",
+        entrypoint="/opt/toolsandbox_entrypoint.py",
+    )
+    cmd, env_vars = env._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="test-key",
+    )
+    assert cmd[0] == "/opt/toolsandbox-venv/bin/python"
+    assert cmd[1] == "/opt/toolsandbox_entrypoint.py"
+    assert "--agent" in cmd
+    assert "Gorilla" in cmd
+    # API config passed via environment, not CLI flags
+    assert env_vars["NVIDIA_BASE_URL"] == "https://integrate.api.nvidia.com/v1"
+    assert env_vars["NVIDIA_AGENT_MODEL"] == "test-model"
+    assert env_vars["NVIDIA_API_KEY"] == "test-key"
+    # output_dir is a local host path, not the container-mount path /output
+    assert "/tmp/output" in " ".join(cmd)
+    assert "/output" not in cmd
+
+
+def test_subprocess_cmd_no_api_key_does_not_override() -> None:
+    """An empty ``api_key`` must leave whatever NVIDIA_API_KEY the process already has."""
+    import os as _os
+    _, child_env = ToolSandboxEnvironment(runner="subprocess")._build_cmd(
+        output_dir=_P("/tmp/output"),
+        base_url="https://integrate.api.nvidia.com/v1",
+        model_id="test-model",
+        api_key="",
+    )
+    # The ambient value is read only after the build. "" stands in for "unset" on
+    # both sides: a pre-existing key must come through unchanged, and an absent key
+    # must not appear with a non-empty value.
+    inherited_key = _os.environ.get("NVIDIA_API_KEY", "")
+    assert child_env.get("NVIDIA_API_KEY", "") == inherited_key
+
+
+# ---------------------------------------------------------------------------
+# Score extraction
+# ---------------------------------------------------------------------------
+
+
+def test_extract_scores_full() -> None:
+    """Headline scores mirror ALL_CATEGORIES; other categories get per_category keys."""
+    # Real format confirmed by smoke test
+    by_category = {
+        "ALL_CATEGORIES": {"similarity": 0.72, "turn_count": 4.3},
+        "STATE_DEPENDENCY": {"similarity": 0.91, "turn_count": 3.1},
+        "MULTIPLE_TOOL_CALL": {"similarity": 0.61, "turn_count": 5.0},
+    }
+    payload = {"per_scenario_results": [], "category_aggregated_results": by_category}
+    extracted = ToolSandboxEnvironment._extract_scores(payload)
+
+    state_dep_key = "per_category/STATE_DEPENDENCY/similarity"
+    assert extracted["similarity"]["value"] == pytest.approx(0.72, abs=1e-4)
+    assert extracted["turn_count"]["value"] == pytest.approx(4.3, abs=1e-2)
+    assert state_dep_key in extracted
+    assert extracted[state_dep_key]["value"] == pytest.approx(0.91, abs=1e-4)
+    assert "per_category/MULTIPLE_TOOL_CALL/similarity" in extracted
+    # ALL_CATEGORIES is not duplicated as a per_category entry
+    assert "per_category/ALL_CATEGORIES/similarity" not in extracted
+
+
+def test_extract_scores_empty_summary() -> None:
+    """A summary with no keys yields an empty score mapping."""
+    assert ToolSandboxEnvironment._extract_scores({}) == {}
+
+
+def test_extract_scores_no_categories() -> None:
+    """With only ALL_CATEGORIES present, just the headline similarity is reported."""
+    only_overall = {"category_aggregated_results": {"ALL_CATEGORIES": {"similarity": 0.5}}}
+    assert ToolSandboxEnvironment._extract_scores(only_overall) == {"similarity": {"value": 0.5}}
+
+
+def test_extract_scores_category_missing_similarity() -> None:
+    """A category lacking ``similarity`` produces no per_category similarity entry."""
+    by_category = {
+        "ALL_CATEGORIES": {"similarity": 0.8, "turn_count": 3.0},
+        "STATE_DEPENDENCY": {"turn_count": 3.0},  # no similarity
+    }
+    extracted = ToolSandboxEnvironment._extract_scores({"category_aggregated_results": by_category})
+    assert "per_category/STATE_DEPENDENCY/similarity" not in extracted
+    # The headline score from ALL_CATEGORIES is still expected.
+    assert extracted["similarity"]["value"] == pytest.approx(0.8)
+
+
+# ---------------------------------------------------------------------------
+# Result summary loading (from temp directory)
+# ---------------------------------------------------------------------------
+
+
+def test_load_result_summary(tmp_path) -> None:
+    """``result_summary.json`` inside a run sub-directory is found and parsed."""
+    env = ToolSandboxEnvironment()
+    # Simulate ToolSandbox output structure
+    run_dir = tmp_path / "agent_Gorilla_user_GPT_4_o_20240513_12345"
+    run_dir.mkdir()
+    summary_data = {"similarity": 0.65, "turn_count": 3.1}
+    (run_dir / "result_summary.json").write_text(json.dumps(summary_data))
+
+    result = env._load_result_summary(tmp_path)
+    assert result == summary_data
+
+
+def test_load_result_summary_missing(tmp_path) -> None:
+    """An output directory without any summary file loads as an empty dict."""
+    loaded = ToolSandboxEnvironment()._load_result_summary(tmp_path)
+    assert loaded == {}
+
+
+# ---------------------------------------------------------------------------
+# Bundle structure
+# ---------------------------------------------------------------------------
+
+
+def test_parse_results_bundle_keys(tmp_path) -> None:
+    """Parsed bundle exposes benchmark name/samples/scores, run config and exit code."""
+    sandbox = ToolSandboxEnvironment()
+    summary_payload = {
+        "per_scenario_results": [{"name": "s1"}, {"name": "s2"}],
+        "category_aggregated_results": {"ALL_CATEGORIES": {"similarity": 0.7, "turn_count": 5.0}},
+    }
+    # Lay out <tmp_path>/run_1/result_summary.json as the input for _parse_results.
+    run_dir = tmp_path / "run_1"
+    run_dir.mkdir()
+    (run_dir / "result_summary.json").write_text(json.dumps(summary_payload))
+
+    bundle = sandbox._parse_results(tmp_path, exit_code=0, model_id="my-model")
+
+    # Top-level sections first, then the contents of each.
+    assert "benchmark" in bundle
+    assert "config" in bundle
+    benchmark, config = bundle["benchmark"], bundle["config"]
+    assert benchmark["name"] == "toolsandbox"
+    assert benchmark["samples"] == 2  # two entries were written to per_scenario_results
+    assert benchmark["scores"]["similarity"]["value"] == pytest.approx(0.7)
+    assert config["framework"] == "toolsandbox"
+    assert config["runner"] == "docker"  # constructor was called without a runner argument
+    assert config["model"] == "my-model"
+    assert bundle["_container_exit_code"] == 0
+
+
+# ---------------------------------------------------------------------------
+# seed/verify raise NotImplementedError
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+async def test_seed_raises() -> None:
+    sandbox = ToolSandboxEnvironment()  # built outside the block so only seed() is guarded
+    with pytest.raises(NotImplementedError):
+        await sandbox.seed(0)
+
+
+@pytest.mark.asyncio
+async def test_verify_raises() -> None:
+    sandbox = ToolSandboxEnvironment()  # built outside the block so only verify() is guarded
+    with pytest.raises(NotImplementedError):
+        await sandbox.verify("response", "expected")