1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -35,6 +35,7 @@ repos:
- { id: uv-lock, name: "uv-lock-adk", args: [--project, packages/nvidia_nat_adk], files: "packages/nvidia_nat_adk/pyproject.toml" }
- { id: uv-lock, name: "uv-lock-agno", args: [--project, packages/nvidia_nat_agno], files: "packages/nvidia_nat_agno/pyproject.toml" }
- { id: uv-lock, name: "uv-lock-autogen", args: [--project, packages/nvidia_nat_autogen], files: "packages/nvidia_nat_autogen/pyproject.toml" }
- { id: uv-lock, name: "uv-lock-benchmarks", args: [--project, packages/nvidia_nat_benchmarks], files: "packages/nvidia_nat_benchmarks/pyproject.toml" }
- { id: uv-lock, name: "uv-lock-core", args: [--project, packages/nvidia_nat_core], files: "packages/nvidia_nat_core/pyproject.toml" }
- { id: uv-lock, name: "uv-lock-crewai", args: [--project, packages/nvidia_nat_crewai], files: "packages/nvidia_nat_crewai/pyproject.toml" }
- { id: uv-lock, name: "uv-lock-data_flywheel", args: [--project, packages/nvidia_nat_data_flywheel], files: "packages/nvidia_nat_data_flywheel/pyproject.toml" }
4 changes: 4 additions & 0 deletions ci/vale/styles/config/vocabularies/nat/accept.txt
@@ -20,6 +20,7 @@ Authlib
[Bb]ackoff
[Bb]ackpressure
[Bb]atcher
BFCL
[Bb]oolean
Brev
[Cc]allable(s?)
@@ -55,6 +56,7 @@ Datadog
[Dd]atastore
DB(s?)
[Dd]eclaratively
[Dd]eduplic(ate|ated|ation)
[Dd]enylist
[Dd]eserialize
[Dd]ev
@@ -181,11 +183,13 @@ SSE
[Ss]ubgraph(s?)
[Ss]ubpackage(s?)
[Ss]ubsampl(e|ing)
[Ss]ubstring(s?)
[Ss]ubtask(s?)
[Ss]ubword(s?)
[Ss]uperset(s?)
Tavily
[Tt]eardown
[Tt]elecom
[Tt]imestamp(s?)
[Tt]okenization
[Tt]oolchain
21 changes: 21 additions & 0 deletions docs/source/improve-workflows/evaluate.md
@@ -1021,6 +1021,27 @@ eval:
max_concurrency: 4
```

### Shuffling evaluation order

By default, evaluation items are processed in the order they appear in the dataset. To randomize the order, enable `shuffle` in the `eval.general` section. This can help reduce ordering bias in evaluation results, for example when KV cache or prefix-sharing optimizations favor items that are processed together.

```yaml
eval:
general:
shuffle: true
```

For reproducible shuffling across runs, set `shuffle_seed` to a fixed integer:

```yaml
eval:
general:
shuffle: true
shuffle_seed: 42
```

When `shuffle_seed` is omitted, a different random order is used each run.
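The effect of `shuffle_seed` can be illustrated with Python's standard library (a sketch of the general seeded-shuffle idea, not the toolkit's internal implementation):

```python
import random

# Stand-in for evaluation dataset entries; the real items come from the dataset file.
items = list(range(10))

def shuffled(seq, seed=None):
    """Return a shuffled copy; a fixed seed makes the order reproducible."""
    rng = random.Random(seed)
    out = list(seq)
    rng.shuffle(out)
    return out

# Same seed -> same order on every run.
assert shuffled(items, seed=42) == shuffled(items, seed=42)
# No seed -> a fresh random order each time, mirroring an omitted shuffle_seed.
```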

### Pick up where you left off
When running an evaluation on a large dataset, it is often useful to resume from where the previous run left off. This is particularly helpful with overloaded services that may time out while running the workflow. When that happens, a workflow-interrupted warning is issued and the workflow output is saved to a file.
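The resume behavior can be approximated as follows (an illustrative sketch, assuming the saved output is a JSON file containing an `eval_output_items` list; this is not the toolkit's actual resume code):

```python
import json
import os

def load_completed_ids(output_file: str) -> set:
    """Collect ids already evaluated by a previous, possibly interrupted, run."""
    if not os.path.exists(output_file):
        return set()
    with open(output_file) as f:
        previous = json.load(f)
    return {item["id"] for item in previous.get("eval_output_items", [])}

def remaining_items(dataset: list[dict], output_file: str) -> list[dict]:
    """Skip already-completed items so the run picks up where it left off."""
    done = load_completed_ids(output_file)
    return [item for item in dataset if item["id"] not in done]
```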

213 changes: 213 additions & 0 deletions examples/benchmarks/agent_leaderboard/README.md
@@ -0,0 +1,213 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Galileo Agent Leaderboard v2 Evaluation

**Complexity:** 🟡 Intermediate

Evaluate NeMo Agent Toolkit agent workflows against the [Galileo Agent Leaderboard v2](https://huggingface.co/datasets/galileo-ai/agent-leaderboard-v2) benchmark. This benchmark tests whether an agent can select the correct tools for real-world use cases across multiple domains.

## Key Features

- **5 domains**: Banking, Healthcare, Insurance, Investment, Telecom
- **Tool stub execution**: All domain tools are registered as stubs; the agent selects tools without executing real backends
- **Tool Selection Quality (TSQ)**: F1 score between predicted and expected tool calls
- **HuggingFace integration**: Dataset downloads automatically from `galileo-ai/agent-leaderboard-v2`
- **Multi-domain evaluation**: Evaluate across one or all domains in a single run

## Table of Contents

- [Installation](#installation)
- [Set Up Environment](#set-up-environment)
- [Auto-Download from HuggingFace](#auto-download-from-huggingface)
- [Run Evaluation](#run-evaluation)
- [Understanding Results](#understanding-results)
- [All Domains Evaluation](#all-domains-evaluation)

---

## Installation

```bash
uv pip install -e examples/benchmarks/agent_leaderboard
```

This installs the `datasets` library for HuggingFace access.

---

## Set Up Environment

```bash
export NVIDIA_API_KEY=<your-nvidia-api-key>
```

---

## Auto-Download from HuggingFace

If no local file is found, the dataset loader downloads the data directly from HuggingFace. Point `file_path` at a non-existent path, and the `domains` config determines which domains to download:

```yaml
dataset:
_type: agent_leaderboard
file_path: ./data/auto_download.json # Will trigger HF download
domains: [banking]
```
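The local-file-then-download behavior can be sketched as follows (illustrative only: the function name, the `test` split, and passing the domain as the `datasets` config name are assumptions, not taken from the source):

```python
import json
from pathlib import Path

def load_scenarios(file_path: str, domains: list[str]) -> list[dict]:
    """Prefer a local JSON file; otherwise fall back to a HuggingFace download."""
    path = Path(file_path)
    if path.exists():
        return json.loads(path.read_text())
    # Fallback path: requires `pip install datasets` and network access.
    from datasets import load_dataset
    return [dict(row) for domain in domains
            for row in load_dataset("galileo-ai/agent-leaderboard-v2", domain, split="test")]
```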

---

## Run Evaluation

### Banking domain (quick test with 10 scenarios)

```bash
export AGENT_LEADERBOARD_LIMIT=10
nat eval --config_file examples/benchmarks/agent_leaderboard/configs/eval_banking.yml
```

**Expected output:**
```
INFO - Starting evaluation run with config file: .../eval_banking.yml
INFO - Loaded 10 entries from data/agent_leaderboard/agent_leaderboard_v2_banking.json
INFO - Shared workflow built (entry_function=None)
Running workflow: 100%|██████████| 10/10 [03:20<00:00, 20.00s/it]
INFO - TSQ evaluation complete: avg_f1=0.650 across 10 scenarios

=== EVALUATION SUMMARY ===
| Evaluator | Avg Score | Output File |
|-----------|-------------|-----------------|
| tsq | 0.650 | tsq_output.json |
```

### Full banking evaluation

```bash
unset AGENT_LEADERBOARD_LIMIT
nat eval --config_file examples/benchmarks/agent_leaderboard/configs/eval_banking.yml
```

---

## Understanding Results

### The `agent_leaderboard_tsq` evaluator

This example uses the **Tool Selection Quality (TSQ)** evaluator (`_type: agent_leaderboard_tsq` in the eval config). It compares the tool calls the agent made (captured by the workflow via `ToolIntentBuffer`) against the expected tool calls derived from the scenario's user goals.

The evaluator computes an **F1 score** between predicted and expected tool sets:
- **Precision** = (correctly predicted tools) / (total predicted tools)
- **Recall** = (correctly predicted tools) / (total expected tools)
- **F1** = 2 × precision × recall / (precision + recall)

Tool names are normalized before comparison (case-insensitive, underscores/hyphens stripped, module prefixes removed) so that `banking_tools__get_account_balance` matches `get_account_balance`.

The evaluator is configured in the YAML under `eval.evaluators`:

```yaml
evaluators:
tsq:
_type: agent_leaderboard_tsq
tool_weight: 1.0 # Weight for tool selection F1 (default: 1.0)
parameter_weight: 0.0 # Weight for parameter accuracy (default: 0.0)
```

The final score per item is `tool_weight × tool_f1 + parameter_weight × param_accuracy`. With default weights, only tool selection matters.
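The normalization and scoring described above can be sketched as a minimal reimplementation (for illustration only; the toolkit's evaluator may differ in details such as how duplicate tool calls are counted):

```python
def normalize_tool_name(name: str) -> str:
    """Case-insensitive; strip underscores/hyphens; drop a module prefix like 'banking_tools__'."""
    name = name.split("__")[-1]
    return name.lower().replace("_", "").replace("-", "")

def tsq_score(predicted: list[str], expected: list[str],
              tool_weight: float = 1.0, parameter_weight: float = 0.0,
              param_accuracy: float = 0.0) -> float:
    """Compute tool_weight * tool_f1 + parameter_weight * param_accuracy over normalized tool sets."""
    pred = {normalize_tool_name(t) for t in predicted}
    exp = {normalize_tool_name(t) for t in expected}
    correct = len(pred & exp)
    precision = correct / len(pred) if pred else 0.0
    recall = correct / len(exp) if exp else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return tool_weight * f1 + parameter_weight * param_accuracy

# 'banking_tools__get_account_balance' matches 'get_account_balance' after normalization.
assert tsq_score(["banking_tools__get_account_balance"], ["get_account_balance"]) == 1.0
```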

### Per-item metrics

Each item in the evaluator output contains:

| Field | Description |
|-------|-------------|
| `tool_selection_f1` | F1 score between predicted and expected tool names |
| `parameter_accuracy` | Parameter correctness (placeholder for a future enhancement) |
| `predicted_tools` | Normalized list of tools the agent called |
| `expected_tools` | Normalized list of tools expected from user goals |
| `num_predicted` | Total tool call intents captured |
| `num_expected` | Total expected tool calls from ground truth |
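Putting these fields together, a single entry in `tsq_output.json` looks roughly like this (an illustrative shape reconstructed from the field table and the inspection script; the exact layout may differ):

```json
{
  "average_score": 0.65,
  "eval_output_items": [
    {
      "id": "banking_scenario_000",
      "score": 1.0,
      "reasoning": {
        "tool_selection_f1": 1.0,
        "parameter_accuracy": 0.0,
        "predicted_tools": ["getaccountbalance"],
        "expected_tools": ["getaccountbalance"],
        "num_predicted": 1,
        "num_expected": 1
      }
    }
  ]
}
```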

### Inspect results

```bash
python -c "
import json
with open('.tmp/nat/benchmarks/agent_leaderboard/banking/tsq_output.json') as f:
data = json.load(f)
print(f'Average TSQ F1: {data[\"average_score\"]:.3f}')
print(f'Total scenarios: {len(data[\"eval_output_items\"])}')

for item in data['eval_output_items'][:3]:
r = item['reasoning']
print(f' {item[\"id\"]}:')
print(f' F1={r[\"tool_selection_f1\"]:.2f} predicted={r[\"predicted_tools\"]}')
print(f' expected={r[\"expected_tools\"]}')
"
```

**Example output:**
```
Average TSQ F1: 0.650
Total scenarios: 10
banking_scenario_000:
F1=1.00 predicted=['getaccountbalance']
expected=['getaccountbalance']
banking_scenario_001:
F1=0.67 predicted=['getaccountbalance', 'gettransactionhistory']
expected=['getaccountbalance', 'transferfunds']
banking_scenario_002:
F1=0.00 predicted=['scheduleappointment']
expected=['getexchangerates']
```

### Score interpretation

| F1 Score | Meaning |
|----------|---------|
| 1.0 | All expected tools predicted, no extra tools |
| 0.5–0.9 | Partial match: some tools correct, some missing or extra |
| 0.0 | No overlap between predicted and expected tools |

---

## All Domains Evaluation

Download all 5 domains:

```bash
python examples/dynamo_integration/scripts/download_agent_leaderboard_v2.py \
--output-dir data/agent_leaderboard \
--domains banking healthcare insurance investment telecom
```

Run across all domains:

```bash
export AGENT_LEADERBOARD_DATA=data/agent_leaderboard/agent_leaderboard_v2_all.json
nat eval --config_file examples/benchmarks/agent_leaderboard/configs/eval_all_domains.yml
```

### Available domains

| Domain | Scenarios | Tools | Personas | Description |
|--------|-----------|-------|----------|-------------|
| `banking` | 100 | 20 | 100 | Account management, transfers, loans, cards |
| `healthcare` | 100 | 20 | 100 | Appointments, prescriptions, medical records |
| `insurance` | 100 | 20 | 100 | Policies, claims, coverage, renewals |
| `investment` | 100 | 20 | 100 | Portfolio management, stocks, trading |
| `telecom` | 100 | 20 | 100 | Plans, billing, support, device management |
| **Total** | **500** | **100** | **500** | |
1 change: 1 addition & 0 deletions examples/benchmarks/agent_leaderboard/configs
43 changes: 43 additions & 0 deletions examples/benchmarks/agent_leaderboard/pyproject.toml
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=64", "setuptools-scm>=8", "setuptools_dynamic_dependencies>=1.0.0"]

[tool.setuptools_scm]
git_describe_command = "git describe --long --first-parent"
root = "../../.."

[tool.setuptools.packages.find]
where = ["src"]

[project]
name = "nat_benchmark_agent_leaderboard"
dynamic = ["version", "dependencies"]
requires-python = ">=3.11,<3.14"
description = "Galileo Agent Leaderboard v2 benchmark evaluation example for NeMo Agent Toolkit"
classifiers = ["Programming Language :: Python"]

[tool.setuptools_dynamic_dependencies]
dependencies = [
"nvidia-nat[eval,langchain,test] == {version}",
"nvidia-nat-benchmarks[agent-leaderboard] == {version}",
]

[tool.uv.sources]
nvidia-nat = { path = "../../..", editable = true }
nvidia-nat-benchmarks = { path = "../../../packages/nvidia_nat_benchmarks", editable = true }

@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Agent Leaderboard v2 — All domains evaluation
# Usage: nat eval --config_file examples/benchmarks/agent_leaderboard/configs/eval_all_domains.yml

llms:
nim_llm:
_type: nim
model_name: meta/llama-3.3-70b-instruct
base_url: ${NVIDIA_BASE_URL:-https://integrate.api.nvidia.com/v1}
api_key: ${NVIDIA_API_KEY}
max_tokens: 1024
temperature: 0.0

workflow:
_type: agent_leaderboard_workflow
llm_name: nim_llm
max_steps: 10

eval:
general:
output_dir: .tmp/nat/benchmarks/agent_leaderboard/all_domains/
workflow_alias: agent_leaderboard_all
per_input_user_id: false
max_concurrency: 5
dataset:
_type: agent_leaderboard
file_path: ${AGENT_LEADERBOARD_DATA:-./data/agent_leaderboard_v2_all.json}
domains:
- banking
- healthcare
- insurance
- investment
- telecom
limit: ${AGENT_LEADERBOARD_LIMIT:-}
structure:
question_key: question
answer_key: answer

evaluators:
tsq:
_type: agent_leaderboard_tsq
tool_weight: 1.0
parameter_weight: 0.0