Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# ==============================================================================
# Eval-only multi-node Ray cluster (deployment: none, agentic Gym evals)
# ==============================================================================
# Target use case: agentic Gym evals (GDPVal, SWE-bench, terminalbench) against
# an external policy endpoint, where Stirrup actor concurrency needs to spread
# over multiple CPU nodes to avoid per-node CPU/FUSE saturation during agent boot.
#
# Architecture (handled by the SLURM executor when eval_ray_cluster=true):
# - Allocates N CPU nodes.
# - On every node, runs the eval container and a per-node pre_cmd
# (apptainer install, FUSE deps).
# - Starts `ray start --head` on PRIMARY_NODE and `ray start --address=...`
# on the remaining nodes; waits for all N daemons to join.
# - `nemo-evaluator run_eval` runs ONLY on PRIMARY_NODE. Gym's
# `ng_e2e_collect_rollouts` connects to the Ray cluster via
# `++ray_head_node_address=$RAY_HEAD_NODE_ADDRESS`. Stirrup actors are
# scheduled across all N nodes; each actor FUSE-mounts its SIF locally.
#
# Policy routing mirrors canonical Ultra V3 GDPVal:
# ++use_absolute_ip=true
# ++policy_base_url={{target.api_endpoint.url}}
# `use_absolute_ip=true` rewrites policy_base_url so worker-node actors hit
# the head node's external IP. The adapter binds to 0.0.0.0:3825 because the
# executor exports `ADAPTER_HOST=0.0.0.0` when eval_ray_cluster=true.
#
# Prerequisite (verify before submitting): Gym's actor placement must be SPREAD,
# not Ray's default PACK. Otherwise actors fill the head node first and the
# multi-node setup buys nothing. Check Gym's `ng_e2e_collect_rollouts` source
# for the relevant placement_strategy flag and pass it via `common_params` if
# needed.
# ==============================================================================

# Hydra defaults list: compose the `slurm/default` execution config and the
# `none` deployment config, then layer this file's own keys on top (`_self_`
# is listed last).
defaults:
  - execution: slurm/default
  - deployment: none
  - _self_

# SLURM execution settings for an eval-only, CPU-only multi-node Ray cluster.
# `???` marks values that must be supplied before submitting.
execution:
  hostname: ???  # SLURM login hostname
  account: ???
  output_dir: ???  # Absolute path on compute nodes
  partition: cpu
  num_nodes: 8
  gpus_per_node: 0
  walltime: "06:00:00"
  sbatch_extra_flags:
    qos: cpu-normal

  # Multi-node eval Ray cluster
  eval_ray_cluster: true
  eval_ray_port: 6379
  eval_ray_dashboard_port: 8265
  eval_ray_ready_timeout: 600
  # Bash snippet run on every node (installs FUSE deps and apptainer).
  # NOTE(review): the executor defaults state that a non-null
  # eval_per_node_pre_cmd requires NEMO_EVALUATOR_TRUST_PRE_CMD=1 — confirm it
  # is exported in the submitting environment.
  eval_per_node_pre_cmd: |
    # pipefail: a failed download must abort instead of feeding bash an empty
    # or partial installer script.
    set -eo pipefail
    apt-get update -qq && apt-get install -y -qq squashfuse fuse3 git-lfs rpm2cpio cpio
    if ! command -v apptainer >/dev/null 2>&1; then
      # NOTE(review): the installer is fetched from the moving `main` branch;
      # consider pinning a tag or commit for reproducibility.
      # -f makes curl fail on HTTP errors rather than piping an error page.
      curl -fsSL https://raw.githubusercontent.com/apptainer/apptainer/main/tools/install-unprivileged.sh \
        | bash -s - /opt/apptainer
      ln -sf /opt/apptainer/bin/apptainer /usr/local/bin/apptainer
    fi
    # Prefer a bind-mounted apptainer build when one is present. The guard
    # matters: `ln -sf` succeeds even if the target is missing, which would
    # replace the working link created above with a dangling one.
    if [ -x /usr/local/sbin/apptainer ]; then
      ln -sf /usr/local/sbin/apptainer /usr/local/bin/apptainer 2>/dev/null || true
    fi
    mkdir -p /usr/local/var/apptainer/mnt/session

  mounts:
    mount_home: false
    evaluation: {}
    # Canonical GDPVal apptainer/SIF mounts go here (replace the `{}` above
    # with a mapping of host path -> container path), e.g.:
    #   /lustre/.../mengxiwu/apptainer/bin: /usr/local/sbin
    #   /lustre/.../mengxiwu/apptainer/etc/apptainer: /usr/local/etc/apptainer
    #   /lustre/.../mengxiwu/apptainer/libexec/apptainer: /usr/local/libexec/apptainer
    #   /lustre/.../agronskiy/images/apptainer: /gdpval/sif

# Endpoint under evaluation. The header comment of this file routes the
# policy to `target.api_endpoint.url`.
target:
  api_endpoint:
    # Replace with your target model.
    model_id: meta/llama-3.1-405b-instruct
    url: https://integrate.api.nvidia.com/v1/chat/completions
    # NOTE(review): presumably the name of the environment variable that
    # holds the API key, not the key itself — confirm against the launcher.
    api_key_name: NVIDIA_API_KEY

# Environment variables made available to the job.
# NOTE(review): the `host:` prefix presumably means "read the value from the
# submitting host's environment" — confirm against the launcher docs.
env_vars:
  NVIDIA_API_KEY: "host:NVIDIA_API_KEY"

evaluation:
  # Placeholder task. Swap it for your agentic Gym benchmark, e.g.
  # `nemo_gym_agentic` with a container and a `nemo_evaluator_config` whose
  # `common_params` include:
  #   ++use_absolute_ip=true
  #   ++policy_base_url={{target.api_endpoint.url}}
  #   ++policy_api_key={{target.api_endpoint.api_key_name}}
  #   ++policy_model_name={{target.api_endpoint.model_id}}
  #   ++ray_head_node_address=$RAY_HEAD_NODE_ADDRESS
  #   ++gdpval_stirrup_agent.responses_api_agents.stirrup_agent.concurrency=192
  tasks:
    - name: AIME_2024
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,27 @@ proxy:
haproxy_port: 5009
health_check_path: /health
health_check_status: 200

# ------------------------------------------------------------------------------
# Eval-only multi-node Ray cluster.
# Takes effect only when all of the following hold:
#   deployment.type == "none", num_nodes > 1, eval_ray_cluster == true.
# Aux deployments are rejected in combination with eval_ray_cluster=true.
# ------------------------------------------------------------------------------

# Opt in to spread eval Ray actors across all allocated nodes.
eval_ray_cluster: false

# Ray GCS port on the head node.
eval_ray_port: 6379

# Ray dashboard port.
eval_ray_dashboard_port: 8265

# Seconds to wait for all N Ray daemons to join.
eval_ray_ready_timeout: 600

# Optional bash snippet run on every node inside the eval container before
# `ray start` (e.g. install apptainer / squashfuse). When set, requires
# NEMO_EVALUATOR_TRUST_PRE_CMD=1.
eval_per_node_pre_cmd: null

# Optional bash snippet prepended to each `ray start` (head + workers) and to
# the `ray status` wait handler. Use it to put `ray` on PATH when the eval
# container ships it inside a venv
# (e.g. Gym: `source /opt/Gym/.venv/bin/activate`).
eval_ray_pre_start_cmd: null

# Optional bash snippet that REPLACES `sleep infinity` in the ray-head's
# inner_cmd, so the head's bootstrap container runs the actual workload (e.g.
# bash a lustre-rendezvous deployment script that invokes
# ng_e2e_collect_rollouts). Driver + Ray actors then share one venv, which
# eliminates pickle-ABI skew. Mirror canonical vllm_ray pre_cmd's
# lustre-rendezvous pattern. When unset, head stays `sleep infinity`.
eval_ray_head_workload_cmd: null
Loading
Loading