Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3171,3 +3171,9 @@
description:
- "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558

- config-keys:
- dsv4-fp4-gb300-dynamo-sglang
description:
- "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574
52 changes: 50 additions & 2 deletions runners/launch_gb300-cw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@ if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
export MODEL_PATH="/mnt/vast/models/dsv4"

if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git"
SRT_SLURM_RECIPES_REF="main"
# Pinned to our SemiAnalysisAI fork of NVIDIA/srt-slurm to pick up
# PR #35 (per-node nvidia-smi monitoring during the benchmark sweep)
# ahead of its upstream merge. The branch tracks PR #35's head SHA:
# to bump, re-fetch refs/pull/35/head from NVIDIA/srt-slurm and force-
# push to SemiAnalysisAI/srt-slurm:feat/inferencex-perfmon.
SRT_SLURM_RECIPES_REPO="https://github.com/SemiAnalysisAI/srt-slurm.git"
SRT_SLURM_RECIPES_REF="feat/inferencex-perfmon"
SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4"
SRT_RECIPE_DST="recipes/sglang/deepseek-v4"
elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
Expand Down Expand Up @@ -106,6 +111,30 @@ git checkout "$SRT_SLURM_RECIPES_REF"
# Overlay the recipes from $SRT_RECIPE_SRC onto $SRT_RECIPE_DST.
# `cp -T` (GNU --no-target-directory) treats the destination as the target
# itself, so the *contents* of the source directory are merged into
# $SRT_RECIPE_DST instead of being nested under
# $SRT_RECIPE_DST/<basename of source>.
# NOTE(review): $SRT_RECIPE_DST is a relative path, presumably resolved
# against the srt-slurm checkout that is the current directory here — confirm.
mkdir -p "$SRT_RECIPE_DST"
cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST"

# Enable per-node GPU perfmon (PR #35) on every overlaid recipe. `monitoring`
# is a top-level SrtConfig field and defaults to None, so without this the
# orchestrator's _start_perf_monitor short-circuits and no perf_samples_*.csv
# are ever written — multinode measured-power aggregation would silently
# skip. Idempotent: skips recipes that already declare `monitoring:`.
#
# CRITICAL: use `find` recursively, not a flat `*.yaml` glob. Recipes live
# in $SRT_RECIPE_DST/<workload>/*.yaml (e.g. .../8k1k/*.yaml) — a flat glob
# matches zero files, the loop runs zero times, no recipe gets monitoring,
# and perfmon never spawns. PR #1574's first real sweep (#26548110246) hit
# exactly this: completed "success" with no power data because the glob
# matched nothing and the failure was silent end-to-end.
#
# The "no recipes found" case (the silent failure described above) is
# reported separately from the benign "every recipe already declares
# monitoring" case, so the MISSING-power-data warning is only emitted when
# it is actually true.
#
# Globals:   INJECTED_COUNT (written) - number of recipes modified by the call
# Arguments: $1 - root directory that holds the overlaid recipes
# Outputs:   stdout: one line per modified recipe, or an informational line
#                    when every recipe already declares monitoring
#            stderr: WARNING when no *.yaml recipe exists under $1
# Returns:   0 (missing recipes produce a warning, not a failure)
enable_recipe_monitoring() {
  local recipe_root=$1
  local recipe
  local found_count=0
  INJECTED_COUNT=0

  if [[ -d "$recipe_root" ]]; then
    # NUL-delimited so unusual file names cannot split or merge entries.
    while IFS= read -r -d '' recipe; do
      found_count=$((found_count + 1))
      if ! grep -q '^monitoring:' -- "$recipe"; then
        # Leading newline guards against recipes lacking a trailing newline.
        printf '\nmonitoring:\n enabled: true\n sample_interval: 1.0\n' >> "$recipe"
        echo "[perfmon] enabled monitoring in recipe: $recipe"
        INJECTED_COUNT=$((INJECTED_COUNT + 1))
      fi
    done < <(find "$recipe_root" -type f -name '*.yaml' -print0)
  fi

  if [[ "$found_count" -eq 0 ]]; then
    echo "[perfmon] WARNING: no *.yaml recipes found under '$recipe_root' — the directory layout may have changed; power data will be MISSING from this run." >&2
  elif [[ "$INJECTED_COUNT" -eq 0 ]]; then
    echo "[perfmon] all $found_count recipe(s) under $recipe_root already declare monitoring; nothing injected"
  fi
  return 0
}

enable_recipe_monitoring "$SRT_RECIPE_DST"

echo "Installing srtctl..."
# CRITICAL — uv install location.
# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is
Expand Down Expand Up @@ -279,6 +308,25 @@ else
echo "Warning: Logs directory not found at $LOGS_DIR"
fi

# Hand the per-node perfmon CSVs off to the downstream "Process result" step
# in benchmark-multinode-tmpl.yml. srt-slurm's perfmon (PR #35) writes
# perf_samples_{node}.csv straight into $LOGS_DIR on the host. process_result.py
# already invokes aggregate_power.run() inline; teaching it to read
# GPU_METRICS_CSV_GLOB lets utils/aggregate_power.py do the multi-CSV
# aggregation (each agg JSON gets avg_power_w / joules_per_*_token patched in
# place). Use an absolute glob because process_result.py runs from
# $GITHUB_WORKSPACE, not from this srt-slurm checkout.
#
# The logs directory is resolved with `cd && pwd`, so both a relative and an
# already-absolute $LOGS_DIR yield a correct absolute glob. Files are counted
# with a shell glob rather than by parsing `ls` output.
#
# Globals:   perf_glob_dir (written)  - absolute path of the logs directory
#            perf_csv_count (written) - number of perf_samples_*.csv found
#            GITHUB_ENV (read)        - file that receives GPU_METRICS_CSV_GLOB
# Arguments: $1 - logs directory (relative or absolute)
# Outputs:   stdout: confirmation line when CSVs were found
#            stderr: WARNING when no CSV exists or GITHUB_ENV is unset
# Returns:   0 (best-effort; a missing directory is silently skipped)
publish_perf_csv_glob() {
  local logs_dir=$1
  local csv_path

  [[ -d "$logs_dir" ]] || return 0

  # CDPATH is cleared so `cd` cannot print a directory name into the capture.
  perf_glob_dir=$(CDPATH='' cd -- "$logs_dir" && pwd) || return 0

  perf_csv_count=0
  for csv_path in "$perf_glob_dir"/perf_samples_*.csv; do
    # An unmatched glob stays literal; the existence test filters it out.
    if [[ -e "$csv_path" ]]; then
      perf_csv_count=$((perf_csv_count + 1))
    fi
  done

  if [[ "$perf_csv_count" -eq 0 ]]; then
    echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv found in $perf_glob_dir — measured power aggregation will be skipped" >&2
    return 0
  fi

  echo "[perfmon] Found $perf_csv_count per-node perf_samples_*.csv under $perf_glob_dir/"
  if [[ -n "${GITHUB_ENV:-}" ]]; then
    echo "GPU_METRICS_CSV_GLOB=$perf_glob_dir/perf_samples_*.csv" >> "$GITHUB_ENV"
  else
    echo "[perfmon] WARNING: GITHUB_ENV is not set — cannot export GPU_METRICS_CSV_GLOB; measured power aggregation will be skipped" >&2
  fi
  return 0
}

publish_perf_csv_glob "$LOGS_DIR"

if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
if [ ! -d "$LOGS_DIR" ]; then
exit 1
Expand Down
Loading