Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
dd551f3
fix(profile): upload staged trace path
Oseltamivir May 21, 2026
7c0cb09
feat(profile): add B300 DeepSeek V4 Flash config
Oseltamivir May 21, 2026
e2639e5
fix(profile): stage relay trace in checkout
Oseltamivir May 21, 2026
5e87c8c
feat(profile): add Flash DEP MTP3 profile
Oseltamivir May 21, 2026
b00f855
fix(profile): push traces with repo token
Oseltamivir May 21, 2026
a8df66c
fix(profile): align Flash MTP profiling steps
Oseltamivir May 21, 2026
e78383e
fix(profile): capture two SGL profiling steps
Oseltamivir May 21, 2026
e3393af
fix(profile): enable B300 Flash vLLM traces
Oseltamivir May 21, 2026
39f914b
feat(profile): add Flash vLLM MTP3 run
Oseltamivir May 21, 2026
f9d6523
fix(profile): capture one profiling step
Oseltamivir May 21, 2026
2e2f876
fix(profile): switch Flash vLLM MTP to DEP8
Oseltamivir May 21, 2026
cd160ee
fix(profile): rerun Flash vLLM MTP at conc8
Oseltamivir May 21, 2026
9b534f7
fix(profile): disable Flash vLLM MTP cudagraphs
Oseltamivir May 21, 2026
4f1f0fa
fix(profile): limit Flash vLLM trace to decode steps
Oseltamivir May 22, 2026
fc21e40
fix(profile): disable Flash vLLM torch compile
Oseltamivir May 22, 2026
5eb4b65
fix(profile): capture three Flash vLLM decode steps
Oseltamivir May 22, 2026
39f3b7c
fix(profile): enable Flash vLLM cudagraphs
Oseltamivir May 22, 2026
ef11755
fix(profile): capture eight Flash vLLM decode steps
Oseltamivir May 22, 2026
22db6e2
fix(profile): use compatible Flash cudagraph config
Oseltamivir May 22, 2026
81f5a8a
fix(profile): run five Flash vLLM decode steps
Oseltamivir May 22, 2026
a37cb73
fix(profile): use vLLM profiler window for Flash steps
Oseltamivir May 22, 2026
6f89c90
fix(profile): limit Flash vLLM request length
Oseltamivir May 22, 2026
e9bfbf9
fix(results): guard zero tpot interactivity
Oseltamivir May 22, 2026
64cbdc3
fix(profile): simulate Flash offline decode batch
Oseltamivir May 26, 2026
6a824fc
fix(profile): warm up Flash decode batch before trace
Oseltamivir May 26, 2026
3b7d8a7
fix(profile): target third Flash decode step
Oseltamivir May 26, 2026
524ca63
fix(profile): add GB200 DSV4 MTP3 profile
Oseltamivir May 26, 2026
58d423e
fix(profile): stringify multinode concurrency env
Oseltamivir May 26, 2026
2f300a3
fix(profile): use aggregate GB200 DSV4 profile
Oseltamivir May 26, 2026
eb885ff
fix(profile): enable vllm trace output for GB200
Oseltamivir May 26, 2026
27ddec5
fix(profile): capture later GB200 decode step
Oseltamivir May 26, 2026
0dc0300
fix(profile): switch GB200 profile to DSV4 Flash
Oseltamivir May 26, 2026
16d77dc
fix(profile): point GB200 Flash to Lustre weights
Oseltamivir May 26, 2026
980316a
fix(profile): use 4-GPU GB200 Flash profile
Oseltamivir May 27, 2026
47d860d
fix: avoid stale profile trace reuse
Oseltamivir May 27, 2026
f44c224
fix: prioritize decode in gb200 flash profile
Oseltamivir May 27, 2026
4ae1fe6
fix: profile final gb200 flash decode step
Oseltamivir May 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2038,6 +2038,77 @@ dsv4-fp4-b300-sglang:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }

# Targeted single-point Flash config for profile.yml. Keep the existing Pro
# sweep entry above unchanged; this profile-only key reuses the same B300
# SGLang launch path at the 1k1k, conc=64 point.
dsv4-flash-fp4-b300-sglang:
# The image is pinned by sha256 digest in addition to its tag.
# NOTE(review): dsv4-flash-fp4-b300-sglang-mtp pins a different digest for the
# same deepseek-v4-b300 tag; confirm the two profile entries are meant to run
# different image builds.
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
model: deepseek-ai/DeepSeek-V4-Flash
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
# conc-start equals conc-end, so this entry lists exactly one concurrency (64).
- { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }

# Targeted single-point Flash vLLM profile matching the SGLang profile point
# above. Keep this narrow so profile.yml dispatches only the intended 1k1k run.
dsv4-flash-fp4-b300-vllm:
# NOTE(review): tag-only image reference; the SGLang profile entries in this
# file pin a sha256 digest. Consider pinning here too if trace comparisons
# need a fixed image build.
image: vllm/vllm-openai:v0.21.0
model: deepseek-ai/DeepSeek-V4-Flash
model-prefix: dsv4
runner: b300
precision: fp4
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
# Same tp/ep/conc values as the dsv4-flash-fp4-b300-sglang entry.
- { tp: 4, ep: 1, conc-start: 64, conc-end: 64 }

# Targeted Flash vLLM MTP DEP8 profile at the 1k1k, conc=8 point (the non-MTP
# Flash vLLM profile above runs conc=64). The shared launcher maps dp-attn=true
# to DP without TP, and selects 3 speculative tokens for this model.
dsv4-flash-fp4-b300-vllm-mtp:
# NOTE(review): tag-only image reference (no sha256 digest), same tag as the
# non-MTP dsv4-flash-fp4-b300-vllm entry.
image: vllm/vllm-openai:v0.21.0
model: deepseek-ai/DeepSeek-V4-Flash
model-prefix: dsv4
runner: b300
precision: fp4
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
# Single point at conc=8 (not 64 like the non-MTP entries), with tp=8, ep=8,
# dp-attn enabled and spec-decoding set to mtp.
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8, conc-end: 8, spec-decoding: mtp }

# Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the
# non-MTP Flash profile above. The shared SGLang MTP launcher selects the
# Flash-only (steps=3, draft-tokens=3) speculative settings for this model.
dsv4-flash-fp4-b300-sglang-mtp:
# The image is pinned by sha256 digest in addition to its tag.
# NOTE(review): this digest (26e116bd...) differs from the one pinned by the
# non-MTP dsv4-flash-fp4-b300-sglang entry (2fec8d79...) for the same tag;
# confirm the MTP profile is meant to run a different image build.
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
model: deepseek-ai/DeepSeek-V4-Flash
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
# Single point at conc=64 with tp=4, ep=4, dp-attn enabled and
# spec-decoding set to mtp.
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
# DP_ATTENTION:
Expand Down Expand Up @@ -8609,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
ep: 8
dp-attn: true

# Dedicated profile point for a single GB200 node at global batch 256:
# aggregated (disagg: false) DEP4 with MTP3 at conc=256, isl=8192/osl=256.
# The entry is declared multinode: true even though it targets one node.
dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile:
image: vllm/vllm-openai:v0.21.0-ubuntu2404
model: deepseek-ai/DeepSeek-V4-Flash
model-prefix: dsv4
runner: gb200
precision: fp4
framework: dynamo-vllm
multinode: true
# Aggregated shape: disagg is off and the decode section below declares zero
# workers, so every worker is described by the prefill section.
disagg: false
scenarios:
fixed-seq-len:
- isl: 8192
# NOTE(review): osl is 256 while the recipe referenced below lives under an
# "8k1k" directory; confirm the short output length is intentional.
osl: 256
search-space:
# One concurrency value, given as a list (conc-list) rather than the
# conc-start/conc-end pair used by the single-node profile entries.
- conc-list: [256]
spec-decoding: mtp
prefill:
# profile.yml computes PREFILL_GPUS as num-worker * tp, i.e. 4 * 1 here.
num-worker: 4
tp: 1
ep: 4
dp-attn: true
additional-settings:
# Each entry is a KEY=VALUE string; profile.yml exports every entry into the
# launch step's environment.
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml"
decode:
# Zero workers: profile.yml computes DECODE_GPUS as num-worker * tp, i.e. 0.
num-worker: 0
tp: 1
ep: 1
dp-attn: false

dsv4-fp4-b300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
177 changes: 164 additions & 13 deletions .github/workflows/profile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,18 @@ jobs:
TP: ${{ matrix.config.tp }}
EP_SIZE: ${{ matrix.config.ep }}
DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
CONC: ${{ matrix.config.conc }}
CONC: ${{ toJson(matrix.config.conc) }}
CONC_JSON: ${{ toJson(matrix.config.conc) }}
PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }}
PREFILL_TP: ${{ matrix.config.prefill.tp }}
PREFILL_EP: ${{ matrix.config.prefill.ep }}
PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }}
PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }}
DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }}
DECODE_TP: ${{ matrix.config.decode.tp }}
DECODE_EP: ${{ matrix.config.decode.ep }}
DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }}
DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }}
SPEC_DECODING: ${{ matrix.config.spec-decoding }}
DISAGG: ${{ matrix.config.disagg }}
MOE_DEBUG: '0'
Expand Down Expand Up @@ -148,7 +159,7 @@ jobs:
ref: ${{ inputs.ref || github.sha }}
clean: false

- name: Launch + Profile (single-node sglang/vllm)
- name: Launch + Profile
id: run
env:
RUNNER_NAME: ${{ runner.name }}
Expand All @@ -159,20 +170,153 @@ jobs:
shell: bash
run: |
set -euo pipefail
ep_val="${EP_SIZE:-1}"
res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"

# Print every entry of a JSON array on its own line.
#   $1 - JSON text; an empty string or the literal `null` prints nothing.
# The caller reads the output line by line and exports each line. The heredoc
# body and its terminator stay unindented because `<<'PY'` does not strip
# leading whitespace.
export_additional_settings() {
  local payload="$1"
  python3 - "$payload" <<'PY'
import json
import sys

payload = sys.argv[1]
# Empty text, JSON null and an empty array all mean "nothing to emit".
entries = [] if payload in ("", "null") else (json.loads(payload) or [])
for entry in entries:
    print(entry)
PY
}

# Print the concurrency setting as a single filename-friendly token.
# Reads CONC_JSON, then CONC, from the environment (defaulting to "[]").
# A JSON list is joined with "x" (e.g. [1, 2] -> 1x2); any other JSON value,
# or text that is not valid JSON, is printed as a plain string.
normalize_conc() {
  python3 - <<'PY'
import json
import os

encoded = os.environ.get("CONC_JSON") or os.environ.get("CONC") or "[]"
try:
    decoded = json.loads(encoded)
except json.JSONDecodeError:
    # Not JSON: fall back to the raw text.
    decoded = encoded
# Treat a scalar as a one-element list so both cases share one print.
parts = decoded if isinstance(decoded, list) else [decoded]
print("x".join(str(part) for part in parts))
PY
}

# Choose the result-file name. Configs that provide both prefill and decode
# worker counts take the multinode naming path; everything else keeps the
# single-node tp/ep naming.
if [ -z "${PREFILL_NUM_WORKERS:-}" ] || [ -z "${DECODE_NUM_WORKERS:-}" ]; then
  ep_val="${EP_SIZE:-1}"
  res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
else
  conc_val="$(normalize_conc)"
  res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}"

  # Publish the multinode flag and per-role GPU counts (workers * tp) to
  # later steps through GITHUB_ENV.
  echo "IS_MULTINODE=true" >> "$GITHUB_ENV"
  echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV"
  echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV"

  # Export every non-empty additional setting, prefill first and then decode.
  # The variable is looked up by name on each pass so the decode JSON is read
  # only after the prefill settings have been exported.
  for settings_var in PREFILL_ADDITIONAL_SETTINGS_JSON DECODE_ADDITIONAL_SETTINGS_JSON; do
    while IFS= read -r setting; do
      if [ -n "$setting" ]; then
        export "$setting"
      fi
    done < <(export_additional_settings "${!settings_var:-null}")
  done
fi

# Make the result name visible to the launcher started below (export) and to
# later workflow steps (GITHUB_ENV).
export RESULT_FILENAME="${res_name}"
echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"

# Delete leftovers before launching so a trace or log archive from an earlier
# run cannot be mistaken for this run's output.
echo "Removing stale profile artifacts from previous runs"
rm -rf LOGS
rm -f profile_*.trace.json.gz multinode_server_logs.tar.gz

# ${RUNNER_NAME%%_*} drops everything from the first underscore onwards, so the
# launcher script is chosen by the leading part of the runner name.
bash ./runners/launch_${RUNNER_NAME%%_*}.sh

if [ ! -f "${res_name}.json" ]; then
echo "Run failed: Benchmark result ${res_name}.json not found." >&2
exit 1
result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)"
if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then
cp "$result_candidate" "${res_name}.json"
else
echo "Run failed: Benchmark result ${res_name}.json not found." >&2
exit 1
fi
fi

trace_path="profile_${res_name}.trace.json.gz"
if [ ! -f "$trace_path" ] && [ -d LOGS ]; then
trace_candidate="$(python3 - <<'PY'
from pathlib import Path

root = Path("LOGS")

def is_trace_candidate(path: Path) -> bool:
    """Return True when the file name of ``path`` looks like a profiler trace.

    Names that start with ``results_`` or contain ``profile_export`` are
    always rejected. Otherwise a name is accepted when it ends with one of
    the explicit trace suffixes, or when it contains ``trace`` and ends with
    ``.json`` or ``.json.gz``. Only ``path.name`` is examined.
    """
    filename = path.name
    excluded = filename.startswith("results_") or "profile_export" in filename
    if excluded:
        return False
    trace_suffixes = (".trace.json", ".trace.json.gz", ".pt.trace.json", ".pt.trace.json.gz")
    json_suffixes = (".json", ".json.gz")
    return filename.endswith(trace_suffixes) or (
        "trace" in filename and filename.endswith(json_suffixes)
    )

candidates = [p for p in root.rglob("*") if p.is_file() and is_trace_candidate(p)]
if candidates:
print(max(candidates, key=lambda p: (p.stat().st_mtime_ns, p.stat().st_size)))
PY
)"
if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then
echo "Selected profile trace candidate: $trace_candidate"
if [[ "$trace_candidate" == *.gz ]]; then
cp "$trace_candidate" "$trace_path"
else
gzip -c "$trace_candidate" > "$trace_path"
fi
fi
fi

if [ -f "$trace_path" ]; then
echo "Profile trace prepared: $trace_path"
ls -lh "$trace_path"
sha256sum "$trace_path"
python3 - "$trace_path" <<'PY'
# Sanity-check the prepared trace file named by argv[1]: it must contain a
# "traceEvents" key near its start, and a "world_size" value found there must
# equal one of the GPU counts derived from the PREFILL_*/DECODE_* environment.
# NOTE(review): this view has lost indentation, so the exact nesting of the
# statements inside the loop below should be confirmed against the real file.
import gzip
import os
import re
import sys

trace_path = sys.argv[1]
# GPU counts that a recorded world_size is allowed to equal.
expected = set()
worker_gpus = []
for workers_key, tp_key in (
("PREFILL_NUM_WORKERS", "PREFILL_TP"),
("DECODE_NUM_WORKERS", "DECODE_TP"),
):
workers = os.environ.get(workers_key)
tp = os.environ.get(tp_key)
# A role is considered only when both values are set and consist of digits.
if workers and workers.isdigit() and tp and tp.isdigit():
gpus = int(workers) * int(tp)
if gpus:
expected.add(gpus)
worker_gpus.append(gpus)
# With more than one contributing role, the combined count is accepted too.
if len(worker_gpus) > 1:
expected.add(sum(worker_gpus))

# Read at most the first 1024 * 1024 characters (decompressed for .gz files);
# undecodable bytes are replaced rather than raising.
opener = gzip.open if trace_path.endswith(".gz") else open
with opener(trace_path, "rt", errors="replace") as f:
prefix = f.read(1024 * 1024)

if '"traceEvents"' not in prefix:
raise SystemExit(f"{trace_path} does not look like a Perfetto trace: traceEvents key not found near start")

# The world_size check is skipped when no expected counts could be derived or
# when no world_size appears in the inspected prefix.
match = re.search(r'"world_size"\s*:\s*(\d+)', prefix)
if expected and match:
world_size = int(match.group(1))
if world_size not in expected:
allowed = ", ".join(str(v) for v in sorted(expected))
raise SystemExit(
f"{trace_path} has distributed world_size={world_size}, expected one of: {allowed}"
)
PY
echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
if [ "${FRAMEWORK}" = "sglang" ]; then
# Try to locate corresponding TP-0 traces produced by SGLang profiler
Expand All @@ -193,6 +337,11 @@ jobs:
fi
else
echo "Profile trace not found: $trace_path" >&2
if [ -d LOGS ]; then
echo "LOGS profile candidates:" >&2
find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true
fi
exit 1
fi

- name: Process result (json -> agg)
Expand All @@ -206,7 +355,7 @@ jobs:
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: profile_${{ env.RESULT_FILENAME }}
path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz
path: ${{ steps.run.outputs.trace }}
if-no-files-found: ignore

- name: Upload TP-0-DECODE trace as artifact
Expand Down Expand Up @@ -240,33 +389,35 @@ jobs:
repository: SemiAnalysisAI/InferenceX-trace-storage
path: storage
ref: master
ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }}
token: ${{ secrets.REPO_PAT }}
fetch-depth: 0

- name: Push profile to storage repo
if: ${{ steps.run.outputs.trace != '' }}
id: push
env:
TRACE_LOCAL: ${{ steps.run.outputs.trace }}
REPO_PAT: ${{ secrets.REPO_PAT }}
shell: bash
run: |
set -euo pipefail

dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}"
mkdir -p "$dest_dir"
cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"

pushd storage >/dev/null
git config user.name "github-actions"
git config user.email "github-actions@github.com"
git remote set-url origin "https://x-access-token:${REPO_PAT}@github.com/SemiAnalysisAI/InferenceX-trace-storage.git"
git add -A
git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit"
git push
git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit"
git push origin HEAD:master
STORAGE_SHA="$(git rev-parse HEAD)"
popd >/dev/null

export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz"
export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz"
export TITLE="${RESULT_FILENAME}"

enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"
Expand Down
Loading
Loading