diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..72a8ca70f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2038,6 +2038,77 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } +# Targeted single-point Flash config for profile.yml. Keep the existing Pro +# sweep entry above unchanged; this profile-only key reuses the same B300 +# SGLang launch path at the 1k1k, conc=64 point. +dsv4-flash-fp4-b300-sglang: + image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } + +# Targeted single-point Flash vLLM profile matching the SGLang profile point +# above. Keep this narrow so profile.yml dispatches only the intended 1k1k run. +dsv4-flash-fp4-b300-vllm: + image: vllm/vllm-openai:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 64, conc-end: 64 } + +# Targeted Flash vLLM MTP DEP8 profile at the same single-point profile +# location. The shared launcher maps dp-attn=true to DP without TP, and selects +# 3 speculative tokens for this model. +dsv4-flash-fp4-b300-vllm-mtp: + image: vllm/vllm-openai:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 8, dp-attn: true, conc-start: 8, conc-end: 8, spec-decoding: mtp } + +# Targeted Flash MTP profile: DEP4 at the same 1k1k conc=64 point as the +# non-MTP Flash profile above. The shared SGLang MTP launcher selects the +# Flash-only (steps=3, draft-tokens=3) speculative settings for this model. +dsv4-flash-fp4-b300-sglang-mtp: + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp } + # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by # DP_ATTENTION: @@ -8609,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true +# Dedicated profile point for a single GB200 node / global batch 256 shape: +# aggregated DEP4 on GB200, MTP3, conc=256. +dsv4-flash-fp4-gb200-dynamo-vllm-mtp3-profile: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: false + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 256 + search-space: + - conc-list: [256] + spec-decoding: mtp + prefill: + num-worker: 4 + tp: 1 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 1 + dp-attn: false + dsv4-fp4-b300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index 8152d47a5..e2d08430f 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -112,7 +112,18 @@ jobs: TP: ${{ matrix.config.tp }} EP_SIZE: ${{ matrix.config.ep }} DP_ATTENTION: ${{ matrix.config['dp-attn'] }} - CONC: ${{ matrix.config.conc }} + CONC: ${{ toJson(matrix.config.conc) }} + CONC_JSON: ${{ toJson(matrix.config.conc) }} + PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }} + PREFILL_TP: ${{ matrix.config.prefill.tp }} + PREFILL_EP: ${{ matrix.config.prefill.ep }} + PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }} + PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }} + DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }} + DECODE_TP: ${{ matrix.config.decode.tp }} + DECODE_EP: ${{ matrix.config.decode.ep }} + DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }} + DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }} SPEC_DECODING: ${{ matrix.config.spec-decoding }} DISAGG: ${{ matrix.config.disagg }} MOE_DEBUG: '0' @@ -148,7 +159,7 @@ jobs: ref: ${{ inputs.ref || github.sha }} clean: false - - name: Launch + Profile (single-node sglang/vllm) + - name: Launch + Profile id: run env: RUNNER_NAME: ${{ runner.name }} @@ -159,20 +170,153 @@ jobs: shell: bash run: | set -euo pipefail - ep_val="${EP_SIZE:-1}" - res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}" + + export_additional_settings() { + local settings_json="$1" + python3 - "$settings_json" <<'PY' + import json + import sys + + raw = sys.argv[1] + if not raw or raw == "null": + raise SystemExit(0) + for item in json.loads(raw) or []: + print(item) + PY + } + + normalize_conc() { + python3 - <<'PY' + import json + import os + + raw = os.environ.get("CONC_JSON") or os.environ.get("CONC") or "[]" + try: + value = json.loads(raw) + except json.JSONDecodeError: + value = raw + if isinstance(value, list): + print("x".join(str(v) for v in value)) + else: + print(str(value)) + PY + } + + if [ -n "${PREFILL_NUM_WORKERS:-}" ] && [ -n "${DECODE_NUM_WORKERS:-}" ]; then + conc_val="$(normalize_conc)" + res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}" + + echo "IS_MULTINODE=true" >> "$GITHUB_ENV" + echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV" + echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV" + + while IFS= read -r setting; do + if [ -n "$setting" ]; then + export "$setting" + fi + done < <(export_additional_settings "${PREFILL_ADDITIONAL_SETTINGS_JSON:-null}") + while IFS= read -r setting; do + if [ -n "$setting" ]; then + export "$setting" + fi + done < <(export_additional_settings "${DECODE_ADDITIONAL_SETTINGS_JSON:-null}") + else + ep_val="${EP_SIZE:-1}" + res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}" + fi + export RESULT_FILENAME="${res_name}" echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV" + echo "Removing stale profile artifacts from previous runs" + rm -rf LOGS + rm -f profile_*.trace.json.gz multinode_server_logs.tar.gz + bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ ! -f "${res_name}.json" ]; then - echo "Run failed: Benchmark result ${res_name}.json not found." >&2 - exit 1 + result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)" + if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then + cp "$result_candidate" "${res_name}.json" + else + echo "Run failed: Benchmark result ${res_name}.json not found." >&2 + exit 1 + fi fi trace_path="profile_${res_name}.trace.json.gz" + if [ ! -f "$trace_path" ] && [ -d LOGS ]; then + trace_candidate="$(python3 - <<'PY' + from pathlib import Path + + root = Path("LOGS") + + def is_trace_candidate(path: Path) -> bool: + name = path.name + if name.startswith("results_") or "profile_export" in name: + return False + if name.endswith((".trace.json", ".trace.json.gz", ".pt.trace.json", ".pt.trace.json.gz")): + return True + return "trace" in name and name.endswith((".json", ".json.gz")) + + candidates = [p for p in root.rglob("*") if p.is_file() and is_trace_candidate(p)] + if candidates: + print(max(candidates, key=lambda p: (p.stat().st_mtime_ns, p.stat().st_size))) + PY + )" + if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then + echo "Selected profile trace candidate: $trace_candidate" + if [[ "$trace_candidate" == *.gz ]]; then + cp "$trace_candidate" "$trace_path" + else + gzip -c "$trace_candidate" > "$trace_path" + fi + fi + fi + if [ -f "$trace_path" ]; then + echo "Profile trace prepared: $trace_path" + ls -lh "$trace_path" + sha256sum "$trace_path" + python3 - "$trace_path" <<'PY' + import gzip + import os + import re + import sys + + trace_path = sys.argv[1] + expected = set() + worker_gpus = [] + for workers_key, tp_key in ( + ("PREFILL_NUM_WORKERS", "PREFILL_TP"), + ("DECODE_NUM_WORKERS", "DECODE_TP"), + ): + workers = os.environ.get(workers_key) + tp = os.environ.get(tp_key) + if workers and workers.isdigit() and tp and tp.isdigit(): + gpus = int(workers) * int(tp) + if gpus: + expected.add(gpus) + worker_gpus.append(gpus) + if len(worker_gpus) > 1: + expected.add(sum(worker_gpus)) + + opener = gzip.open if trace_path.endswith(".gz") else open + with opener(trace_path, "rt", errors="replace") as f: + prefix = f.read(1024 * 1024) + + if '"traceEvents"' not in prefix: + raise SystemExit(f"{trace_path} does not look like a Perfetto trace: traceEvents key not found near start") + + match = re.search(r'"world_size"\s*:\s*(\d+)', prefix) + if expected and match: + world_size = int(match.group(1)) + if world_size not in expected: + allowed = ", ".join(str(v) for v in sorted(expected)) + raise SystemExit( + f"{trace_path} has distributed world_size={world_size}, expected one of: {allowed}" + ) + PY echo "trace=$trace_path" >> "$GITHUB_OUTPUT" if [ "${FRAMEWORK}" = "sglang" ]; then # Try to locate corresponding TP-0 traces produced by SGLang profiler @@ -193,6 +337,11 @@ jobs: fi else echo "Profile trace not found: $trace_path" >&2 + if [ -d LOGS ]; then + echo "LOGS profile candidates:" >&2 + find LOGS -maxdepth 8 -type f \( -path "*/profiles/*" -o -name "*trace*" -o -name "*profile*" \) -printf "%p %s bytes\n" 2>/dev/null | sort >&2 || true + fi + exit 1 fi - name: Process result (json -> agg) @@ -206,7 +355,7 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: profile_${{ env.RESULT_FILENAME }} - path: profile_${{ env.RESULT_FILENAME }}.trace.json.gz + path: ${{ steps.run.outputs.trace }} if-no-files-found: ignore - name: Upload TP-0-DECODE trace as artifact @@ -240,7 +389,7 @@ jobs: repository: SemiAnalysisAI/InferenceX-trace-storage path: storage ref: master - ssh-key: ${{ secrets.PROFILER_STORAGE_DEPLOY_KEY }} + token: ${{ secrets.REPO_PAT }} fetch-depth: 0 - name: Push profile to storage repo @@ -248,25 +397,27 @@ jobs: id: push env: TRACE_LOCAL: ${{ steps.run.outputs.trace }} + REPO_PAT: ${{ secrets.REPO_PAT }} shell: bash run: | set -euo pipefail - dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}" + dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}" mkdir -p "$dest_dir" cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz" pushd storage >/dev/null git config user.name "github-actions" git config user.email "github-actions@github.com" + git remote set-url origin "https://x-access-token:${REPO_PAT}@github.com/SemiAnalysisAI/InferenceX-trace-storage.git" git add -A - git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit" - git push + git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit" + git push origin HEAD:master STORAGE_SHA="$(git rev-parse HEAD)" popd >/dev/null - export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz" - export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}" + export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz" + export TITLE="${RESULT_FILENAME}" enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')" enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')" diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index cfd30cd04..e6980d0c1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -178,6 +178,7 @@ wait_for_server_ready() { # --max-concurrency: Max concurrency # --result-filename: Result filename without extension # --result-dir: Result directory +# --num-warmups: Optional warmup request count before benchmark/profile # --use-chat-template: Optional flag to enable chat template # --dsv4: Optional flag to use the DeepSeek-V4 chat template # (encoding_dsv4.py) instead of the tokenizer's built-in jinja @@ -204,6 +205,7 @@ run_benchmark_serving() { local result_filename="" local result_dir="" local workspace_dir="" + local num_warmups="" local use_chat_template=false local dsv4=false local trust_remote_code=false @@ -259,6 +261,10 @@ run_benchmark_serving() { workspace_dir="$2" shift 2 ;; + --num-warmups) + num_warmups="$2" + shift 2 + ;; --use-chat-template) use_chat_template=true shift @@ -341,6 +347,10 @@ run_benchmark_serving() { num_prompts="$max_concurrency" fi + if [[ -z "$num_warmups" ]]; then + num_warmups="$((2 * max_concurrency))" + fi + # Build benchmark command local benchmark_cmd=( python3 "$workspace_dir/utils/bench_serving/benchmark_serving.py" @@ -357,7 +367,7 @@ run_benchmark_serving() { --ignore-eos "${profile_flag[@]}" --save-result - --num-warmups "$((2 * max_concurrency))" \ + --num-warmups "$num_warmups" \ --percentile-metrics 'ttft,tpot,itl,e2el' --result-dir "$result_dir" --result-filename "$result_filename.json" @@ -508,7 +518,7 @@ move_profile_trace_for_relay() { return 0 fi - local dest_trace="/workspace/profile_${RESULT_FILENAME}.trace.json.gz" + local dest_trace="$PWD/profile_${RESULT_FILENAME}.trace.json.gz" if [[ "$trace_file" == *.gz ]]; then cp -f "$trace_file" "$dest_trace" else diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml new file mode 100644 index 000000000..00cf06b78 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-flash-profile-4gpu-conc256-mtp3.yaml @@ -0,0 +1,107 @@ +name: "svf-vllm-agg-gb200-flash-profile-4gpu-conc256-mtp3" + +model: + path: "deepseek-v4-flash" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_RPC_TIMEOUT: "1800000" + VLLM_TORCH_PROFILER_DIR: "/logs/profiles/agg" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + aggregated: + served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + speculative-config: '{"method":"mtp","num_speculative_tokens":3}' + attention-config: '{"use_fp4_indexer_cache":true}' + tokenizer-mode: deepseek_v4 + max-model-len: 8704 + max-num-seqs: 256 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 256 + profiler-config: '{"profiler":"torch","torch_profiler_dir":"/logs/profiles/agg","ignore_frontend":true,"delay_iterations":1296,"max_iterations":1,"active_iterations":1,"torch_profiler_with_stack":false}' + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + all2all-backend: "flashinfer_nvlink_one_sided" + +profiling: + type: "torch" + aggregated: + start_step: 1296 + stop_step: 1297 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 256 + concurrencies: "256" + req_rate: "inf" + num_prompts_mult: 1 + num_warmup_mult: 4 + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Flash" + container: + image: "vllm/vllm-openai:v0.21.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 03102778d..0ce1f016f 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -77,11 +77,17 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + SPECULATIVE_NUM_STEPS=1 + SPECULATIVE_NUM_DRAFT_TOKENS=2 + if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + SPECULATIVE_NUM_STEPS=3 + SPECULATIVE_NUM_DRAFT_TOKENS=3 + fi SPEC_FLAGS=( --speculative-algorithm EAGLE - --speculative-num-steps 1 + --speculative-num-steps "$SPECULATIVE_NUM_STEPS" --speculative-eagle-topk 1 - --speculative-num-draft-tokens 2 + --speculative-num-draft-tokens "$SPECULATIVE_NUM_DRAFT_TOKENS" ) PARALLEL_ARGS=( --dp-size "$TP" diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh index 92d4bf4ad..8bf458ae4 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh @@ -47,6 +47,14 @@ if [ "${DP_ATTENTION}" = "true" ]; then MOE_ARGS=(--moe-backend deep_gemm_mega_moe) fi +PROFILE_ARGS=() +if [[ "${PROFILE:-}" == "1" ]]; then + PROFILE_ARGS=( + --profiler-config + "{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" + ) +fi + if [ "${DP_ATTENTION}" = "true" ]; then MAX_NUM_BATCHED_TOKENS=2048 else @@ -76,6 +84,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --no-enable-prefix-caching \ "${EP_ARGS[@]}" \ "${MOE_ARGS[@]}" \ + "${PROFILE_ARGS[@]}" \ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index cb41a9eb1..efda4024d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -44,7 +44,41 @@ else MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 )) fi +PROFILE_ARGS=() +if [[ "${PROFILE:-}" == "1" ]]; then + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\"}" + if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + PROFILER_CONFIG="{\"profiler\":\"torch\",\"torch_profiler_dir\":\"${VLLM_TORCH_PROFILER_DIR:-/workspace/}\",\"ignore_frontend\":true,\"delay_iterations\":3,\"max_iterations\":1,\"active_iterations\":1,\"torch_profiler_with_stack\":false}" + fi + PROFILE_ARGS=( + --profiler-config + "$PROFILER_CONFIG" + ) +fi + +COMPILATION_ARGS=( + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --max-cudagraph-capture-size 2048 +) +if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + COMPILATION_ARGS=( + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --max-cudagraph-capture-size 2048 + ) +fi + BENCHMARK_MAX_MODEL_LEN=$MAX_MODEL_LEN +BENCHMARK_OUTPUT_LEN=$OSL +BENCHMARK_NUM_PROMPTS=$((CONC * 10)) +BENCHMARK_MAX_CONCURRENCY=$CONC +BENCHMARK_NUM_WARMUPS=$((2 * BENCHMARK_MAX_CONCURRENCY)) + +if [[ "${PROFILE:-}" == "1" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + BENCHMARK_OUTPUT_LEN=3 + BENCHMARK_NUM_PROMPTS=256 + BENCHMARK_MAX_CONCURRENCY=256 + BENCHMARK_NUM_WARMUPS=4096 +fi if [ "${EVAL_ONLY}" = "true" ]; then EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN") @@ -54,8 +88,12 @@ else SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN" fi -# use 2 speculative tokens for all configs for now +# Keep the existing Pro MTP profile at 2 speculative tokens; Flash uses the +# requested 3-token MTP profile. NUM_SPEC_TOKENS=2 +if [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + NUM_SPEC_TOKENS=3 +fi start_gpu_monitor @@ -69,13 +107,13 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --no-enable-prefix-caching \ "${EP_ARGS[@]}" \ "${MOE_ARGS[@]}" \ - --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ + "${PROFILE_ARGS[@]}" \ + "${COMPILATION_ARGS[@]}" \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 \ - --max-cudagraph-capture-size 2048 \ --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --max-model-len "$SERVE_MAX_MODEL_LEN" \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 & @@ -93,10 +131,11 @@ run_benchmark_serving \ --port "$PORT" \ --backend vllm \ --input-len "$ISL" \ - --output-len "$OSL" \ + --output-len "$BENCHMARK_OUTPUT_LEN" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ + --num-prompts "$BENCHMARK_NUM_PROMPTS" \ + --max-concurrency "$BENCHMARK_MAX_CONCURRENCY" \ + --num-warmups "$BENCHMARK_NUM_WARMUPS" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ --trust-remote-code \ diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index cca8b4ab0..fcc630db9 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -287,7 +287,7 @@ else HF_HUB_CACHE_MOUNT="/data/models" if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" - elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then + elif [[ "$MODEL_PREFIX" == "dsv4" && "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index dada98bd6..ed4824ef5 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -16,11 +16,15 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Same compute-node-local NVMe path as the dynamo-vllm dsv4 - # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX - # matches the model.path alias in our DSV4 sglang recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4 + # sglang recipes. + if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" + else + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + fi else export MODEL_PATH=$MODEL fi @@ -49,11 +53,15 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre - # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the - # model.path alias in our DSV4 recipes. - export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + # SRT_SLURM_MODEL_PREFIX matches the model.path alias in our DSV4 + # recipes. + if [[ $MODEL == "deepseek-ai/DeepSeek-V4-Flash" ]]; then + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" + else + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + fi else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 @@ -296,6 +304,7 @@ echo "Collecting results..." if [ -d "$LOGS_DIR" ]; then echo "Found logs directory: $LOGS_DIR" + rm -rf "$GITHUB_WORKSPACE/LOGS" cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . else diff --git a/utils/process_result.py b/utils/process_result.py index 4603287bc..2010c09ff 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -1,5 +1,6 @@ import sys import json +import math import os from pathlib import Path @@ -128,8 +129,9 @@ def get_required_env_vars(required_vars): if key.endswith('ms'): data[key.replace('_ms', '')] = float(value) / 1000.0 if 'tpot' in key: + tpot_ms = float(value) data[key.replace('_ms', '').replace( - 'tpot', 'intvty')] = 1000.0 / float(value) + 'tpot', 'intvty')] = 1000.0 / tpot_ms if math.isfinite(tpot_ms) and tpot_ms > 0 else 0.0 print(json.dumps(data, indent=2)) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index e3903c6e6..edeba20be 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -342,6 +342,22 @@ def test_tpot_to_interactivity_conversion(self, tmp_path, single_node_env_vars): assert output_data["intvty_p50"] == pytest.approx(50.0) assert output_data["intvty_p99"] == pytest.approx(20.0) + def test_zero_tpot_interactivity_is_guarded(self, tmp_path, single_node_env_vars): + """Test that zero TPOT fields do not crash interactivity conversion.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 1, + "total_token_throughput": 1000.0, + "output_throughput": 800.0, + "mean_tpot_ms": 0.0, + } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["mean_intvty"] == pytest.approx(0.0) + def test_throughput_per_gpu_single_node(self, tmp_path, single_node_env_vars): """Test throughput per GPU calculation for single node.""" benchmark_result = {