From 3194358bd04eb8fa2afb4a05dbde1b0a56be4f22 Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Mon, 17 Nov 2025 13:43:54 -0800 Subject: [PATCH 1/7] add launch_b200-dgxc.sh --- runners/launch_b200-dgxc.sh | 94 +++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 runners/launch_b200-dgxc.sh diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh new file mode 100644 index 000000000..21a10d48f --- /dev/null +++ b/runners/launch_b200-dgxc.sh @@ -0,0 +1,94 @@ +#!/usr/bin/bash + +HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +PORT=8888 + +# Create unique cache directory based on model parameters +MODEL_NAME=$(basename "$MODEL") + +server_name="bmk-server" +client_name="bmk-client" + +nvidia-smi + +# GPUs must be idle +if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then + echo "[ERROR] GPU busy from previous run"; nvidia-smi; exit 1 +fi + +set -x +# Use --init flag to run an init process (PID 1) inside container for better signal handling and zombie process cleanup +# Ref: https://www.paolomainardi.com/posts/docker-run-init/ + +# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs. +# Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. +# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register + + +docker run --rm -d --init --network host --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e NCCL_GRAPH_REGISTER=0 \ +-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +--entrypoint=/bin/bash \ +$(echo "$IMAGE" | sed 's/#/\//') \ +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ Application\ startup\ complete ]]; then + break + fi +done < <(docker logs -f --tail=0 $server_name 2>&1) + +git clone https://github.com/kimbochen/bench_serving.git + + +if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then + if [[ "$OSL" == "8192" ]]; then + NUM_PROMPTS=$(( CONC * 20 )) + else + NUM_PROMPTS=$(( CONC * 50 )) + fi +else + NUM_PROMPTS=$(( CONC * 10 )) +fi + +set -x +docker run --rm --network host --name $client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ +-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=/bin/bash \ +$(echo "$IMAGE" | sed 's/#/\//') \ +-lc "pip install -q datasets pandas && \ +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://localhost:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $NUM_PROMPTS \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json" + +# Try graceful first +docker stop -t 90 "$server_name" || true +# Wait until it's really dead +docker wait "$server_name" >/dev/null 2>&1 || true +# Force remove if anything lingers +docker rm -f "$server_name" >/dev/null 2>&1 || true + +# Give a moment for GPU processes to fully terminate +sleep 2 +# Verify GPUs are now idle; if not, print diag and (optionally) reset +if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then + echo "[WARN] After stop, GPU still busy:"; nvidia-smi + # Last resort if driver allows and GPUs appear idle otherwise: + #nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true +fi + +nvidia-smi From beb0c9a546d3b25abbc12e36c30819b8900d25bd Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 18 Nov 2025 10:25:30 -0800 Subject: [PATCH 2/7] add gptoss trt docker --- benchmarks/gptoss_fp4_b200_trt_docker.sh | 78 ++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 benchmarks/gptoss_fp4_b200_trt_docker.sh diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh new file mode 100644 index 000000000..c3f971bb0 --- /dev/null +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT + +# GPTOSS TRTLLM Deployment Guide: +# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md + +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +EP_SIZE="1" +MOE_BACKEND="TRTLLM" +DP_ATTENTION=false + +# Higher concurrencies: Concurrency >= 256 +# MoE Backend = CUTLASS +# Use DP attention with expert parallel MoE +if [[ $CONC -ge 256 ]]; then + EP_SIZE="$TP" + DP_ATTENTION=true +fi + +echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" + +EXTRA_CONFIG_FILE="gptoss-fp4.yml" +export TRTLLM_ENABLE_PDL=1 +export NCCL_GRAPH_REGISTER=0 + +cat > $EXTRA_CONFIG_FILE << EOF +cuda_graph_config: + enable_padding: true + max_batch_size: $CONC +enable_attention_dp: $DP_ATTENTION +kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 +print_iter_log: true +stream_interval: 20 +num_postprocess_workers: 4 +moe_config: + backend: $MOE_BACKEND +EOF + +if [[ "$DP_ATTENTION" == "true" ]]; then + cat << EOF >> $EXTRA_CONFIG_FILE +attention_dp_config: + enable_balance: true +EOF +fi + +echo "Generated config file contents:" +cat $EXTRA_CONFIG_FILE + +set -x + +MAX_NUM_TOKENS=20000 + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root \ + trtllm-serve $MODEL --port=$PORT \ + --trust_remote_code \ + --backend=pytorch \ + --max_batch_size 512 \ + --max_seq_len=$MAX_MODEL_LEN \ + --max_num_tokens=$MAX_NUM_TOKENS \ + --tp_size=$TP --ep_size=$EP_SIZE \ + --extra_llm_api_options=$EXTRA_CONFIG_FILE \ No newline at end of file From 1c68a83831c875aec9a2b8183907cc1e0d7d97c1 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 18 Nov 2025 16:41:54 -0800 Subject: [PATCH 3/7] add dgxc runners --- .github/configs/runners.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 245b74762..98080efea 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -24,6 +24,8 @@ b200: - 'b200-nvd_1' - 'b200-nvd_2' - 'b200-nvd_3' +- 'b200-dgxc_1' +- 'b200-dgxc_2' mi300x: - 'mi300x-amd_0' - 'mi300x-amd_1' From 7bc658e6589f28a0994cd136abf433537be8627d Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Mon, 24 Nov 2025 09:12:25 -0800 Subject: [PATCH 4/7] move benchmark client inside OG docker container --- benchmarks/gptoss_fp4_b200_trt_docker.sh | 26 +++++++++++- runners/launch_b200-dgxc.sh | 51 +++++------------------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh index c3f971bb0..1988082e7 100644 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ b/benchmarks/gptoss_fp4_b200_trt_docker.sh @@ -14,6 +14,8 @@ # RESULT_FILENAME # PORT +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md @@ -75,4 +77,26 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ --max_seq_len=$MAX_MODEL_LEN \ --max_num_tokens=$MAX_NUM_TOKENS \ --tp_size=$TP --ep_size=$EP_SIZE \ - --extra_llm_api_options=$EXTRA_CONFIG_FILE \ No newline at end of file + --extra_llm_api_options=$EXTRA_CONFIG_FILE > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Source benchmark utilities +source "$(dirname "$0")/benchmark_lib.sh" + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend openai \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ No newline at end of file diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 21a10d48f..f48372d28 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -8,7 +8,6 @@ PORT=8888 MODEL_NAME=$(basename "$MODEL") server_name="bmk-server" -client_name="bmk-client" nvidia-smi @@ -25,55 +24,27 @@ set -x # Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. # Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register - -docker run --rm -d --init --network host --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ --e NCCL_GRAPH_REGISTER=0 \ --e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ ---entrypoint=/bin/bash \ -$(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ Application\ startup\ complete ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - - if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then if [[ "$OSL" == "8192" ]]; then - NUM_PROMPTS=$(( CONC * 20 )) + export NUM_PROMPTS=$(( CONC * 20 )) else - NUM_PROMPTS=$(( CONC * 50 )) + export NUM_PROMPTS=$(( CONC * 50 )) fi else - NUM_PROMPTS=$(( CONC * 10 )) + export NUM_PROMPTS=$(( CONC * 10 )) fi -set -x -docker run --rm --network host --name $client_name \ +docker run --rm --init --network host --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e NCCL_GRAPH_REGISTER=0 \ +-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ --lc "pip install -q datasets pandas && \ -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://localhost:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $NUM_PROMPTS \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json" +benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh" # Try graceful first docker stop -t 90 "$server_name" || true From a26c125f5bb50d1a1ee76524801072089dc6a871 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Mon, 24 Nov 2025 12:15:38 -0800 Subject: [PATCH 5/7] move SGL to b200-test runners --- .github/configs/nvidia-master.yaml | 4 ++-- .github/configs/runners.yaml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 954abbba2..215d04206 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2,7 +2,7 @@ dsr1-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: nvidia/DeepSeek-R1-0528-FP4-V2 model-prefix: dsr1 - runner: b200 + runner: b200-test precision: fp4 framework: sglang seq-len-configs: @@ -76,7 +76,7 @@ dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: b200 + runner: b200-test precision: fp8 framework: sglang seq-len-configs: diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 98080efea..333fa4d8f 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -24,6 +24,7 @@ b200: - 'b200-nvd_1' - 'b200-nvd_2' - 'b200-nvd_3' +b200-test: - 'b200-dgxc_1' - 'b200-dgxc_2' mi300x: From a53e8d968cfd714d00be1764a54c9e19f36005cd Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Wed, 26 Nov 2025 09:14:36 -0800 Subject: [PATCH 6/7] updated b200 runner label --- .github/configs/nvidia-master.yaml | 4 +- .github/configs/runners.yaml | 1 - benchmarks/gptoss_fp4_b200_trt_docker.sh | 102 ----------------------- 3 files changed, 2 insertions(+), 105 deletions(-) delete mode 100644 benchmarks/gptoss_fp4_b200_trt_docker.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 215d04206..954abbba2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2,7 +2,7 @@ dsr1-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: nvidia/DeepSeek-R1-0528-FP4-V2 model-prefix: dsr1 - runner: b200-test + runner: b200 precision: fp4 framework: sglang seq-len-configs: @@ -76,7 +76,7 @@ dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.5-cu129-amd64 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 - runner: b200-test + runner: b200 precision: fp8 framework: sglang seq-len-configs: diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 333fa4d8f..98080efea 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -24,7 +24,6 @@ b200: - 'b200-nvd_1' - 'b200-nvd_2' - 'b200-nvd_3' -b200-test: - 'b200-dgxc_1' - 'b200-dgxc_2' mi300x: diff --git a/benchmarks/gptoss_fp4_b200_trt_docker.sh b/benchmarks/gptoss_fp4_b200_trt_docker.sh deleted file mode 100644 index 1988082e7..000000000 --- a/benchmarks/gptoss_fp4_b200_trt_docker.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -# GPTOSS TRTLLM Deployment Guide: -# https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false - -# Higher concurrencies: Concurrency >= 256 -# MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE -if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true -fi - -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" - -EXTRA_CONFIG_FILE="gptoss-fp4.yml" -export TRTLLM_ENABLE_PDL=1 -export NCCL_GRAPH_REGISTER=0 - -cat > $EXTRA_CONFIG_FILE << EOF -cuda_graph_config: - enable_padding: true - max_batch_size: $CONC -enable_attention_dp: $DP_ATTENTION -kv_cache_config: - dtype: fp8 - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 -print_iter_log: true -stream_interval: 20 -num_postprocess_workers: 4 -moe_config: - backend: $MOE_BACKEND -EOF - -if [[ "$DP_ATTENTION" == "true" ]]; then - cat << EOF >> $EXTRA_CONFIG_FILE -attention_dp_config: - enable_balance: true -EOF -fi - -echo "Generated config file contents:" -cat $EXTRA_CONFIG_FILE - -set -x - -MAX_NUM_TOKENS=20000 - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root \ - trtllm-serve $MODEL --port=$PORT \ - --trust_remote_code \ - --backend=pytorch \ - --max_batch_size 512 \ - --max_seq_len=$MAX_MODEL_LEN \ - --max_num_tokens=$MAX_NUM_TOKENS \ - --tp_size=$TP --ep_size=$EP_SIZE \ - --extra_llm_api_options=$EXTRA_CONFIG_FILE > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Source benchmark utilities -source "$(dirname "$0")/benchmark_lib.sh" - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend openai \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $(( $CONC * 10 )) \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ No newline at end of file From e44c8cb42e1b014719ef5738d1b5edebdc7e8401 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Wed, 26 Nov 2025 12:31:18 -0800 Subject: [PATCH 7/7] Add DP_ATTENTION env variable to b200 runner scripts --- runners/launch_b200-dgxc.sh | 2 +- runners/launch_b200-nvd.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f48372d28..4d8ec0aed 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -38,7 +38,7 @@ docker run --rm --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \ diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index c5216b006..ebfa67458 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -39,7 +39,7 @@ docker run --rm --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \