[Feature] P2pNccl concurrent dispatch mode by simondanielsson · Pull Request #167 · vllm-project/router

simondanielsson · 2026-05-05T09:15:00Z

Purpose

Fixes #168.

Add concurrent dispatch of requests to the P and D instances for the P2pNcclConnector. This removes from the TTFT (1) a full HTTP round-trip and (2) the scheduling overhead on D.

The bulk of this feature was already added in #157, so this PR mostly adds a p2pnccl value to the --kv-connector enum so that concurrent dispatch also works for P2pNccl.

Note: This is branched off of #157, so the diff will be much smaller once that PR is merged.

TTFT of toy proxy vs vllm-router. Qwen3-8b, 1P1D, 1k/1k ISL/OSL, MI300X

Usage

# Start the router in P/D disaggregation mode with the P2pNccl connector.
# The discovery port (36367) is the one the vLLM instances are pointed at via
# "proxy_port" in their --kv-transfer-config.
vllm-router \
--vllm-pd-disaggregation \
--kv-connector p2pnccl \
--vllm-discovery-address "0.0.0.0:36367" \
--policy consistent_hash \
--prefill-policy consistent_hash \
--decode-policy consistent_hash \
--log-level info

and use a --kv-transfer-config like this for vLLM (set "kv_role" to "kv_consumer" on the D side):

{
  "kv_connector": "P2pNcclConnector",
  "kv_role": "kv_producer",
  "kv_buffer_size": "1e10",
  "kv_port": "21001",
  "kv_connector_extra_config": {
    "proxy_ip": "${PREFILL_IP}",
    "proxy_port": "36367",
    "http_port": "8100"
  }
}

Test Plan

Compare performance of Qwen3-8b using vllm-router vs toy proxy on 2xMI300X.

First build the router in this branch

# Build the router image; run this from the directory that contains
# Dockerfile.router, since "." is used as the build context.
docker build \
  --file Dockerfile.router \
  --tag ghcr.io/simondanielsson/vllm-router:p2pnccl \
  .
# Alternatively, skip the build and pull a prebuilt image:
#   docker pull ghcr.io/simondanielsson/vllm-router:p2pnccl

1. Toy proxy

Following https://docs.vllm.ai/en/stable/design/p2p_nccl_connector/#run-1p3d

# P instance (prefill): serves Qwen/Qwen3-8b on port 8100 as the KV producer.
# PREFILL_IP is used as "proxy_ip" in the KV transfer config below.
PREFILL_IP=10.21.9.8
# JSON passed to --kv-transfer-config. The EOF delimiter is unquoted so that
# the shell expands ${PREFILL_IP} inside the heredoc.
KV_CONFIG=$(cat <<EOF
{
  "kv_connector": "P2pNcclConnector",
  "kv_role": "kv_producer",
  "kv_buffer_size": "1e10",
  "kv_port": "21001",
  "kv_connector_extra_config": {
    "proxy_ip": "${PREFILL_IP}",
    "proxy_port": "30001",
    "http_port": "8100"
  }
}
EOF
)
# Run from a directory that contains the vllm/ source tree: the bind mount of
# p2p_nccl_engine.py overlays the local copy of that file onto the one shipped
# in the image. HIP_VISIBLE_DEVICES=0 exposes only device 0 to this instance.
docker run \
  --rm \
  --name p2p-prefill \
  --init --network host --ipc host --privileged \
  --cap-add SYS_PTRACE --security-opt seccomp=unconfined \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  --shm-size 256G \
  --group-add video --group-add render \
  --device /dev/kfd --device /dev/dri --device /dev/infiniband \
  -v /sys:/sys \
  -v "${HOME}/.cache/huggingface:/root/.cache/huggingface" \
  -v "${PWD}/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py:/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py" \
  -e HF_HOME=/root/.cache/huggingface \
  -e HF_HUB_ENABLE_HF_TRANSFER=0 \
  -e NCCL_MIN_NCHANNELS=112 \
  -e NCCL_CUMEM_ENABLE=0 \
  -e VLLM_USE_V1=1 \
  -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \
  -e VLLM_SERVER_DEV_MODE=1 \
  -e VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 \
  -e VLLM_ROCM_USE_AITER=1 \
  -e HIP_VISIBLE_DEVICES=0 \
  -e NCCL_P2P_DISABLE=1 \
  -e NCCL_DEBUG=INFO \
  vllm/vllm-openai-rocm:v0.20.1 \
  Qwen/Qwen3-8b \
    --port 8100 \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.7 \
    --max-num-batched-tokens 32768 \
    --max-model-len 16384 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --enforce-eager \
    --kv-transfer-config "${KV_CONFIG}"

# D instance (decode): serves Qwen/Qwen3-8b on port 8200 as the KV consumer.
# The prefill `docker run` stays in the foreground, so this snippet is normally
# pasted into a second shell where PREFILL_IP has not been assigned. Default it
# here (keeping any value that is already set) so "proxy_ip" is never empty.
PREFILL_IP=${PREFILL_IP:-10.21.9.8}
# JSON passed to --kv-transfer-config. The EOF delimiter is unquoted so that
# the shell expands ${PREFILL_IP} inside the heredoc.
KV_CONFIG=$(cat <<EOF
{
  "kv_connector": "P2pNcclConnector",
  "kv_role": "kv_consumer",
  "kv_buffer_size": "1e10",
  "kv_port": "22001",
  "kv_connector_extra_config": {
    "proxy_ip": "${PREFILL_IP}",
    "proxy_port": "30001",
    "http_port": "8200"
  }
}
EOF
)
# Run from a directory that contains the vllm/ source tree: the bind mount of
# p2p_nccl_engine.py overlays the local copy of that file onto the one shipped
# in the image. HIP_VISIBLE_DEVICES=1 exposes only device 1 to this instance.
docker run \
  --rm \
  --name p2p-decode \
  --init --network host --ipc host --privileged \
  --cap-add SYS_PTRACE --security-opt seccomp=unconfined \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  --shm-size 256G \
  --group-add video --group-add render \
  --device /dev/kfd --device /dev/dri --device /dev/infiniband \
  -v /sys:/sys \
  -v "${HOME}/.cache/huggingface:/root/.cache/huggingface" \
  -v "${PWD}/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py:/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py" \
  -e HF_HOME=/root/.cache/huggingface \
  -e HF_HUB_ENABLE_HF_TRANSFER=0 \
  -e NCCL_MIN_NCHANNELS=112 \
  -e NCCL_CUMEM_ENABLE=0 \
  -e VLLM_USE_V1=1 \
  -e VLLM_ENGINE_READY_TIMEOUT_S=3600 \
  -e VLLM_SERVER_DEV_MODE=1 \
  -e VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1 \
  -e VLLM_ROCM_USE_AITER=1 \
  -e HIP_VISIBLE_DEVICES=1 \
  -e NCCL_P2P_DISABLE=1 \
  -e NCCL_DEBUG=INFO \
  vllm/vllm-openai-rocm:v0.20.1 \
  Qwen/Qwen3-8b \
    --port 8200 \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.7 \
    --max-num-batched-tokens 32768 \
    --max-model-len 16384 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --enforce-eager \
    --kv-transfer-config "${KV_CONFIG}"

# Toy proxy: runs the example P2pNccl xPyD proxy script that ships inside the
# vLLM image, overriding the image entrypoint with bash to launch it.
docker run \
  --rm \
  --name p2p-toy-proxy \
  --network host \
  --entrypoint bash \
  vllm/vllm-openai-rocm:v0.20.1 \
  -c "python3 -u /app/vllm/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py"

# Run the benchmark through http://localhost:10001 for every input-length /
# concurrency combination, saving each report to its own file.
results_dir=results/p2pnccl/toy
# tee does not create missing parent directories, so make sure the target exists.
mkdir -p "${results_dir}"
for input_len in 1000 8000; do
  for concurrency in 8 64 128; do
    # Warm-up and prompt counts scale with the concurrency level.
    docker exec p2p-prefill \
      vllm bench serve \
        --base-url http://localhost:10001 \
        --backend vllm \
        --model Qwen/Qwen3-8b \
        --dataset-name random \
        --random-input-len "${input_len}" \
        --random-output-len 1000 \
        --max-concurrency "${concurrency}" \
        --num-warmups "$((concurrency * 2))" \
        --num-prompts "$((concurrency * 10))" \
        --goodput ttft:1000 \
        --seed 1234 \
    | tee "${results_dir}/bench_input${input_len}_concurrency${concurrency}.txt"
  done
done

2. vllm-router

# Prefill and Decode instances
# Same as above, but set "proxy_port": "36367" in both KV_CONFIGs so that it
# matches the port of the router's --vllm-discovery-address below.

# Run the router with P/D disaggregation and the p2pnccl KV connector.
docker run \
  --name vllm-router \
  --network host \
  --rm \
  ghcr.io/simondanielsson/vllm-router:p2pnccl \
  vllm-router \
  --vllm-pd-disaggregation \
  --kv-connector p2pnccl \
  --vllm-discovery-address "0.0.0.0:36367" \
  --policy consistent_hash \
  --prefill-policy consistent_hash \
  --decode-policy consistent_hash \
  --log-level info


# Benchmark through http://localhost:30000 for every input-length /
# concurrency combination, saving each report to its own file.
results_dir=results/p2pnccl/router
# tee does not create missing parent directories, so make sure the target exists.
mkdir -p "${results_dir}"
for input_len in 1000 8000; do
  for concurrency in 8 64 128; do
    # Warm-up and prompt counts scale with the concurrency level.
    docker exec p2p-prefill \
      vllm bench serve \
        --base-url http://localhost:30000 \
        --backend vllm \
        --model Qwen/Qwen3-8b \
        --dataset-name random \
        --random-input-len "${input_len}" \
        --random-output-len 1000 \
        --max-concurrency "${concurrency}" \
        --num-warmups "$((concurrency * 2))" \
        --num-prompts "$((concurrency * 10))" \
        --goodput ttft:1000 \
        --seed 1234 \
    | tee "${results_dir}/bench_input${input_len}_concurrency${concurrency}.txt"
  done
done

Test Result

router seq: vllm-router with sequential dispatch (no --kv-connector specified)
router conc: vllm-router with concurrent dispatch (added in this PR)
toy: toy proxy

Router	Concurrency	ISL	OSL	Req/s	TTFT (P50/P99) ms	TPOT (P50/P99) ms	ITL (P50/P99) ms
router conc	8	1000	1000	0.76	355.31 / 2016.83	10.08 / 10.47	10.19 / 11.04
router seq	8	1000	1000	0.74	372.81 / 492.01	10.54 / 11.11	10.25 / 14.81
toy	8	1000	1000	0.75	359.32 / 382.84	10.49 / 10.65	10.19 / 11.18
router conc	16	1000	1000	1.44	715.49 / 728.79	10.49 / 10.84	10.30 / 11.32
router seq	16	1000	1000	1.40	695.01 / 748.84	10.77 / 10.89	10.91 / 11.97
toy	16	1000	1000	1.40	692.45 / 1218.71	10.71 / 10.91	10.92 / 12.01
router conc	32	1000	1000	2.60	1324.03 / 1398.90	10.99 / 12.16	10.94 / 16.23
router seq	32	1000	1000	2.58	1315.41 / 1542.22	11.07 / 12.02	11.26 / 14.73
toy	32	1000	1000	2.66	1317.71 / 1495.75	10.69 / 11.02	10.72 / 12.74
router conc	64	1000	1000	3.87	2566.81 / 3226.73	13.70 / 16.31	13.69 / 25.72
router seq	64	1000	1000	4.06	1953.20 / 2760.47	13.71 / 13.81	13.62 / 19.02
toy	64	1000	1000	3.90	2712.48 / 2978.06	13.73 / 14.28	13.71 / 18.98
router conc	128	1000	1000	4.99	5027.41 / 5127.96	20.66 / 23.03	20.71 / 23.90
router seq	128	1000	1000	5.71	1584.55 / 5613.52	20.17 / 20.85	20.35 / 31.28
toy	128	1000	1000	4.06	15147.87 / 27770.89	15.47 / 19.81	14.30 / 25.32
router conc	8	8000	1000	0.59	2974.99 / 3036.85	10.59 / 10.67	10.87 / 11.32
router seq	8	8000	1000	0.61	2396.62 / 3111.41	10.48 / 11.27	10.56 / 16.03
toy	8	8000	1000	0.59	2922.98 / 3316.60	10.67 / 11.78	10.87 / 21.05
router conc	16	8000	1000	0.83	5941.13 / 5975.88	13.33 / 13.83	13.39 / 16.96
router seq	16	8000	1000	0.98	2189.37 / 6090.57	13.47 / 13.62	13.53 / 15.95
toy	16	8000	1000	0.84	5397.78 / 7086.33	13.93 / 16.64	13.41 / 38.68
router conc	32	8000	1000	1.00	11661.23 / 12290.19	20.11 / 31.42	19.83 / 24.29
router seq	32	8000	1000	1.41	1715.28 / 12610.73	19.78 / 20.01	19.84 / 25.14
toy	32	8000	1000	1.04	7466.71 / 16573.74	22.73 / 25.41	19.52 / 40.08

Essential Elements of an Effective PR Description Checklist

The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
The test plan, such as providing test command.
The test results, such as pasting the results comparison before and after, or e2e results