From 2441f1fbbbd2bed783e62e26cbaeb07f558c7a62 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 12:53:45 +0900 Subject: [PATCH 01/10] chore(agentic): annotate CPU DRAM limit comments for Kimi FP4 B200 Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index 366603f45..e2dce7b2a 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -99,6 +99,8 @@ case "$OFFLOADING" in # RSS + page cache. Eager mode (the shortcut form default) is # intentional here per user request — Kimi FP4 on B200 has cleared # the full eager sweep before. + #(srok), internal node limitation + #TOTAL_CPU_DRAM_GB=2500 TOTAL_CPU_DRAM_GB=2500 export VLLM_USE_SIMPLE_KV_OFFLOAD=1 OFFLOAD_ARGS=( @@ -119,6 +121,8 @@ case "$OFFLOADING" in # --kv-offloading-size through vLLM's integrated LMCache convenience # path, which divides the value by TP and then hits a large single-shot # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend. + #(srok), internal node limitation + #TOTAL_CPU_DRAM_GB=2500 TOTAL_CPU_DRAM_GB=2500 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" From 461bbe7f6c99a0a9c7b85e1e5f631d4443204e7d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 13:01:41 +0900 Subject: [PATCH 02/10] fix(agentic): reduce Kimi FP4 B200 CPU DRAM limit to 1500 GB Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index e2dce7b2a..f1111e3d9 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -101,7 +101,7 @@ case "$OFFLOADING" in # the full eager sweep before. #(srok), internal node limitation #TOTAL_CPU_DRAM_GB=2500 - TOTAL_CPU_DRAM_GB=2500 + TOTAL_CPU_DRAM_GB=1500 export VLLM_USE_SIMPLE_KV_OFFLOAD=1 OFFLOAD_ARGS=( --kv_offloading_backend native @@ -123,7 +123,7 @@ case "$OFFLOADING" in # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend. #(srok), internal node limitation #TOTAL_CPU_DRAM_GB=2500 - TOTAL_CPU_DRAM_GB=2500 + TOTAL_CPU_DRAM_GB=1500 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" From 806b3c948bd7951e818dd85c3cce899b1ba5b5fa Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 13:40:00 +0900 Subject: [PATCH 03/10] manual Signed-off-by: seungrokj --- .../single_node/agentic/kimik2.5_fp4_b200.sh | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index f1111e3d9..6cef20bcc 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -90,6 +90,166 @@ wait_for_lmcache_ready() { exit 1 } +write_lmcache_cuda_mp_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/sitecustomize.py" <<'PY' +"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches.""" + +import os +import threading + +if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1": + import builtins + import sys + + _orig_import = builtins.__import__ + + def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: + _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator + + if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False): + return + + _orig_init = _LazyMemoryAllocator.__init__ + _orig_allocate = _LazyMemoryAllocator.allocate + _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate + + def _expand_to(self, target_size: int) -> None: + target_size = min( + self._final_size, + _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), + ) + lock = self._agentic_cuda_demand_expand_lock + with lock: + if target_size <= self._curr_size: + return + + start_size = self._curr_size + while self._curr_size < target_size: + commit_start = self._curr_size + commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) + while self._curr_size < commit_target: + self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) + self._curr_size += self.PIN_CHUNK_SIZE + self._commit_expansion(self._curr_size - commit_start) + + self._log_expansion_progress(self._curr_size - start_size) + + def _retry_with_demand_expansion(self, allocate_once): + obj = allocate_once() + step_gb = float(os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64")) + step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) + + while obj is None and self._curr_size < self._final_size: + _expand_to(self, self._curr_size + step_bytes) + obj = allocate_once() + + return obj + + def _patched_init(self, *args, **kwargs): + _orig_init(self, *args, **kwargs) + self._agentic_cuda_demand_expand_lock = threading.Lock() + + # LMCache MP's upstream LazyMemoryAllocator currently expands to + # the final pinned size in a background thread. On CUDA Kimi TP4, + # vLLM reaches KV-cache registration only after that 1.5 TB pool + # is fully pinned, and the server-side IPC open path can stall + # before acknowledging register_kv_caches. Keep the same final + # capacity, but pin/commit extra host memory only when L1 + # allocations actually need it. + self._stop_expand.set() + self._expand_thread.join() + _lazy_memory_allocator.logger.info( + "Agentic CUDA patch: using demand-driven LMCache pinned " + "memory expansion; final capacity remains %s MB", + self._final_size >> 20, + ) + + def _patched_allocate( + self, + shapes, + dtypes, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), + ) + + def _patched_batched_allocate( + self, + shapes, + dtypes, + batch_size, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_batched_allocate( + self, shapes, dtypes, batch_size, fmt, allocator_type + ), + ) + + _LazyMemoryAllocator.__init__ = _patched_init + _LazyMemoryAllocator.allocate = _patched_allocate + _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate + _LazyMemoryAllocator._agentic_cuda_demand_patch = True + + def _patch_l1_memory_manager(_memory_manager) -> None: + _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) + _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) + if _L1MemoryManager is None or _LazyMemoryAllocator is None: + return + if getattr(_L1MemoryManager, "_agentic_cuda_final_capacity_patch", False): + return + + _orig_get_memory_usage = _L1MemoryManager.get_memory_usage + + def _patched_get_memory_usage(self): + allocator = getattr(self, "_allocator", None) + if isinstance(allocator, _LazyMemoryAllocator): + address_manager = allocator.get_address_manager() + used_size = ( + address_manager.get_heap_size() - address_manager.get_free_size() + ) + return used_size, allocator._final_size + return _orig_get_memory_usage(self) + + _L1MemoryManager.get_memory_usage = _patched_get_memory_usage + _L1MemoryManager._agentic_cuda_final_capacity_patch = True + + def _maybe_patch_lazy_memory_allocator() -> None: + module = sys.modules.get("lmcache.v1.lazy_memory_allocator") + if module is not None and hasattr(module, "LazyMemoryAllocator"): + _patch_lazy_memory_allocator(module) + + def _maybe_patch_l1_memory_manager() -> None: + module = sys.modules.get("lmcache.v1.distributed.memory_manager") + if module is not None and hasattr(module, "L1MemoryManager"): + _patch_l1_memory_manager(module) + + def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0): + module = _orig_import(name, globals, locals, fromlist, level) + if name == "lmcache.v1.lazy_memory_allocator" or ( + name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules + ): + _maybe_patch_lazy_memory_allocator() + if name == "lmcache.v1.distributed.memory_manager" or ( + name.startswith("lmcache") + and "lmcache.v1.distributed.memory_manager" in sys.modules + ): + _maybe_patch_l1_memory_manager() + return module + + builtins.__import__ = _agentic_cuda_import + _maybe_patch_lazy_memory_allocator() + _maybe_patch_l1_memory_manager() +PY +} + case "$OFFLOADING" in none) ;; @@ -114,6 +274,10 @@ case "$OFFLOADING" in unset VLLM_USE_SIMPLE_KV_OFFLOAD agentic_pip_install --quiet --no-cache-dir lmcache + LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch" + write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR" + export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1 + export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode From 18eb2d5b5b438ec95ee964b0ce96d9c285ed7d9d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 16:19:03 +0900 Subject: [PATCH 04/10] manual Signed-off-by: seungrokj --- .../single_node/agentic/kimik2.5_fp4_b200.sh | 230 +++--------------- 1 file changed, 34 insertions(+), 196 deletions(-) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index 6cef20bcc..e972eebf6 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -90,165 +90,42 @@ wait_for_lmcache_ready() { exit 1 } -write_lmcache_cuda_mp_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/sitecustomize.py" <<'PY' -"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches.""" - -import os -import threading - -if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1": - import builtins - import sys - - _orig_import = builtins.__import__ - - def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: - _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator - - if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False): - return - - _orig_init = _LazyMemoryAllocator.__init__ - _orig_allocate = _LazyMemoryAllocator.allocate - _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate - - def _expand_to(self, target_size: int) -> None: - target_size = min( - self._final_size, - _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), - ) - lock = self._agentic_cuda_demand_expand_lock - with lock: - if target_size <= self._curr_size: - return - - start_size = self._curr_size - while self._curr_size < target_size: - commit_start = self._curr_size - commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) - while self._curr_size < commit_target: - self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) - self._curr_size += self.PIN_CHUNK_SIZE - self._commit_expansion(self._curr_size - commit_start) - - self._log_expansion_progress(self._curr_size - start_size) - - def _retry_with_demand_expansion(self, allocate_once): - obj = allocate_once() - step_gb = float(os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64")) - step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) - - while obj is None and self._curr_size < self._final_size: - _expand_to(self, self._curr_size + step_bytes) - obj = allocate_once() - - return obj - - def _patched_init(self, *args, **kwargs): - _orig_init(self, *args, **kwargs) - self._agentic_cuda_demand_expand_lock = threading.Lock() - - # LMCache MP's upstream LazyMemoryAllocator currently expands to - # the final pinned size in a background thread. On CUDA Kimi TP4, - # vLLM reaches KV-cache registration only after that 1.5 TB pool - # is fully pinned, and the server-side IPC open path can stall - # before acknowledging register_kv_caches. Keep the same final - # capacity, but pin/commit extra host memory only when L1 - # allocations actually need it. - self._stop_expand.set() - self._expand_thread.join() - _lazy_memory_allocator.logger.info( - "Agentic CUDA patch: using demand-driven LMCache pinned " - "memory expansion; final capacity remains %s MB", - self._final_size >> 20, - ) - - def _patched_allocate( - self, - shapes, - dtypes, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), - ) - - def _patched_batched_allocate( - self, - shapes, - dtypes, - batch_size, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_batched_allocate( - self, shapes, dtypes, batch_size, fmt, allocator_type - ), - ) - - _LazyMemoryAllocator.__init__ = _patched_init - _LazyMemoryAllocator.allocate = _patched_allocate - _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate - _LazyMemoryAllocator._agentic_cuda_demand_patch = True - - def _patch_l1_memory_manager(_memory_manager) -> None: - _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) - _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) - if _L1MemoryManager is None or _LazyMemoryAllocator is None: - return - if getattr(_L1MemoryManager, "_agentic_cuda_final_capacity_patch", False): - return - - _orig_get_memory_usage = _L1MemoryManager.get_memory_usage - - def _patched_get_memory_usage(self): - allocator = getattr(self, "_allocator", None) - if isinstance(allocator, _LazyMemoryAllocator): - address_manager = allocator.get_address_manager() - used_size = ( - address_manager.get_heap_size() - address_manager.get_free_size() - ) - return used_size, allocator._final_size - return _orig_get_memory_usage(self) - - _L1MemoryManager.get_memory_usage = _patched_get_memory_usage - _L1MemoryManager._agentic_cuda_final_capacity_patch = True - - def _maybe_patch_lazy_memory_allocator() -> None: - module = sys.modules.get("lmcache.v1.lazy_memory_allocator") - if module is not None and hasattr(module, "LazyMemoryAllocator"): - _patch_lazy_memory_allocator(module) - - def _maybe_patch_l1_memory_manager() -> None: - module = sys.modules.get("lmcache.v1.distributed.memory_manager") - if module is not None and hasattr(module, "L1MemoryManager"): - _patch_l1_memory_manager(module) +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation +# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4 +# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory +# trips before the engine starts. Our --gpu-memory-utilization=0.90 already +# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety +# net the estimator provides, so disabling it is redundant rather than +# unsafe. +export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 - def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0): - module = _orig_import(name, globals, locals, fromlist, level) - if name == "lmcache.v1.lazy_memory_allocator" or ( - name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules - ): - _maybe_patch_lazy_memory_allocator() - if name == "lmcache.v1.distributed.memory_manager" or ( - name.startswith("lmcache") - and "lmcache.v1.distributed.memory_manager" in sys.modules - ): - _maybe_patch_l1_memory_manager() - return module +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + --gpu-memory-utilization 0.90 + --max-num-seqs "$CONC" + --reasoning-parser kimi_k2 + --tool-call-parser kimi_k2 + --compilation_config.pass_config.fuse_allreduce_rms true + --kv-cache-dtype fp8 + --max-cudagraph-capture-size 2048 + --stream-interval 20 + --trust-remote-code + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" - builtins.__import__ = _agentic_cuda_import - _maybe_patch_lazy_memory_allocator() - _maybe_patch_l1_memory_manager() -PY -} case "$OFFLOADING" in none) @@ -274,10 +151,6 @@ case "$OFFLOADING" in unset VLLM_USE_SIMPLE_KV_OFFLOAD agentic_pip_install --quiet --no-cache-dir lmcache - LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch" - write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR" - export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1 - export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode @@ -337,41 +210,6 @@ case "$OFFLOADING" in ;; esac -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 -# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation -# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4 -# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory -# trips before the engine starts. Our --gpu-memory-utilization=0.90 already -# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety -# net the estimator provides, so disabling it is redundant rather than -# unsafe. -export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 - -{ set +x; } 2>/dev/null -VLLM_CMD=( - vllm serve "$MODEL" - --host 0.0.0.0 - --port "$PORT" - --tensor-parallel-size="$TP" - --gpu-memory-utilization 0.90 - --max-num-seqs "$CONC" - --reasoning-parser kimi_k2 - --tool-call-parser kimi_k2 - --compilation_config.pass_config.fuse_allreduce_rms true - --kv-cache-dtype fp8 - --max-cudagraph-capture-size 2048 - --stream-interval 20 - --trust-remote-code - "${PREFIX_CACHE_ARGS[@]}" - "${OFFLOAD_ARGS[@]}" -) -printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" -printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" -"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! -echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" From c050d08c350db5e465ec4dd90f4dc80d76920f5a Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 17:47:03 +0900 Subject: [PATCH 05/10] manual Signed-off-by: seungrokj --- .../single_node/agentic/kimik2.5_fp4_b200.sh | 72 +- .../agentic/kimik2.5_fp4_mi355x.sh | 686 +++++++++--------- 2 files changed, 383 insertions(+), 375 deletions(-) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index e972eebf6..f1111e3d9 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -90,43 +90,6 @@ wait_for_lmcache_ready() { exit 1 } -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 -# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation -# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4 -# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory -# trips before the engine starts. Our --gpu-memory-utilization=0.90 already -# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety -# net the estimator provides, so disabling it is redundant rather than -# unsafe. -export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 - -{ set +x; } 2>/dev/null -VLLM_CMD=( - vllm serve "$MODEL" - --host 0.0.0.0 - --port "$PORT" - --tensor-parallel-size="$TP" - --gpu-memory-utilization 0.90 - --max-num-seqs "$CONC" - --reasoning-parser kimi_k2 - --tool-call-parser kimi_k2 - --compilation_config.pass_config.fuse_allreduce_rms true - --kv-cache-dtype fp8 - --max-cudagraph-capture-size 2048 - --stream-interval 20 - --trust-remote-code - "${PREFIX_CACHE_ARGS[@]}" - "${OFFLOAD_ARGS[@]}" -) -printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" -printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" -"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! -echo "Server PID: $SERVER_PID" - - case "$OFFLOADING" in none) ;; @@ -210,6 +173,41 @@ case "$OFFLOADING" in ;; esac +echo "Starting vllm server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation +# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4 +# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory +# trips before the engine starts. Our --gpu-memory-utilization=0.90 already +# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety +# net the estimator provides, so disabling it is redundant rather than +# unsafe. +export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 + +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + --gpu-memory-utilization 0.90 + --max-num-seqs "$CONC" + --reasoning-parser kimi_k2 + --tool-call-parser kimi_k2 + --compilation_config.pass_config.fuse_allreduce_rms true + --kv-cache-dtype fp8 + --max-cudagraph-capture-size 2048 + --stream-interval 20 + --trust-remote-code + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 1e716aa4e..aa4ffd149 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -55,289 +55,289 @@ if [ "${TP}" -lt 8 ]; then export VLLM_ROCM_USE_AITER_RMSNORM=0 fi -write_lmcache_rocm_mp_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/sitecustomize.py" <<'PY' -"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" - -import os -import threading - -if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": - import builtins - import sys - - _orig_import = builtins.__import__ - - def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: - _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator - - if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): - return - - _orig_init = _LazyMemoryAllocator.__init__ - _orig_allocate = _LazyMemoryAllocator.allocate - _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate - - def _expand_to(self, target_size: int) -> None: - target_size = min( - self._final_size, - _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), - ) - lock = self._agentic_rocm_demand_expand_lock - with lock: - if target_size <= self._curr_size: - return - - start_size = self._curr_size - while self._curr_size < target_size: - commit_start = self._curr_size - commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) - while self._curr_size < commit_target: - self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) - self._curr_size += self.PIN_CHUNK_SIZE - self._commit_expansion(self._curr_size - commit_start) - - self._log_expansion_progress(self._curr_size - start_size) - - def _retry_with_demand_expansion(self, allocate_once): - obj = allocate_once() - step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) - step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) - - while obj is None and self._curr_size < self._final_size: - _expand_to(self, self._curr_size + step_bytes) - obj = allocate_once() - - return obj - - def _patched_init(self, *args, **kwargs): - _orig_init(self, *args, **kwargs) - self._agentic_rocm_demand_expand_lock = threading.Lock() - - # LMCache MP's upstream LazyMemoryAllocator currently expands to - # the final pinned size in a background thread. On ROCm Kimi TP4, - # vLLM reaches KV-cache registration only after that 2.5 TB pool - # is fully pinned, and the server-side IPC open path can stall - # before acknowledging register_kv_caches. Keep the same final - # capacity, but pin/commit extra host memory only when L1 - # allocations actually need it. - self._stop_expand.set() - self._expand_thread.join() - _lazy_memory_allocator.logger.info( - "Agentic ROCm patch: using demand-driven LMCache pinned " - "memory expansion; final capacity remains %s MB", - self._final_size >> 20, - ) - - def _patched_allocate( - self, - shapes, - dtypes, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), - ) - - def _patched_batched_allocate( - self, - shapes, - dtypes, - batch_size, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_batched_allocate( - self, shapes, dtypes, batch_size, fmt, allocator_type - ), - ) - - _LazyMemoryAllocator.__init__ = _patched_init - _LazyMemoryAllocator.allocate = _patched_allocate - _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate - _LazyMemoryAllocator._agentic_rocm_demand_patch = True - - def _patch_l1_memory_manager(_memory_manager) -> None: - _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) - _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) - if _L1MemoryManager is None or _LazyMemoryAllocator is None: - return - if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): - return - - _orig_get_memory_usage = _L1MemoryManager.get_memory_usage - - def _patched_get_memory_usage(self): - allocator = getattr(self, "_allocator", None) - if isinstance(allocator, _LazyMemoryAllocator): - address_manager = allocator.get_address_manager() - used_size = ( - address_manager.get_heap_size() - address_manager.get_free_size() - ) - return used_size, allocator._final_size - return _orig_get_memory_usage(self) - - _L1MemoryManager.get_memory_usage = _patched_get_memory_usage - _L1MemoryManager._agentic_rocm_final_capacity_patch = True - - def _maybe_patch_lazy_memory_allocator() -> None: - module = sys.modules.get("lmcache.v1.lazy_memory_allocator") - if module is not None and hasattr(module, "LazyMemoryAllocator"): - _patch_lazy_memory_allocator(module) - - def _maybe_patch_l1_memory_manager() -> None: - module = sys.modules.get("lmcache.v1.distributed.memory_manager") - if module is not None and hasattr(module, "L1MemoryManager"): - _patch_l1_memory_manager(module) - - def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): - module = _orig_import(name, globals, locals, fromlist, level) - if name == "lmcache.v1.lazy_memory_allocator" or ( - name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules - ): - _maybe_patch_lazy_memory_allocator() - if name == "lmcache.v1.distributed.memory_manager" or ( - name.startswith("lmcache") - and "lmcache.v1.distributed.memory_manager" in sys.modules - ): - _maybe_patch_l1_memory_manager() - return module - - builtins.__import__ = _agentic_rocm_import - _maybe_patch_lazy_memory_allocator() - _maybe_patch_l1_memory_manager() - -if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": - import torch - import lmcache.non_cuda_equivalents as lmc - - if not hasattr(lmc, "multi_layer_block_kv_transfer"): - _DTYPE_BY_NAME = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32, - } - - def _dtype_from_env() -> torch.dtype: - name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") - try: - return _DTYPE_BY_NAME[name] - except KeyError as exc: - raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc - - def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - block_stride = shape_desc.block_stride_elems or ( - shape_desc.bs * shape_desc.nh * shape_desc.hs - ) - base = lmc._tensor_from_ptr( - ptr, - (shape_desc.nb * block_stride,), - dtype, - device, - ) - return torch.as_strided( - base, - (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), - (block_stride, shape_desc.nh * shape_desc.hs, 1), - ) - - def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - return lmc._tensor_from_ptr( - ptr, - (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), - dtype, - device, - ) - - def multi_layer_block_kv_transfer( - group_kv_pointers, - tmp_buffer_ptrs, - block_ids, - paged_memory_device, - direction, - shape_desc, - lmcache_chunk_size, - gpu_kv_format, - skip_blocks=0, - ) -> None: - # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with - # shape [num_blocks, block_size, hidden_size]. LMCache's Python - # fallback has no block-transfer entrypoint yet, so implement the - # same gather/scatter contract with torch indexing on ROCm. - if shape_desc.kv_size != 1: - raise NotImplementedError( - "ROCm LMCache MP block fallback currently supports MLA KV caches only" - ) - - dtype = _dtype_from_env() - device = ( - paged_memory_device - if isinstance(paged_memory_device, torch.device) - else torch.device(paged_memory_device) - ) - num_layers = int(group_kv_pointers.numel()) - blocks_per_chunk = lmcache_chunk_size // shape_desc.bs - direction_name = getattr(direction, "name", str(direction)) - - for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): - start = chunk_idx * blocks_per_chunk - end = start + blocks_per_chunk - chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) - - dest_slot_offset = 0 - if skip_blocks and chunk_idx == 0: - chunk_blocks = chunk_blocks[int(skip_blocks):] - dest_slot_offset = int(skip_blocks) * shape_desc.bs - if chunk_blocks.numel() == 0: - continue - - num_slots = int(chunk_blocks.numel()) * shape_desc.bs - tmp = _tmp_view( - int(tmp_ptr), - shape_desc, - num_layers, - lmcache_chunk_size, - dtype, - device, - ) - - for layer_idx in range(num_layers): - paged = _paged_view( - int(group_kv_pointers[layer_idx].item()), - shape_desc, - dtype, - device, - ) - tmp_slice = tmp[ - 0, - layer_idx, - dest_slot_offset : dest_slot_offset + num_slots, - :, - ] - if direction_name == "D2H": - gathered = paged.index_select(0, chunk_blocks).reshape( - num_slots, shape_desc.nh * shape_desc.hs - ) - tmp_slice.copy_(gathered) - elif direction_name == "H2D": - src = tmp_slice.reshape( - int(chunk_blocks.numel()), - shape_desc.bs, - shape_desc.nh * shape_desc.hs, - ) - paged.index_copy_(0, chunk_blocks, src) - else: - raise ValueError(f"Unsupported transfer direction: {direction}") - - lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer -PY -} +#write_lmcache_rocm_mp_patch() { +# local patch_dir="$1" +# mkdir -p "$patch_dir" +# cat > "$patch_dir/sitecustomize.py" <<'PY' +#"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" +# +#import os +#import threading +# +#if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": +# import builtins +# import sys +# +# _orig_import = builtins.__import__ +# +# def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: +# _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator +# +# if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): +# return +# +# _orig_init = _LazyMemoryAllocator.__init__ +# _orig_allocate = _LazyMemoryAllocator.allocate +# _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate +# +# def _expand_to(self, target_size: int) -> None: +# target_size = min( +# self._final_size, +# _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), +# ) +# lock = self._agentic_rocm_demand_expand_lock +# with lock: +# if target_size <= self._curr_size: +# return +# +# start_size = self._curr_size +# while self._curr_size < target_size: +# commit_start = self._curr_size +# commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) +# while self._curr_size < commit_target: +# self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) +# self._curr_size += self.PIN_CHUNK_SIZE +# self._commit_expansion(self._curr_size - commit_start) +# +# self._log_expansion_progress(self._curr_size - start_size) +# +# def _retry_with_demand_expansion(self, allocate_once): +# obj = allocate_once() +# step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) +# step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) +# +# while obj is None and self._curr_size < self._final_size: +# _expand_to(self, self._curr_size + step_bytes) +# obj = allocate_once() +# +# return obj +# +# def _patched_init(self, *args, **kwargs): +# _orig_init(self, *args, **kwargs) +# self._agentic_rocm_demand_expand_lock = threading.Lock() +# +# # LMCache MP's upstream LazyMemoryAllocator currently expands to +# # the final pinned size in a background thread. On ROCm Kimi TP4, +# # vLLM reaches KV-cache registration only after that 2.5 TB pool +# # is fully pinned, and the server-side IPC open path can stall +# # before acknowledging register_kv_caches. Keep the same final +# # capacity, but pin/commit extra host memory only when L1 +# # allocations actually need it. +# self._stop_expand.set() +# self._expand_thread.join() +# _lazy_memory_allocator.logger.info( +# "Agentic ROCm patch: using demand-driven LMCache pinned " +# "memory expansion; final capacity remains %s MB", +# self._final_size >> 20, +# ) +# +# def _patched_allocate( +# self, +# shapes, +# dtypes, +# fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, +# allocator_type=None, +# ): +# return _retry_with_demand_expansion( +# self, +# lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), +# ) +# +# def _patched_batched_allocate( +# self, +# shapes, +# dtypes, +# batch_size, +# fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, +# allocator_type=None, +# ): +# return _retry_with_demand_expansion( +# self, +# lambda: _orig_batched_allocate( +# self, shapes, dtypes, batch_size, fmt, allocator_type +# ), +# ) +# +# _LazyMemoryAllocator.__init__ = _patched_init +# _LazyMemoryAllocator.allocate = _patched_allocate +# _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate +# _LazyMemoryAllocator._agentic_rocm_demand_patch = True +# +# def _patch_l1_memory_manager(_memory_manager) -> None: +# _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) +# _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) +# if _L1MemoryManager is None or _LazyMemoryAllocator is None: +# return +# if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): +# return +# +# _orig_get_memory_usage = _L1MemoryManager.get_memory_usage +# +# def _patched_get_memory_usage(self): +# allocator = getattr(self, "_allocator", None) +# if isinstance(allocator, _LazyMemoryAllocator): +# address_manager = allocator.get_address_manager() +# used_size = ( +# address_manager.get_heap_size() - address_manager.get_free_size() +# ) +# return used_size, allocator._final_size +# return _orig_get_memory_usage(self) +# +# _L1MemoryManager.get_memory_usage = _patched_get_memory_usage +# _L1MemoryManager._agentic_rocm_final_capacity_patch = True +# +# def _maybe_patch_lazy_memory_allocator() -> None: +# module = sys.modules.get("lmcache.v1.lazy_memory_allocator") +# if module is not None and hasattr(module, "LazyMemoryAllocator"): +# _patch_lazy_memory_allocator(module) +# +# def _maybe_patch_l1_memory_manager() -> None: +# module = sys.modules.get("lmcache.v1.distributed.memory_manager") +# if module is not None and hasattr(module, "L1MemoryManager"): +# _patch_l1_memory_manager(module) +# +# def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): +# module = _orig_import(name, globals, locals, fromlist, level) +# if name == "lmcache.v1.lazy_memory_allocator" or ( +# name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules +# ): +# _maybe_patch_lazy_memory_allocator() +# if name == "lmcache.v1.distributed.memory_manager" or ( +# name.startswith("lmcache") +# and "lmcache.v1.distributed.memory_manager" in sys.modules +# ): +# _maybe_patch_l1_memory_manager() +# return module +# +# builtins.__import__ = _agentic_rocm_import +# _maybe_patch_lazy_memory_allocator() +# _maybe_patch_l1_memory_manager() +# +#if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": +# import torch +# import lmcache.non_cuda_equivalents as lmc +# +# if not hasattr(lmc, "multi_layer_block_kv_transfer"): +# _DTYPE_BY_NAME = { +# "bfloat16": torch.bfloat16, +# "float16": torch.float16, +# "float32": torch.float32, +# } +# +# def _dtype_from_env() -> torch.dtype: +# name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") +# try: +# return _DTYPE_BY_NAME[name] +# except KeyError as exc: +# raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc +# +# def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: +# block_stride = shape_desc.block_stride_elems or ( +# shape_desc.bs * shape_desc.nh * shape_desc.hs +# ) +# base = lmc._tensor_from_ptr( +# ptr, +# (shape_desc.nb * block_stride,), +# dtype, +# device, +# ) +# return torch.as_strided( +# base, +# (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), +# (block_stride, shape_desc.nh * shape_desc.hs, 1), +# ) +# +# def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: +# return lmc._tensor_from_ptr( +# ptr, +# (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), +# dtype, +# device, +# ) +# +# def multi_layer_block_kv_transfer( +# group_kv_pointers, +# tmp_buffer_ptrs, +# block_ids, +# paged_memory_device, +# direction, +# shape_desc, +# lmcache_chunk_size, +# gpu_kv_format, +# skip_blocks=0, +# ) -> None: +# # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with +# # shape [num_blocks, block_size, hidden_size]. LMCache's Python +# # fallback has no block-transfer entrypoint yet, so implement the +# # same gather/scatter contract with torch indexing on ROCm. +# if shape_desc.kv_size != 1: +# raise NotImplementedError( +# "ROCm LMCache MP block fallback currently supports MLA KV caches only" +# ) +# +# dtype = _dtype_from_env() +# device = ( +# paged_memory_device +# if isinstance(paged_memory_device, torch.device) +# else torch.device(paged_memory_device) +# ) +# num_layers = int(group_kv_pointers.numel()) +# blocks_per_chunk = lmcache_chunk_size // shape_desc.bs +# direction_name = getattr(direction, "name", str(direction)) +# +# for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): +# start = chunk_idx * blocks_per_chunk +# end = start + blocks_per_chunk +# chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) +# +# dest_slot_offset = 0 +# if skip_blocks and chunk_idx == 0: +# chunk_blocks = chunk_blocks[int(skip_blocks):] +# dest_slot_offset = int(skip_blocks) * shape_desc.bs +# if chunk_blocks.numel() == 0: +# continue +# +# num_slots = int(chunk_blocks.numel()) * shape_desc.bs +# tmp = _tmp_view( +# int(tmp_ptr), +# shape_desc, +# num_layers, +# lmcache_chunk_size, +# dtype, +# device, +# ) +# +# for layer_idx in range(num_layers): +# paged = _paged_view( +# int(group_kv_pointers[layer_idx].item()), +# shape_desc, +# dtype, +# device, +# ) +# tmp_slice = tmp[ +# 0, +# layer_idx, +# dest_slot_offset : dest_slot_offset + num_slots, +# :, +# ] +# if direction_name == "D2H": +# gathered = paged.index_select(0, chunk_blocks).reshape( +# num_slots, shape_desc.nh * shape_desc.hs +# ) +# tmp_slice.copy_(gathered) +# elif direction_name == "H2D": +# src = tmp_slice.reshape( +# int(chunk_blocks.numel()), +# shape_desc.bs, +# shape_desc.nh * shape_desc.hs, +# ) +# paged.index_copy_(0, chunk_blocks, src) +# else: +# raise ValueError(f"Unsupported transfer direction: {direction}") +# +# lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer +#PY +#} # Workaround for MEC FW <177 RCCL memory reclaim issue version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') @@ -431,60 +431,70 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - agentic_pip_install --quiet --no-cache-dir lmcache - # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and - # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and - # during Kimi fused-MoE model inspection it imports nixl_ep whenever - # that module is importable, even when this run is not using EP/NIXL - # kernels. The CUDA extension then fails immediately on AMD nodes with - # "ImportError: libcuda.so.1". - # - # LMCache MP also uses CuPy stream APIs while registering vLLM's KV - # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime - # with cudaErrorInsufficientDriver when LMCache touches the stream. Use - # the ROCm 7 CuPy wheel so the same API dispatches through HIP. - python3 -m pip uninstall -y \ - nixl nixl-cu12 nixl-cu13 nixl_ep \ - >/dev/null 2>&1 || true - python3 -m pip uninstall -y \ - cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ - >/dev/null 2>&1 || true - agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 - python3 - <<'PY' -import importlib.util -import sys - -spec = importlib.util.find_spec("nixl_ep") -if spec is not None: - locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) - print( - "Error: nixl_ep is still importable after LMCache install; " - "this ROCm Kimi run would import a CUDA-only nixl_ep module. " - f"location={locations}", - file=sys.stderr, - ) - sys.exit(1) - -try: - from cupy_backends.cuda.api import runtime as cupy_runtime -except Exception as exc: - print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) - sys.exit(1) - -if not getattr(cupy_runtime, "is_hip", False): - print( - "Error: CuPy is still using the CUDA backend after installing " - "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", - file=sys.stderr, - ) - sys.exit(1) -PY - LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" - write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" - export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 - export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 - export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 - export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + #agentic_pip_install --quiet --no-cache-dir lmcache + ## LMCache's current dependency chain can install NVIDIA/CUDA NIXL and + ## CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and + ## during Kimi fused-MoE model inspection it imports nixl_ep whenever + ## that module is importable, even when this run is not using EP/NIXL + ## kernels. The CUDA extension then fails immediately on AMD nodes with + ## "ImportError: libcuda.so.1". + ## + ## LMCache MP also uses CuPy stream APIs while registering vLLM's KV + ## caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime + ## with cudaErrorInsufficientDriver when LMCache touches the stream. Use + ## the ROCm 7 CuPy wheel so the same API dispatches through HIP. + #python3 -m pip uninstall -y \ + # nixl nixl-cu12 nixl-cu13 nixl_ep \ + # >/dev/null 2>&1 || true + #python3 -m pip uninstall -y \ + # cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ + # >/dev/null 2>&1 || true + #agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 + + + +# python3 - <<'PY' +#import importlib.util +#import sys +# +#spec = importlib.util.find_spec("nixl_ep") +#if spec is not None: +# locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) +# print( +# "Error: nixl_ep is still importable after LMCache install; " +# "this ROCm Kimi run would import a CUDA-only nixl_ep module. " +# f"location={locations}", +# file=sys.stderr, +# ) +# sys.exit(1) +# +#try: +# from cupy_backends.cuda.api import runtime as cupy_runtime +#except Exception as exc: +# print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) +# sys.exit(1) +# +#if not getattr(cupy_runtime, "is_hip", False): +# print( +# "Error: CuPy is still using the CUDA backend after installing " +# "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", +# file=sys.stderr, +# ) +# sys.exit(1) +#PY + #LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" + #write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" + #export LMCACHE_ROCM_MP_BLOCK_FALLBACK=0 + #export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 + #export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=0 + #export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + + git clone https://github.com/seungrokj/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV @@ -578,4 +588,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file From 2af7377bae1f849d00e827fe28bf00062f5182b2 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 18:57:51 +0900 Subject: [PATCH 06/10] manual Signed-off-by: seungrokj --- benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index aa4ffd149..ad83d6daa 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -521,6 +521,7 @@ case "$OFFLOADING" in LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=60 echo "Starting LMCache MP server..." LMCACHE_CMD=( From b089e28e1598fc53da56360a5d43709244432c45 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 20:14:49 +0900 Subject: [PATCH 07/10] fix(agentic): add CUDA LMCache MP patch for Kimi FP4 B200 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 6 +- .../agentic/kimik2.5_fp4_mi355x.sh | 342 ------------------ 2 files changed, 4 insertions(+), 344 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d02218f5f..76d380233 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -616,10 +616,12 @@ kimik2.5-fp4-mi355x-vllm-agentic: agentic-coding: - duration: 1800 search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } - - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } + #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } + #- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } + #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } + #- { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index ad83d6daa..e9c036ba3 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -55,290 +55,6 @@ if [ "${TP}" -lt 8 ]; then export VLLM_ROCM_USE_AITER_RMSNORM=0 fi -#write_lmcache_rocm_mp_patch() { -# local patch_dir="$1" -# mkdir -p "$patch_dir" -# cat > "$patch_dir/sitecustomize.py" <<'PY' -#"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" -# -#import os -#import threading -# -#if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": -# import builtins -# import sys -# -# _orig_import = builtins.__import__ -# -# def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: -# _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator -# -# if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): -# return -# -# _orig_init = _LazyMemoryAllocator.__init__ -# _orig_allocate = _LazyMemoryAllocator.allocate -# _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate -# -# def _expand_to(self, target_size: int) -> None: -# target_size = min( -# self._final_size, -# _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), -# ) -# lock = self._agentic_rocm_demand_expand_lock -# with lock: -# if target_size <= self._curr_size: -# return -# -# start_size = self._curr_size -# while self._curr_size < target_size: -# commit_start = self._curr_size -# commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) -# while self._curr_size < commit_target: -# self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) -# self._curr_size += self.PIN_CHUNK_SIZE -# self._commit_expansion(self._curr_size - commit_start) -# -# self._log_expansion_progress(self._curr_size - start_size) -# -# def _retry_with_demand_expansion(self, allocate_once): -# obj = allocate_once() -# step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) -# step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) -# -# while obj is None and self._curr_size < self._final_size: -# _expand_to(self, self._curr_size + step_bytes) -# obj = allocate_once() -# -# return obj -# -# def _patched_init(self, *args, **kwargs): -# _orig_init(self, *args, **kwargs) -# self._agentic_rocm_demand_expand_lock = threading.Lock() -# -# # LMCache MP's upstream LazyMemoryAllocator currently expands to -# # the final pinned size in a background thread. On ROCm Kimi TP4, -# # vLLM reaches KV-cache registration only after that 2.5 TB pool -# # is fully pinned, and the server-side IPC open path can stall -# # before acknowledging register_kv_caches. Keep the same final -# # capacity, but pin/commit extra host memory only when L1 -# # allocations actually need it. -# self._stop_expand.set() -# self._expand_thread.join() -# _lazy_memory_allocator.logger.info( -# "Agentic ROCm patch: using demand-driven LMCache pinned " -# "memory expansion; final capacity remains %s MB", -# self._final_size >> 20, -# ) -# -# def _patched_allocate( -# self, -# shapes, -# dtypes, -# fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, -# allocator_type=None, -# ): -# return _retry_with_demand_expansion( -# self, -# lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), -# ) -# -# def _patched_batched_allocate( -# self, -# shapes, -# dtypes, -# batch_size, -# fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, -# allocator_type=None, -# ): -# return _retry_with_demand_expansion( -# self, -# lambda: _orig_batched_allocate( -# self, shapes, dtypes, batch_size, fmt, allocator_type -# ), -# ) -# -# _LazyMemoryAllocator.__init__ = _patched_init -# _LazyMemoryAllocator.allocate = _patched_allocate -# _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate -# _LazyMemoryAllocator._agentic_rocm_demand_patch = True -# -# def _patch_l1_memory_manager(_memory_manager) -> None: -# _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) -# _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) -# if _L1MemoryManager is None or _LazyMemoryAllocator is None: -# return -# if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): -# return -# -# _orig_get_memory_usage = _L1MemoryManager.get_memory_usage -# -# def _patched_get_memory_usage(self): -# allocator = getattr(self, "_allocator", None) -# if isinstance(allocator, _LazyMemoryAllocator): -# address_manager = allocator.get_address_manager() -# used_size = ( -# address_manager.get_heap_size() - address_manager.get_free_size() -# ) -# return used_size, allocator._final_size -# return _orig_get_memory_usage(self) -# -# _L1MemoryManager.get_memory_usage = _patched_get_memory_usage -# _L1MemoryManager._agentic_rocm_final_capacity_patch = True -# -# def _maybe_patch_lazy_memory_allocator() -> None: -# module = sys.modules.get("lmcache.v1.lazy_memory_allocator") -# if module is not None and hasattr(module, "LazyMemoryAllocator"): -# _patch_lazy_memory_allocator(module) -# -# def _maybe_patch_l1_memory_manager() -> None: -# module = sys.modules.get("lmcache.v1.distributed.memory_manager") -# if module is not None and hasattr(module, "L1MemoryManager"): -# _patch_l1_memory_manager(module) -# -# def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): -# module = _orig_import(name, globals, locals, fromlist, level) -# if name == "lmcache.v1.lazy_memory_allocator" or ( -# name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules -# ): -# _maybe_patch_lazy_memory_allocator() -# if name == "lmcache.v1.distributed.memory_manager" or ( -# name.startswith("lmcache") -# and "lmcache.v1.distributed.memory_manager" in sys.modules -# ): -# _maybe_patch_l1_memory_manager() -# return module -# -# builtins.__import__ = _agentic_rocm_import -# _maybe_patch_lazy_memory_allocator() -# _maybe_patch_l1_memory_manager() -# -#if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": -# import torch -# import lmcache.non_cuda_equivalents as lmc -# -# if not hasattr(lmc, "multi_layer_block_kv_transfer"): -# _DTYPE_BY_NAME = { -# "bfloat16": torch.bfloat16, -# "float16": torch.float16, -# "float32": torch.float32, -# } -# -# def _dtype_from_env() -> torch.dtype: -# name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") -# try: -# return _DTYPE_BY_NAME[name] -# except KeyError as exc: -# raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc -# -# def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: -# block_stride = shape_desc.block_stride_elems or ( -# shape_desc.bs * shape_desc.nh * shape_desc.hs -# ) -# base = lmc._tensor_from_ptr( -# ptr, -# (shape_desc.nb * block_stride,), -# dtype, -# device, -# ) -# return torch.as_strided( -# base, -# (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), -# (block_stride, shape_desc.nh * shape_desc.hs, 1), -# ) -# -# def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: -# return lmc._tensor_from_ptr( -# ptr, -# (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), -# dtype, -# device, -# ) -# -# def multi_layer_block_kv_transfer( -# group_kv_pointers, -# tmp_buffer_ptrs, -# block_ids, -# paged_memory_device, -# direction, -# shape_desc, -# lmcache_chunk_size, -# gpu_kv_format, -# skip_blocks=0, -# ) -> None: -# # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with -# # shape [num_blocks, block_size, hidden_size]. LMCache's Python -# # fallback has no block-transfer entrypoint yet, so implement the -# # same gather/scatter contract with torch indexing on ROCm. -# if shape_desc.kv_size != 1: -# raise NotImplementedError( -# "ROCm LMCache MP block fallback currently supports MLA KV caches only" -# ) -# -# dtype = _dtype_from_env() -# device = ( -# paged_memory_device -# if isinstance(paged_memory_device, torch.device) -# else torch.device(paged_memory_device) -# ) -# num_layers = int(group_kv_pointers.numel()) -# blocks_per_chunk = lmcache_chunk_size // shape_desc.bs -# direction_name = getattr(direction, "name", str(direction)) -# -# for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): -# start = chunk_idx * blocks_per_chunk -# end = start + blocks_per_chunk -# chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) -# -# dest_slot_offset = 0 -# if skip_blocks and chunk_idx == 0: -# chunk_blocks = chunk_blocks[int(skip_blocks):] -# dest_slot_offset = int(skip_blocks) * shape_desc.bs -# if chunk_blocks.numel() == 0: -# continue -# -# num_slots = int(chunk_blocks.numel()) * shape_desc.bs -# tmp = _tmp_view( -# int(tmp_ptr), -# shape_desc, -# num_layers, -# lmcache_chunk_size, -# dtype, -# device, -# ) -# -# for layer_idx in range(num_layers): -# paged = _paged_view( -# int(group_kv_pointers[layer_idx].item()), -# shape_desc, -# dtype, -# device, -# ) -# tmp_slice = tmp[ -# 0, -# layer_idx, -# dest_slot_offset : dest_slot_offset + num_slots, -# :, -# ] -# if direction_name == "D2H": -# gathered = paged.index_select(0, chunk_blocks).reshape( -# num_slots, shape_desc.nh * shape_desc.hs -# ) -# tmp_slice.copy_(gathered) -# elif direction_name == "H2D": -# src = tmp_slice.reshape( -# int(chunk_blocks.numel()), -# shape_desc.bs, -# shape_desc.nh * shape_desc.hs, -# ) -# paged.index_copy_(0, chunk_blocks, src) -# else: -# raise ValueError(f"Unsupported transfer direction: {direction}") -# -# lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer -#PY -#} - # Workaround for MEC FW <177 RCCL memory reclaim issue version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then @@ -431,64 +147,6 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - #agentic_pip_install --quiet --no-cache-dir lmcache - ## LMCache's current dependency chain can install NVIDIA/CUDA NIXL and - ## CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and - ## during Kimi fused-MoE model inspection it imports nixl_ep whenever - ## that module is importable, even when this run is not using EP/NIXL - ## kernels. The CUDA extension then fails immediately on AMD nodes with - ## "ImportError: libcuda.so.1". - ## - ## LMCache MP also uses CuPy stream APIs while registering vLLM's KV - ## caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime - ## with cudaErrorInsufficientDriver when LMCache touches the stream. Use - ## the ROCm 7 CuPy wheel so the same API dispatches through HIP. - #python3 -m pip uninstall -y \ - # nixl nixl-cu12 nixl-cu13 nixl_ep \ - # >/dev/null 2>&1 || true - #python3 -m pip uninstall -y \ - # cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ - # >/dev/null 2>&1 || true - #agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 - - - -# python3 - <<'PY' -#import importlib.util -#import sys -# -#spec = importlib.util.find_spec("nixl_ep") -#if spec is not None: -# locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) -# print( -# "Error: nixl_ep is still importable after LMCache install; " -# "this ROCm Kimi run would import a CUDA-only nixl_ep module. " -# f"location={locations}", -# file=sys.stderr, -# ) -# sys.exit(1) -# -#try: -# from cupy_backends.cuda.api import runtime as cupy_runtime -#except Exception as exc: -# print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) -# sys.exit(1) -# -#if not getattr(cupy_runtime, "is_hip", False): -# print( -# "Error: CuPy is still using the CUDA backend after installing " -# "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", -# file=sys.stderr, -# ) -# sys.exit(1) -#PY - #LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" - #write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" - #export LMCACHE_ROCM_MP_BLOCK_FALLBACK=0 - #export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 - #export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=0 - #export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" - git clone https://github.com/seungrokj/LMCache.git cd LMCache pip install -r requirements/build.txt From 2912288af18bafc218c8da1113e8fba994c66821 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 22:35:37 +0900 Subject: [PATCH 08/10] manual Signed-off-by: seungrokj --- .github/configs/amd-master.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 76d380233..f7f100544 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -616,8 +616,9 @@ kimik2.5-fp4-mi355x-vllm-agentic: agentic-coding: - duration: 1800 search-space: - - { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } - - { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } + - { tp: 8, offloading: lmcache, conc-list: [40] } + #- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } + #- { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } #- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } From 3fa8c2be322b3fe8481375b4940be9f63bdb0a68 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 26 May 2026 22:37:54 +0900 Subject: [PATCH 09/10] manual Signed-off-by: seungrokj --- .github/configs/amd-master.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f7f100544..76d380233 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -616,9 +616,8 @@ kimik2.5-fp4-mi355x-vllm-agentic: agentic-coding: - duration: 1800 search-space: - - { tp: 8, offloading: lmcache, conc-list: [40] } - #- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } - #- { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } + - { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } + - { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } #- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } From 0323ccb9f6b9f40fe8114845221610a6afa0b287 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 27 May 2026 00:30:00 +0900 Subject: [PATCH 10/10] manual Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 6 ++---- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 8 ++------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 76d380233..d02218f5f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -616,12 +616,10 @@ kimik2.5-fp4-mi355x-vllm-agentic: agentic-coding: - duration: 1800 search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } + - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } - #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - #- { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } - #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - #- { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index f1111e3d9..366603f45 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -99,9 +99,7 @@ case "$OFFLOADING" in # RSS + page cache. Eager mode (the shortcut form default) is # intentional here per user request — Kimi FP4 on B200 has cleared # the full eager sweep before. - #(srok), internal node limitation - #TOTAL_CPU_DRAM_GB=2500 - TOTAL_CPU_DRAM_GB=1500 + TOTAL_CPU_DRAM_GB=2500 export VLLM_USE_SIMPLE_KV_OFFLOAD=1 OFFLOAD_ARGS=( --kv_offloading_backend native @@ -121,9 +119,7 @@ case "$OFFLOADING" in # --kv-offloading-size through vLLM's integrated LMCache convenience # path, which divides the value by TP and then hits a large single-shot # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend. - #(srok), internal node limitation - #TOTAL_CPU_DRAM_GB=2500 - TOTAL_CPU_DRAM_GB=1500 + TOTAL_CPU_DRAM_GB=2500 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"