From 2441f1fbbbd2bed783e62e26cbaeb07f558c7a62 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 12:53:45 +0900
Subject: [PATCH 01/10] chore(agentic): annotate CPU DRAM limit comments for
 Kimi FP4 B200

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index 366603f45..e2dce7b2a 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -99,6 +99,8 @@ case "$OFFLOADING" in
         # RSS + page cache. Eager mode (the shortcut form default) is
         # intentional here per user request — Kimi FP4 on B200 has cleared
         # the full eager sweep before.
+        #(srok), internal node limitation
+        #TOTAL_CPU_DRAM_GB=2500
         TOTAL_CPU_DRAM_GB=2500
         export VLLM_USE_SIMPLE_KV_OFFLOAD=1
         OFFLOAD_ARGS=(
@@ -119,6 +121,8 @@ case "$OFFLOADING" in
         # --kv-offloading-size through vLLM's integrated LMCache convenience
         # path, which divides the value by TP and then hits a large single-shot
         # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend.
+        #(srok), internal node limitation
+        #TOTAL_CPU_DRAM_GB=2500
         TOTAL_CPU_DRAM_GB=2500
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"

From 461bbe7f6c99a0a9c7b85e1e5f631d4443204e7d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 13:01:41 +0900
Subject: [PATCH 02/10] fix(agentic): reduce Kimi FP4 B200 CPU DRAM limit to
 1500 GB

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index e2dce7b2a..f1111e3d9 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -101,7 +101,7 @@ case "$OFFLOADING" in
         # the full eager sweep before.
         #(srok), internal node limitation
         #TOTAL_CPU_DRAM_GB=2500
-        TOTAL_CPU_DRAM_GB=2500
+        TOTAL_CPU_DRAM_GB=1500
         export VLLM_USE_SIMPLE_KV_OFFLOAD=1
         OFFLOAD_ARGS=(
             --kv_offloading_backend native
@@ -123,7 +123,7 @@ case "$OFFLOADING" in
         # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend.
         #(srok), internal node limitation
         #TOTAL_CPU_DRAM_GB=2500
-        TOTAL_CPU_DRAM_GB=2500
+        TOTAL_CPU_DRAM_GB=1500
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"
         LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"

From 806b3c948bd7951e818dd85c3cce899b1ba5b5fa Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 13:40:00 +0900
Subject: [PATCH 03/10] feat(agentic): demand-driven LMCache pinning on B200

Signed-off-by: seungrokj <seungrok.jung@amd.com>
---
 .../single_node/agentic/kimik2.5_fp4_b200.sh  | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index f1111e3d9..6cef20bcc 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -90,6 +90,166 @@ wait_for_lmcache_ready() {
     exit 1
 }
 
+write_lmcache_cuda_mp_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/sitecustomize.py" <<'PY'
+"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""
+
+import os
+import threading
+
+if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
+    import builtins
+    import sys
+
+    _orig_import = builtins.__import__
+
+    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
+        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
+
+        if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
+            return
+
+        _orig_init = _LazyMemoryAllocator.__init__
+        _orig_allocate = _LazyMemoryAllocator.allocate
+        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
+
+        def _expand_to(self, target_size: int) -> None:
+            target_size = min(
+                self._final_size,
+                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
+            )
+            lock = self._agentic_cuda_demand_expand_lock
+            with lock:
+                if target_size <= self._curr_size:
+                    return
+
+                start_size = self._curr_size
+                while self._curr_size < target_size:
+                    commit_start = self._curr_size
+                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
+                    while self._curr_size < commit_target:
+                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
+                        self._curr_size += self.PIN_CHUNK_SIZE
+                    self._commit_expansion(self._curr_size - commit_start)
+
+                self._log_expansion_progress(self._curr_size - start_size)
+
+        def _retry_with_demand_expansion(self, allocate_once):
+            obj = allocate_once()
+            step_gb = float(os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64"))
+            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
+
+            while obj is None and self._curr_size < self._final_size:
+                _expand_to(self, self._curr_size + step_bytes)
+                obj = allocate_once()
+
+            return obj
+
+        def _patched_init(self, *args, **kwargs):
+            _orig_init(self, *args, **kwargs)
+            self._agentic_cuda_demand_expand_lock = threading.Lock()
+
+            # LMCache MP's upstream LazyMemoryAllocator currently expands to
+            # the final pinned size in a background thread. On CUDA Kimi TP4,
+            # vLLM reaches KV-cache registration only after that 1.5 TB pool
+            # is fully pinned, and the server-side IPC open path can stall
+            # before acknowledging register_kv_caches. Keep the same final
+            # capacity, but pin/commit extra host memory only when L1
+            # allocations actually need it.
+            self._stop_expand.set()
+            self._expand_thread.join()
+            _lazy_memory_allocator.logger.info(
+                "Agentic CUDA patch: using demand-driven LMCache pinned "
+                "memory expansion; final capacity remains %s MB",
+                self._final_size >> 20,
+            )
+
+        def _patched_allocate(
+            self,
+            shapes,
+            dtypes,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
+            )
+
+        def _patched_batched_allocate(
+            self,
+            shapes,
+            dtypes,
+            batch_size,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_batched_allocate(
+                    self, shapes, dtypes, batch_size, fmt, allocator_type
+                ),
+            )
+
+        _LazyMemoryAllocator.__init__ = _patched_init
+        _LazyMemoryAllocator.allocate = _patched_allocate
+        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
+        _LazyMemoryAllocator._agentic_cuda_demand_patch = True
+
+    def _patch_l1_memory_manager(_memory_manager) -> None:
+        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
+        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
+        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
+            return
+        if getattr(_L1MemoryManager, "_agentic_cuda_final_capacity_patch", False):
+            return
+
+        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
+
+        def _patched_get_memory_usage(self):
+            allocator = getattr(self, "_allocator", None)
+            if isinstance(allocator, _LazyMemoryAllocator):
+                address_manager = allocator.get_address_manager()
+                used_size = (
+                    address_manager.get_heap_size() - address_manager.get_free_size()
+                )
+                return used_size, allocator._final_size
+            return _orig_get_memory_usage(self)
+
+        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
+        _L1MemoryManager._agentic_cuda_final_capacity_patch = True
+
+    def _maybe_patch_lazy_memory_allocator() -> None:
+        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
+        if module is not None and hasattr(module, "LazyMemoryAllocator"):
+            _patch_lazy_memory_allocator(module)
+
+    def _maybe_patch_l1_memory_manager() -> None:
+        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
+        if module is not None and hasattr(module, "L1MemoryManager"):
+            _patch_l1_memory_manager(module)
+
+    def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
+        module = _orig_import(name, globals, locals, fromlist, level)
+        if name == "lmcache.v1.lazy_memory_allocator" or (
+            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
+        ):
+            _maybe_patch_lazy_memory_allocator()
+        if name == "lmcache.v1.distributed.memory_manager" or (
+            name.startswith("lmcache")
+            and "lmcache.v1.distributed.memory_manager" in sys.modules
+        ):
+            _maybe_patch_l1_memory_manager()
+        return module
+
+    builtins.__import__ = _agentic_cuda_import
+    _maybe_patch_lazy_memory_allocator()
+    _maybe_patch_l1_memory_manager()
+PY
+}
+
 case "$OFFLOADING" in
     none)
         ;;
@@ -114,6 +274,10 @@ case "$OFFLOADING" in
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
         agentic_pip_install --quiet --no-cache-dir lmcache
+        LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch"
+        write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR"
+        export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
+        export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode

From 18eb2d5b5b438ec95ee964b0ce96d9c285ed7d9d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 16:19:03 +0900
Subject: [PATCH 04/10] refactor(agentic): drop LMCache pinning patch, launch
 vLLM before offload setup on B200

Signed-off-by: seungrokj <seungrok.jung@amd.com>
---
 .../single_node/agentic/kimik2.5_fp4_b200.sh  | 230 +++---------------
 1 file changed, 34 insertions(+), 196 deletions(-)

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index 6cef20bcc..e972eebf6 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -90,165 +90,42 @@ wait_for_lmcache_ready() {
     exit 1
 }
 
-write_lmcache_cuda_mp_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/sitecustomize.py" <<'PY'
-"""Runtime compatibility for LMCache MP on CUDA Kimi MLA KV caches."""
-
-import os
-import threading
-
-if os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR") == "1":
-    import builtins
-    import sys
-
-    _orig_import = builtins.__import__
-
-    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
-        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
-
-        if getattr(_LazyMemoryAllocator, "_agentic_cuda_demand_patch", False):
-            return
-
-        _orig_init = _LazyMemoryAllocator.__init__
-        _orig_allocate = _LazyMemoryAllocator.allocate
-        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
-
-        def _expand_to(self, target_size: int) -> None:
-            target_size = min(
-                self._final_size,
-                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
-            )
-            lock = self._agentic_cuda_demand_expand_lock
-            with lock:
-                if target_size <= self._curr_size:
-                    return
-
-                start_size = self._curr_size
-                while self._curr_size < target_size:
-                    commit_start = self._curr_size
-                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
-                    while self._curr_size < commit_target:
-                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
-                        self._curr_size += self.PIN_CHUNK_SIZE
-                    self._commit_expansion(self._curr_size - commit_start)
-
-                self._log_expansion_progress(self._curr_size - start_size)
-
-        def _retry_with_demand_expansion(self, allocate_once):
-            obj = allocate_once()
-            step_gb = float(os.environ.get("LMCACHE_CUDA_DEMAND_PINNED_STEP_GB", "64"))
-            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
-
-            while obj is None and self._curr_size < self._final_size:
-                _expand_to(self, self._curr_size + step_bytes)
-                obj = allocate_once()
-
-            return obj
-
-        def _patched_init(self, *args, **kwargs):
-            _orig_init(self, *args, **kwargs)
-            self._agentic_cuda_demand_expand_lock = threading.Lock()
-
-            # LMCache MP's upstream LazyMemoryAllocator currently expands to
-            # the final pinned size in a background thread. On CUDA Kimi TP4,
-            # vLLM reaches KV-cache registration only after that 1.5 TB pool
-            # is fully pinned, and the server-side IPC open path can stall
-            # before acknowledging register_kv_caches. Keep the same final
-            # capacity, but pin/commit extra host memory only when L1
-            # allocations actually need it.
-            self._stop_expand.set()
-            self._expand_thread.join()
-            _lazy_memory_allocator.logger.info(
-                "Agentic CUDA patch: using demand-driven LMCache pinned "
-                "memory expansion; final capacity remains %s MB",
-                self._final_size >> 20,
-            )
-
-        def _patched_allocate(
-            self,
-            shapes,
-            dtypes,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
-            )
-
-        def _patched_batched_allocate(
-            self,
-            shapes,
-            dtypes,
-            batch_size,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_batched_allocate(
-                    self, shapes, dtypes, batch_size, fmt, allocator_type
-                ),
-            )
-
-        _LazyMemoryAllocator.__init__ = _patched_init
-        _LazyMemoryAllocator.allocate = _patched_allocate
-        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
-        _LazyMemoryAllocator._agentic_cuda_demand_patch = True
-
-    def _patch_l1_memory_manager(_memory_manager) -> None:
-        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
-        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
-        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
-            return
-        if getattr(_L1MemoryManager, "_agentic_cuda_final_capacity_patch", False):
-            return
-
-        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
-
-        def _patched_get_memory_usage(self):
-            allocator = getattr(self, "_allocator", None)
-            if isinstance(allocator, _LazyMemoryAllocator):
-                address_manager = allocator.get_address_manager()
-                used_size = (
-                    address_manager.get_heap_size() - address_manager.get_free_size()
-                )
-                return used_size, allocator._final_size
-            return _orig_get_memory_usage(self)
-
-        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
-        _L1MemoryManager._agentic_cuda_final_capacity_patch = True
-
-    def _maybe_patch_lazy_memory_allocator() -> None:
-        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
-        if module is not None and hasattr(module, "LazyMemoryAllocator"):
-            _patch_lazy_memory_allocator(module)
-
-    def _maybe_patch_l1_memory_manager() -> None:
-        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
-        if module is not None and hasattr(module, "L1MemoryManager"):
-            _patch_l1_memory_manager(module)
+echo "Starting vllm server..."
+export TORCH_CUDA_ARCH_LIST="10.0"
+export PYTHONNOUSERSITE=1
+# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
+# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
+# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
+# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
+# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
+# net the estimator provides, so disabling it is redundant rather than
+# unsafe.
+export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
 
-    def _agentic_cuda_import(name, globals=None, locals=None, fromlist=(), level=0):
-        module = _orig_import(name, globals, locals, fromlist, level)
-        if name == "lmcache.v1.lazy_memory_allocator" or (
-            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
-        ):
-            _maybe_patch_lazy_memory_allocator()
-        if name == "lmcache.v1.distributed.memory_manager" or (
-            name.startswith("lmcache")
-            and "lmcache.v1.distributed.memory_manager" in sys.modules
-        ):
-            _maybe_patch_l1_memory_manager()
-        return module
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    --gpu-memory-utilization 0.90
+    --max-num-seqs "$CONC"
+    --reasoning-parser kimi_k2
+    --tool-call-parser kimi_k2
+    --compilation_config.pass_config.fuse_allreduce_rms true
+    --kv-cache-dtype fp8
+    --max-cudagraph-capture-size 2048
+    --stream-interval 20
+    --trust-remote-code
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
 
-    builtins.__import__ = _agentic_cuda_import
-    _maybe_patch_lazy_memory_allocator()
-    _maybe_patch_l1_memory_manager()
-PY
-}
 
 case "$OFFLOADING" in
     none)
@@ -274,10 +151,6 @@ case "$OFFLOADING" in
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
         agentic_pip_install --quiet --no-cache-dir lmcache
-        LMCACHE_CUDA_PATCH_DIR="$RESULT_DIR/lmcache_cuda_patch"
-        write_lmcache_cuda_mp_patch "$LMCACHE_CUDA_PATCH_DIR"
-        export LMCACHE_CUDA_DEMAND_PINNED_ALLOCATOR=1
-        export PYTHONPATH="$LMCACHE_CUDA_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode
@@ -337,41 +210,6 @@ case "$OFFLOADING" in
         ;;
 esac
 
-echo "Starting vllm server..."
-export TORCH_CUDA_ARCH_LIST="10.0"
-export PYTHONNOUSERSITE=1
-# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
-# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
-# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
-# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
-# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
-# net the estimator provides, so disabling it is redundant rather than
-# unsafe.
-export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
-
-{ set +x; } 2>/dev/null
-VLLM_CMD=(
-    vllm serve "$MODEL"
-    --host 0.0.0.0
-    --port "$PORT"
-    --tensor-parallel-size="$TP"
-    --gpu-memory-utilization 0.90
-    --max-num-seqs "$CONC"
-    --reasoning-parser kimi_k2
-    --tool-call-parser kimi_k2
-    --compilation_config.pass_config.fuse_allreduce_rms true
-    --kv-cache-dtype fp8
-    --max-cudagraph-capture-size 2048
-    --stream-interval 20
-    --trust-remote-code
-    "${PREFIX_CACHE_ARGS[@]}"
-    "${OFFLOAD_ARGS[@]}"
-)
-printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
-printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
-"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 

From c050d08c350db5e465ec4dd90f4dc80d76920f5a Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 17:47:03 +0900
Subject: [PATCH 05/10] fix(agentic): launch vLLM after offload setup on B200;
 also modify MI355X script

Signed-off-by: seungrokj <seungrok.jung@amd.com>
---
 .../single_node/agentic/kimik2.5_fp4_b200.sh  |  72 +-
 .../agentic/kimik2.5_fp4_mi355x.sh            | 686 +++++++++---------
 2 files changed, 383 insertions(+), 375 deletions(-)

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index e972eebf6..f1111e3d9 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -90,43 +90,6 @@ wait_for_lmcache_ready() {
     exit 1
 }
 
-echo "Starting vllm server..."
-export TORCH_CUDA_ARCH_LIST="10.0"
-export PYTHONNOUSERSITE=1
-# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
-# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
-# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
-# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
-# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
-# net the estimator provides, so disabling it is redundant rather than
-# unsafe.
-export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
-
-{ set +x; } 2>/dev/null
-VLLM_CMD=(
-    vllm serve "$MODEL"
-    --host 0.0.0.0
-    --port "$PORT"
-    --tensor-parallel-size="$TP"
-    --gpu-memory-utilization 0.90
-    --max-num-seqs "$CONC"
-    --reasoning-parser kimi_k2
-    --tool-call-parser kimi_k2
-    --compilation_config.pass_config.fuse_allreduce_rms true
-    --kv-cache-dtype fp8
-    --max-cudagraph-capture-size 2048
-    --stream-interval 20
-    --trust-remote-code
-    "${PREFIX_CACHE_ARGS[@]}"
-    "${OFFLOAD_ARGS[@]}"
-)
-printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
-printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
-"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-
 case "$OFFLOADING" in
     none)
         ;;
@@ -210,6 +173,41 @@ case "$OFFLOADING" in
         ;;
 esac
 
+echo "Starting vllm server..."
+export TORCH_CUDA_ARCH_LIST="10.0"
+export PYTHONNOUSERSITE=1
+# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
+# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
+# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
+# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
+# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
+# net the estimator provides, so disabling it is redundant rather than
+# unsafe.
+export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
+
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    --gpu-memory-utilization 0.90
+    --max-num-seqs "$CONC"
+    --reasoning-parser kimi_k2
+    --tool-call-parser kimi_k2
+    --compilation_config.pass_config.fuse_allreduce_rms true
+    --kv-cache-dtype fp8
+    --max-cudagraph-capture-size 2048
+    --stream-interval 20
+    --trust-remote-code
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index 1e716aa4e..aa4ffd149 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -55,289 +55,289 @@ if [ "${TP}" -lt 8 ]; then
   export VLLM_ROCM_USE_AITER_RMSNORM=0
 fi
 
-write_lmcache_rocm_mp_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/sitecustomize.py" <<'PY'
-"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
-
-import os
-import threading
-
-if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
-    import builtins
-    import sys
-
-    _orig_import = builtins.__import__
-
-    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
-        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
-
-        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
-            return
-
-        _orig_init = _LazyMemoryAllocator.__init__
-        _orig_allocate = _LazyMemoryAllocator.allocate
-        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
-
-        def _expand_to(self, target_size: int) -> None:
-            target_size = min(
-                self._final_size,
-                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
-            )
-            lock = self._agentic_rocm_demand_expand_lock
-            with lock:
-                if target_size <= self._curr_size:
-                    return
-
-                start_size = self._curr_size
-                while self._curr_size < target_size:
-                    commit_start = self._curr_size
-                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
-                    while self._curr_size < commit_target:
-                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
-                        self._curr_size += self.PIN_CHUNK_SIZE
-                    self._commit_expansion(self._curr_size - commit_start)
-
-                self._log_expansion_progress(self._curr_size - start_size)
-
-        def _retry_with_demand_expansion(self, allocate_once):
-            obj = allocate_once()
-            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
-            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
-
-            while obj is None and self._curr_size < self._final_size:
-                _expand_to(self, self._curr_size + step_bytes)
-                obj = allocate_once()
-
-            return obj
-
-        def _patched_init(self, *args, **kwargs):
-            _orig_init(self, *args, **kwargs)
-            self._agentic_rocm_demand_expand_lock = threading.Lock()
-
-            # LMCache MP's upstream LazyMemoryAllocator currently expands to
-            # the final pinned size in a background thread. On ROCm Kimi TP4,
-            # vLLM reaches KV-cache registration only after that 2.5 TB pool
-            # is fully pinned, and the server-side IPC open path can stall
-            # before acknowledging register_kv_caches. Keep the same final
-            # capacity, but pin/commit extra host memory only when L1
-            # allocations actually need it.
-            self._stop_expand.set()
-            self._expand_thread.join()
-            _lazy_memory_allocator.logger.info(
-                "Agentic ROCm patch: using demand-driven LMCache pinned "
-                "memory expansion; final capacity remains %s MB",
-                self._final_size >> 20,
-            )
-
-        def _patched_allocate(
-            self,
-            shapes,
-            dtypes,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
-            )
-
-        def _patched_batched_allocate(
-            self,
-            shapes,
-            dtypes,
-            batch_size,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_batched_allocate(
-                    self, shapes, dtypes, batch_size, fmt, allocator_type
-                ),
-            )
-
-        _LazyMemoryAllocator.__init__ = _patched_init
-        _LazyMemoryAllocator.allocate = _patched_allocate
-        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
-        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
-
-    def _patch_l1_memory_manager(_memory_manager) -> None:
-        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
-        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
-        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
-            return
-        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
-            return
-
-        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
-
-        def _patched_get_memory_usage(self):
-            allocator = getattr(self, "_allocator", None)
-            if isinstance(allocator, _LazyMemoryAllocator):
-                address_manager = allocator.get_address_manager()
-                used_size = (
-                    address_manager.get_heap_size() - address_manager.get_free_size()
-                )
-                return used_size, allocator._final_size
-            return _orig_get_memory_usage(self)
-
-        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
-        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
-
-    def _maybe_patch_lazy_memory_allocator() -> None:
-        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
-        if module is not None and hasattr(module, "LazyMemoryAllocator"):
-            _patch_lazy_memory_allocator(module)
-
-    def _maybe_patch_l1_memory_manager() -> None:
-        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
-        if module is not None and hasattr(module, "L1MemoryManager"):
-            _patch_l1_memory_manager(module)
-
-    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
-        module = _orig_import(name, globals, locals, fromlist, level)
-        if name == "lmcache.v1.lazy_memory_allocator" or (
-            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
-        ):
-            _maybe_patch_lazy_memory_allocator()
-        if name == "lmcache.v1.distributed.memory_manager" or (
-            name.startswith("lmcache")
-            and "lmcache.v1.distributed.memory_manager" in sys.modules
-        ):
-            _maybe_patch_l1_memory_manager()
-        return module
-
-    builtins.__import__ = _agentic_rocm_import
-    _maybe_patch_lazy_memory_allocator()
-    _maybe_patch_l1_memory_manager()
-
-if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
-    import torch
-    import lmcache.non_cuda_equivalents as lmc
-
-    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
-        _DTYPE_BY_NAME = {
-            "bfloat16": torch.bfloat16,
-            "float16": torch.float16,
-            "float32": torch.float32,
-        }
-
-        def _dtype_from_env() -> torch.dtype:
-            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
-            try:
-                return _DTYPE_BY_NAME[name]
-            except KeyError as exc:
-                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
-
-        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            block_stride = shape_desc.block_stride_elems or (
-                shape_desc.bs * shape_desc.nh * shape_desc.hs
-            )
-            base = lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.nb * block_stride,),
-                dtype,
-                device,
-            )
-            return torch.as_strided(
-                base,
-                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
-                (block_stride, shape_desc.nh * shape_desc.hs, 1),
-            )
-
-        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            return lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
-                dtype,
-                device,
-            )
-
-        def multi_layer_block_kv_transfer(
-            group_kv_pointers,
-            tmp_buffer_ptrs,
-            block_ids,
-            paged_memory_device,
-            direction,
-            shape_desc,
-            lmcache_chunk_size,
-            gpu_kv_format,
-            skip_blocks=0,
-        ) -> None:
-            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
-            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
-            # fallback has no block-transfer entrypoint yet, so implement the
-            # same gather/scatter contract with torch indexing on ROCm.
-            if shape_desc.kv_size != 1:
-                raise NotImplementedError(
-                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
-                )
-
-            dtype = _dtype_from_env()
-            device = (
-                paged_memory_device
-                if isinstance(paged_memory_device, torch.device)
-                else torch.device(paged_memory_device)
-            )
-            num_layers = int(group_kv_pointers.numel())
-            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
-            direction_name = getattr(direction, "name", str(direction))
-
-            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
-                start = chunk_idx * blocks_per_chunk
-                end = start + blocks_per_chunk
-                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
-
-                dest_slot_offset = 0
-                if skip_blocks and chunk_idx == 0:
-                    chunk_blocks = chunk_blocks[int(skip_blocks):]
-                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
-                if chunk_blocks.numel() == 0:
-                    continue
-
-                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
-                tmp = _tmp_view(
-                    int(tmp_ptr),
-                    shape_desc,
-                    num_layers,
-                    lmcache_chunk_size,
-                    dtype,
-                    device,
-                )
-
-                for layer_idx in range(num_layers):
-                    paged = _paged_view(
-                        int(group_kv_pointers[layer_idx].item()),
-                        shape_desc,
-                        dtype,
-                        device,
-                    )
-                    tmp_slice = tmp[
-                        0,
-                        layer_idx,
-                        dest_slot_offset : dest_slot_offset + num_slots,
-                        :,
-                    ]
-                    if direction_name == "D2H":
-                        gathered = paged.index_select(0, chunk_blocks).reshape(
-                            num_slots, shape_desc.nh * shape_desc.hs
-                        )
-                        tmp_slice.copy_(gathered)
-                    elif direction_name == "H2D":
-                        src = tmp_slice.reshape(
-                            int(chunk_blocks.numel()),
-                            shape_desc.bs,
-                            shape_desc.nh * shape_desc.hs,
-                        )
-                        paged.index_copy_(0, chunk_blocks, src)
-                    else:
-                        raise ValueError(f"Unsupported transfer direction: {direction}")
-
-        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
-PY
-}
+#write_lmcache_rocm_mp_patch() {
+#    local patch_dir="$1"
+#    mkdir -p "$patch_dir"
+#    cat > "$patch_dir/sitecustomize.py" <<'PY'
+#"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
+#
+#import os
+#import threading
+#
+#if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
+#    import builtins
+#    import sys
+#
+#    _orig_import = builtins.__import__
+#
+#    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
+#        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
+#
+#        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
+#            return
+#
+#        _orig_init = _LazyMemoryAllocator.__init__
+#        _orig_allocate = _LazyMemoryAllocator.allocate
+#        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
+#
+#        def _expand_to(self, target_size: int) -> None:
+#            target_size = min(
+#                self._final_size,
+#                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
+#            )
+#            lock = self._agentic_rocm_demand_expand_lock
+#            with lock:
+#                if target_size <= self._curr_size:
+#                    return
+#
+#                start_size = self._curr_size
+#                while self._curr_size < target_size:
+#                    commit_start = self._curr_size
+#                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
+#                    while self._curr_size < commit_target:
+#                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
+#                        self._curr_size += self.PIN_CHUNK_SIZE
+#                    self._commit_expansion(self._curr_size - commit_start)
+#
+#                self._log_expansion_progress(self._curr_size - start_size)
+#
+#        def _retry_with_demand_expansion(self, allocate_once):
+#            obj = allocate_once()
+#            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
+#            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
+#
+#            while obj is None and self._curr_size < self._final_size:
+#                _expand_to(self, self._curr_size + step_bytes)
+#                obj = allocate_once()
+#
+#            return obj
+#
+#        def _patched_init(self, *args, **kwargs):
+#            _orig_init(self, *args, **kwargs)
+#            self._agentic_rocm_demand_expand_lock = threading.Lock()
+#
+#            # LMCache MP's upstream LazyMemoryAllocator currently expands to
+#            # the final pinned size in a background thread. On ROCm Kimi TP4,
+#            # vLLM reaches KV-cache registration only after that 2.5 TB pool
+#            # is fully pinned, and the server-side IPC open path can stall
+#            # before acknowledging register_kv_caches. Keep the same final
+#            # capacity, but pin/commit extra host memory only when L1
+#            # allocations actually need it.
+#            self._stop_expand.set()
+#            self._expand_thread.join()
+#            _lazy_memory_allocator.logger.info(
+#                "Agentic ROCm patch: using demand-driven LMCache pinned "
+#                "memory expansion; final capacity remains %s MB",
+#                self._final_size >> 20,
+#            )
+#
+#        def _patched_allocate(
+#            self,
+#            shapes,
+#            dtypes,
+#            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+#            allocator_type=None,
+#        ):
+#            return _retry_with_demand_expansion(
+#                self,
+#                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
+#            )
+#
+#        def _patched_batched_allocate(
+#            self,
+#            shapes,
+#            dtypes,
+#            batch_size,
+#            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+#            allocator_type=None,
+#        ):
+#            return _retry_with_demand_expansion(
+#                self,
+#                lambda: _orig_batched_allocate(
+#                    self, shapes, dtypes, batch_size, fmt, allocator_type
+#                ),
+#            )
+#
+#        _LazyMemoryAllocator.__init__ = _patched_init
+#        _LazyMemoryAllocator.allocate = _patched_allocate
+#        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
+#        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
+#
+#    def _patch_l1_memory_manager(_memory_manager) -> None:
+#        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
+#        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
+#        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
+#            return
+#        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
+#            return
+#
+#        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
+#
+#        def _patched_get_memory_usage(self):
+#            allocator = getattr(self, "_allocator", None)
+#            if isinstance(allocator, _LazyMemoryAllocator):
+#                address_manager = allocator.get_address_manager()
+#                used_size = (
+#                    address_manager.get_heap_size() - address_manager.get_free_size()
+#                )
+#                return used_size, allocator._final_size
+#            return _orig_get_memory_usage(self)
+#
+#        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
+#        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
+#
+#    def _maybe_patch_lazy_memory_allocator() -> None:
+#        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
+#        if module is not None and hasattr(module, "LazyMemoryAllocator"):
+#            _patch_lazy_memory_allocator(module)
+#
+#    def _maybe_patch_l1_memory_manager() -> None:
+#        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
+#        if module is not None and hasattr(module, "L1MemoryManager"):
+#            _patch_l1_memory_manager(module)
+#
+#    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
+#        module = _orig_import(name, globals, locals, fromlist, level)
+#        if name == "lmcache.v1.lazy_memory_allocator" or (
+#            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
+#        ):
+#            _maybe_patch_lazy_memory_allocator()
+#        if name == "lmcache.v1.distributed.memory_manager" or (
+#            name.startswith("lmcache")
+#            and "lmcache.v1.distributed.memory_manager" in sys.modules
+#        ):
+#            _maybe_patch_l1_memory_manager()
+#        return module
+#
+#    builtins.__import__ = _agentic_rocm_import
+#    _maybe_patch_lazy_memory_allocator()
+#    _maybe_patch_l1_memory_manager()
+#
+#if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
+#    import torch
+#    import lmcache.non_cuda_equivalents as lmc
+#
+#    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
+#        _DTYPE_BY_NAME = {
+#            "bfloat16": torch.bfloat16,
+#            "float16": torch.float16,
+#            "float32": torch.float32,
+#        }
+#
+#        def _dtype_from_env() -> torch.dtype:
+#            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
+#            try:
+#                return _DTYPE_BY_NAME[name]
+#            except KeyError as exc:
+#                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
+#
+#        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+#            block_stride = shape_desc.block_stride_elems or (
+#                shape_desc.bs * shape_desc.nh * shape_desc.hs
+#            )
+#            base = lmc._tensor_from_ptr(
+#                ptr,
+#                (shape_desc.nb * block_stride,),
+#                dtype,
+#                device,
+#            )
+#            return torch.as_strided(
+#                base,
+#                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
+#                (block_stride, shape_desc.nh * shape_desc.hs, 1),
+#            )
+#
+#        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+#            return lmc._tensor_from_ptr(
+#                ptr,
+#                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
+#                dtype,
+#                device,
+#            )
+#
+#        def multi_layer_block_kv_transfer(
+#            group_kv_pointers,
+#            tmp_buffer_ptrs,
+#            block_ids,
+#            paged_memory_device,
+#            direction,
+#            shape_desc,
+#            lmcache_chunk_size,
+#            gpu_kv_format,
+#            skip_blocks=0,
+#        ) -> None:
+#            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
+#            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
+#            # fallback has no block-transfer entrypoint yet, so implement the
+#            # same gather/scatter contract with torch indexing on ROCm.
+#            if shape_desc.kv_size != 1:
+#                raise NotImplementedError(
+#                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
+#                )
+#
+#            dtype = _dtype_from_env()
+#            device = (
+#                paged_memory_device
+#                if isinstance(paged_memory_device, torch.device)
+#                else torch.device(paged_memory_device)
+#            )
+#            num_layers = int(group_kv_pointers.numel())
+#            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
+#            direction_name = getattr(direction, "name", str(direction))
+#
+#            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
+#                start = chunk_idx * blocks_per_chunk
+#                end = start + blocks_per_chunk
+#                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
+#
+#                dest_slot_offset = 0
+#                if skip_blocks and chunk_idx == 0:
+#                    chunk_blocks = chunk_blocks[int(skip_blocks):]
+#                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
+#                if chunk_blocks.numel() == 0:
+#                    continue
+#
+#                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
+#                tmp = _tmp_view(
+#                    int(tmp_ptr),
+#                    shape_desc,
+#                    num_layers,
+#                    lmcache_chunk_size,
+#                    dtype,
+#                    device,
+#                )
+#
+#                for layer_idx in range(num_layers):
+#                    paged = _paged_view(
+#                        int(group_kv_pointers[layer_idx].item()),
+#                        shape_desc,
+#                        dtype,
+#                        device,
+#                    )
+#                    tmp_slice = tmp[
+#                        0,
+#                        layer_idx,
+#                        dest_slot_offset : dest_slot_offset + num_slots,
+#                        :,
+#                    ]
+#                    if direction_name == "D2H":
+#                        gathered = paged.index_select(0, chunk_blocks).reshape(
+#                            num_slots, shape_desc.nh * shape_desc.hs
+#                        )
+#                        tmp_slice.copy_(gathered)
+#                    elif direction_name == "H2D":
+#                        src = tmp_slice.reshape(
+#                            int(chunk_blocks.numel()),
+#                            shape_desc.bs,
+#                            shape_desc.nh * shape_desc.hs,
+#                        )
+#                        paged.index_copy_(0, chunk_blocks, src)
+#                    else:
+#                        raise ValueError(f"Unsupported transfer direction: {direction}")
+#
+#        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
+#PY
+#}
 
 # Workaround for MEC FW <177 RCCL memory reclaim issue
 version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
@@ -431,60 +431,70 @@ case "$OFFLOADING" in
         { set +x; } 2>/dev/null
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
-        agentic_pip_install --quiet --no-cache-dir lmcache
-        # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
-        # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
-        # during Kimi fused-MoE model inspection it imports nixl_ep whenever
-        # that module is importable, even when this run is not using EP/NIXL
-        # kernels. The CUDA extension then fails immediately on AMD nodes with
-        # "ImportError: libcuda.so.1".
-        #
-        # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
-        # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
-        # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
-        # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
-        python3 -m pip uninstall -y \
-            nixl nixl-cu12 nixl-cu13 nixl_ep \
-            >/dev/null 2>&1 || true
-        python3 -m pip uninstall -y \
-            cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
-            >/dev/null 2>&1 || true
-        agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
-        python3 - <<'PY'
-import importlib.util
-import sys
-
-spec = importlib.util.find_spec("nixl_ep")
-if spec is not None:
-    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
-    print(
-        "Error: nixl_ep is still importable after LMCache install; "
-        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
-        f"location={locations}",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-
-try:
-    from cupy_backends.cuda.api import runtime as cupy_runtime
-except Exception as exc:
-    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
-    sys.exit(1)
-
-if not getattr(cupy_runtime, "is_hip", False):
-    print(
-        "Error: CuPy is still using the CUDA backend after installing "
-        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-PY
-        LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
-        write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
-        export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
-        export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
+        #agentic_pip_install --quiet --no-cache-dir lmcache
+        ## LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
+        ## CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
+        ## during Kimi fused-MoE model inspection it imports nixl_ep whenever
+        ## that module is importable, even when this run is not using EP/NIXL
+        ## kernels. The CUDA extension then fails immediately on AMD nodes with
+        ## "ImportError: libcuda.so.1".
+        ##
+        ## LMCache MP also uses CuPy stream APIs while registering vLLM's KV
+        ## caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
+        ## with cudaErrorInsufficientDriver when LMCache touches the stream. Use
+        ## the ROCm 7 CuPy wheel so the same API dispatches through HIP.
+        #python3 -m pip uninstall -y \
+        #    nixl nixl-cu12 nixl-cu13 nixl_ep \
+        #    >/dev/null 2>&1 || true
+        #python3 -m pip uninstall -y \
+        #    cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
+        #    >/dev/null 2>&1 || true
+        #agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
+
+
+
+#        python3 - <<'PY'
+#import importlib.util
+#import sys
+#
+#spec = importlib.util.find_spec("nixl_ep")
+#if spec is not None:
+#    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
+#    print(
+#        "Error: nixl_ep is still importable after LMCache install; "
+#        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
+#        f"location={locations}",
+#        file=sys.stderr,
+#    )
+#    sys.exit(1)
+#
+#try:
+#    from cupy_backends.cuda.api import runtime as cupy_runtime
+#except Exception as exc:
+#    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
+#    sys.exit(1)
+#
+#if not getattr(cupy_runtime, "is_hip", False):
+#    print(
+#        "Error: CuPy is still using the CUDA backend after installing "
+#        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
+#        file=sys.stderr,
+#    )
+#    sys.exit(1)
+#PY
+        #LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
+        #write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
+        #export LMCACHE_ROCM_MP_BLOCK_FALLBACK=0
+        #export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
+        #export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=0
+        #export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
+
+        git clone https://github.com/seungrokj/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
@@ -578,4 +588,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file

From 2af7377bae1f849d00e827fe28bf00062f5182b2 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 18:57:51 +0900
Subject: [PATCH 06/10] fix(agentic): set LMCACHE_BLOCKING_TIMEOUT_SECS=60 for Kimi FP4 MI355X

Signed-off-by: seungrokj <seungrok.jung@amd.com>
---
 benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index aa4ffd149..ad83d6daa 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -521,6 +521,7 @@ case "$OFFLOADING" in
         LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
         LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
         export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=60
 
         echo "Starting LMCache MP server..."
         LMCACHE_CMD=(

From b089e28e1598fc53da56360a5d43709244432c45 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 20:14:49 +0900
Subject: [PATCH 07/10] chore(agentic): remove commented-out ROCm LMCache patch
 code and limit Kimi FP4 MI355X sweep to lmcache

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |   6 +-
 .../agentic/kimik2.5_fp4_mi355x.sh            | 342 ------------------
 2 files changed, 4 insertions(+), 344 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d02218f5f..76d380233 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -616,10 +616,12 @@ kimik2.5-fp4-mi355x-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
       - { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
+      #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      #- { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
+      #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
+      #- { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
 
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index ad83d6daa..e9c036ba3 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -55,290 +55,6 @@ if [ "${TP}" -lt 8 ]; then
   export VLLM_ROCM_USE_AITER_RMSNORM=0
 fi
 
-#write_lmcache_rocm_mp_patch() {
-#    local patch_dir="$1"
-#    mkdir -p "$patch_dir"
-#    cat > "$patch_dir/sitecustomize.py" <<'PY'
-#"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
-#
-#import os
-#import threading
-#
-#if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
-#    import builtins
-#    import sys
-#
-#    _orig_import = builtins.__import__
-#
-#    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
-#        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
-#
-#        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
-#            return
-#
-#        _orig_init = _LazyMemoryAllocator.__init__
-#        _orig_allocate = _LazyMemoryAllocator.allocate
-#        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
-#
-#        def _expand_to(self, target_size: int) -> None:
-#            target_size = min(
-#                self._final_size,
-#                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
-#            )
-#            lock = self._agentic_rocm_demand_expand_lock
-#            with lock:
-#                if target_size <= self._curr_size:
-#                    return
-#
-#                start_size = self._curr_size
-#                while self._curr_size < target_size:
-#                    commit_start = self._curr_size
-#                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
-#                    while self._curr_size < commit_target:
-#                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
-#                        self._curr_size += self.PIN_CHUNK_SIZE
-#                    self._commit_expansion(self._curr_size - commit_start)
-#
-#                self._log_expansion_progress(self._curr_size - start_size)
-#
-#        def _retry_with_demand_expansion(self, allocate_once):
-#            obj = allocate_once()
-#            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
-#            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
-#
-#            while obj is None and self._curr_size < self._final_size:
-#                _expand_to(self, self._curr_size + step_bytes)
-#                obj = allocate_once()
-#
-#            return obj
-#
-#        def _patched_init(self, *args, **kwargs):
-#            _orig_init(self, *args, **kwargs)
-#            self._agentic_rocm_demand_expand_lock = threading.Lock()
-#
-#            # LMCache MP's upstream LazyMemoryAllocator currently expands to
-#            # the final pinned size in a background thread. On ROCm Kimi TP4,
-#            # vLLM reaches KV-cache registration only after that 2.5 TB pool
-#            # is fully pinned, and the server-side IPC open path can stall
-#            # before acknowledging register_kv_caches. Keep the same final
-#            # capacity, but pin/commit extra host memory only when L1
-#            # allocations actually need it.
-#            self._stop_expand.set()
-#            self._expand_thread.join()
-#            _lazy_memory_allocator.logger.info(
-#                "Agentic ROCm patch: using demand-driven LMCache pinned "
-#                "memory expansion; final capacity remains %s MB",
-#                self._final_size >> 20,
-#            )
-#
-#        def _patched_allocate(
-#            self,
-#            shapes,
-#            dtypes,
-#            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-#            allocator_type=None,
-#        ):
-#            return _retry_with_demand_expansion(
-#                self,
-#                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
-#            )
-#
-#        def _patched_batched_allocate(
-#            self,
-#            shapes,
-#            dtypes,
-#            batch_size,
-#            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-#            allocator_type=None,
-#        ):
-#            return _retry_with_demand_expansion(
-#                self,
-#                lambda: _orig_batched_allocate(
-#                    self, shapes, dtypes, batch_size, fmt, allocator_type
-#                ),
-#            )
-#
-#        _LazyMemoryAllocator.__init__ = _patched_init
-#        _LazyMemoryAllocator.allocate = _patched_allocate
-#        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
-#        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
-#
-#    def _patch_l1_memory_manager(_memory_manager) -> None:
-#        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
-#        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
-#        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
-#            return
-#        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
-#            return
-#
-#        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
-#
-#        def _patched_get_memory_usage(self):
-#            allocator = getattr(self, "_allocator", None)
-#            if isinstance(allocator, _LazyMemoryAllocator):
-#                address_manager = allocator.get_address_manager()
-#                used_size = (
-#                    address_manager.get_heap_size() - address_manager.get_free_size()
-#                )
-#                return used_size, allocator._final_size
-#            return _orig_get_memory_usage(self)
-#
-#        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
-#        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
-#
-#    def _maybe_patch_lazy_memory_allocator() -> None:
-#        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
-#        if module is not None and hasattr(module, "LazyMemoryAllocator"):
-#            _patch_lazy_memory_allocator(module)
-#
-#    def _maybe_patch_l1_memory_manager() -> None:
-#        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
-#        if module is not None and hasattr(module, "L1MemoryManager"):
-#            _patch_l1_memory_manager(module)
-#
-#    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
-#        module = _orig_import(name, globals, locals, fromlist, level)
-#        if name == "lmcache.v1.lazy_memory_allocator" or (
-#            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
-#        ):
-#            _maybe_patch_lazy_memory_allocator()
-#        if name == "lmcache.v1.distributed.memory_manager" or (
-#            name.startswith("lmcache")
-#            and "lmcache.v1.distributed.memory_manager" in sys.modules
-#        ):
-#            _maybe_patch_l1_memory_manager()
-#        return module
-#
-#    builtins.__import__ = _agentic_rocm_import
-#    _maybe_patch_lazy_memory_allocator()
-#    _maybe_patch_l1_memory_manager()
-#
-#if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
-#    import torch
-#    import lmcache.non_cuda_equivalents as lmc
-#
-#    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
-#        _DTYPE_BY_NAME = {
-#            "bfloat16": torch.bfloat16,
-#            "float16": torch.float16,
-#            "float32": torch.float32,
-#        }
-#
-#        def _dtype_from_env() -> torch.dtype:
-#            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
-#            try:
-#                return _DTYPE_BY_NAME[name]
-#            except KeyError as exc:
-#                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
-#
-#        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-#            block_stride = shape_desc.block_stride_elems or (
-#                shape_desc.bs * shape_desc.nh * shape_desc.hs
-#            )
-#            base = lmc._tensor_from_ptr(
-#                ptr,
-#                (shape_desc.nb * block_stride,),
-#                dtype,
-#                device,
-#            )
-#            return torch.as_strided(
-#                base,
-#                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
-#                (block_stride, shape_desc.nh * shape_desc.hs, 1),
-#            )
-#
-#        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-#            return lmc._tensor_from_ptr(
-#                ptr,
-#                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
-#                dtype,
-#                device,
-#            )
-#
-#        def multi_layer_block_kv_transfer(
-#            group_kv_pointers,
-#            tmp_buffer_ptrs,
-#            block_ids,
-#            paged_memory_device,
-#            direction,
-#            shape_desc,
-#            lmcache_chunk_size,
-#            gpu_kv_format,
-#            skip_blocks=0,
-#        ) -> None:
-#            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
-#            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
-#            # fallback has no block-transfer entrypoint yet, so implement the
-#            # same gather/scatter contract with torch indexing on ROCm.
-#            if shape_desc.kv_size != 1:
-#                raise NotImplementedError(
-#                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
-#                )
-#
-#            dtype = _dtype_from_env()
-#            device = (
-#                paged_memory_device
-#                if isinstance(paged_memory_device, torch.device)
-#                else torch.device(paged_memory_device)
-#            )
-#            num_layers = int(group_kv_pointers.numel())
-#            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
-#            direction_name = getattr(direction, "name", str(direction))
-#
-#            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
-#                start = chunk_idx * blocks_per_chunk
-#                end = start + blocks_per_chunk
-#                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
-#
-#                dest_slot_offset = 0
-#                if skip_blocks and chunk_idx == 0:
-#                    chunk_blocks = chunk_blocks[int(skip_blocks):]
-#                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
-#                if chunk_blocks.numel() == 0:
-#                    continue
-#
-#                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
-#                tmp = _tmp_view(
-#                    int(tmp_ptr),
-#                    shape_desc,
-#                    num_layers,
-#                    lmcache_chunk_size,
-#                    dtype,
-#                    device,
-#                )
-#
-#                for layer_idx in range(num_layers):
-#                    paged = _paged_view(
-#                        int(group_kv_pointers[layer_idx].item()),
-#                        shape_desc,
-#                        dtype,
-#                        device,
-#                    )
-#                    tmp_slice = tmp[
-#                        0,
-#                        layer_idx,
-#                        dest_slot_offset : dest_slot_offset + num_slots,
-#                        :,
-#                    ]
-#                    if direction_name == "D2H":
-#                        gathered = paged.index_select(0, chunk_blocks).reshape(
-#                            num_slots, shape_desc.nh * shape_desc.hs
-#                        )
-#                        tmp_slice.copy_(gathered)
-#                    elif direction_name == "H2D":
-#                        src = tmp_slice.reshape(
-#                            int(chunk_blocks.numel()),
-#                            shape_desc.bs,
-#                            shape_desc.nh * shape_desc.hs,
-#                        )
-#                        paged.index_copy_(0, chunk_blocks, src)
-#                    else:
-#                        raise ValueError(f"Unsupported transfer direction: {direction}")
-#
-#        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
-#PY
-#}
-
 # Workaround for MEC FW <177 RCCL memory reclaim issue
 version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
 if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
@@ -431,64 +147,6 @@ case "$OFFLOADING" in
         { set +x; } 2>/dev/null
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
-        #agentic_pip_install --quiet --no-cache-dir lmcache
-        ## LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
-        ## CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
-        ## during Kimi fused-MoE model inspection it imports nixl_ep whenever
-        ## that module is importable, even when this run is not using EP/NIXL
-        ## kernels. The CUDA extension then fails immediately on AMD nodes with
-        ## "ImportError: libcuda.so.1".
-        ##
-        ## LMCache MP also uses CuPy stream APIs while registering vLLM's KV
-        ## caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
-        ## with cudaErrorInsufficientDriver when LMCache touches the stream. Use
-        ## the ROCm 7 CuPy wheel so the same API dispatches through HIP.
-        #python3 -m pip uninstall -y \
-        #    nixl nixl-cu12 nixl-cu13 nixl_ep \
-        #    >/dev/null 2>&1 || true
-        #python3 -m pip uninstall -y \
-        #    cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
-        #    >/dev/null 2>&1 || true
-        #agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
-
-
-
-#        python3 - <<'PY'
-#import importlib.util
-#import sys
-#
-#spec = importlib.util.find_spec("nixl_ep")
-#if spec is not None:
-#    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
-#    print(
-#        "Error: nixl_ep is still importable after LMCache install; "
-#        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
-#        f"location={locations}",
-#        file=sys.stderr,
-#    )
-#    sys.exit(1)
-#
-#try:
-#    from cupy_backends.cuda.api import runtime as cupy_runtime
-#except Exception as exc:
-#    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
-#    sys.exit(1)
-#
-#if not getattr(cupy_runtime, "is_hip", False):
-#    print(
-#        "Error: CuPy is still using the CUDA backend after installing "
-#        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
-#        file=sys.stderr,
-#    )
-#    sys.exit(1)
-#PY
-        #LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
-        #write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
-        #export LMCACHE_ROCM_MP_BLOCK_FALLBACK=0
-        #export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
-        #export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=0
-        #export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
-
         git clone https://github.com/seungrokj/LMCache.git
         cd LMCache
         pip install -r requirements/build.txt 

From 2912288af18bafc218c8da1113e8fba994c66821 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 22:35:37 +0900
Subject: [PATCH 08/10] chore(agentic): narrow Kimi FP4 MI355X sweep to tp8
 lmcache conc 40

Signed-off-by: seungrokj <seungrok.jung@amd.com>
---
 .github/configs/amd-master.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 76d380233..f7f100544 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -616,8 +616,9 @@ kimik2.5-fp4-mi355x-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
-      - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
+      - { tp: 8, offloading: lmcache,  conc-list: [40] }
+      #- { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
+      #- { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
       #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
       #- { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
       #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }

From 3fa8c2be322b3fe8481375b4940be9f63bdb0a68 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 26 May 2026 22:37:54 +0900
Subject: [PATCH 09/10] chore(agentic): restore Kimi FP4 MI355X lmcache sweep

Signed-off-by: seungrokj <seungrok.jung@amd.com>
---
 .github/configs/amd-master.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f7f100544..76d380233 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -616,9 +616,8 @@ kimik2.5-fp4-mi355x-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, offloading: lmcache,  conc-list: [40] }
-      #- { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
-      #- { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
+      - { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
+      - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
       #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
       #- { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
       #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }

From 0323ccb9f6b9f40fe8114845221610a6afa0b287 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 27 May 2026 00:30:00 +0900
Subject: [PATCH 10/10] chore(agentic): restore Kimi FP4 MI355X sweep entries
 and B200 2500 GB CPU DRAM limit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                     | 6 ++----
 benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 8 ++------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 76d380233..d02218f5f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -616,12 +616,10 @@ kimik2.5-fp4-mi355x-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
       - { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
+      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
-      #- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      #- { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
-      #- { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      #- { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
 
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index f1111e3d9..366603f45 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -99,9 +99,7 @@ case "$OFFLOADING" in
         # RSS + page cache. Eager mode (the shortcut form default) is
         # intentional here per user request — Kimi FP4 on B200 has cleared
         # the full eager sweep before.
-        #(srok), internal node limitation
-        #TOTAL_CPU_DRAM_GB=2500
-        TOTAL_CPU_DRAM_GB=1500
+        TOTAL_CPU_DRAM_GB=2500
         export VLLM_USE_SIMPLE_KV_OFFLOAD=1
         OFFLOAD_ARGS=(
             --kv_offloading_backend native
@@ -121,9 +119,7 @@ case "$OFFLOADING" in
         # --kv-offloading-size through vLLM's integrated LMCache convenience
         # path, which divides the value by TP and then hits a large single-shot
         # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend.
-        #(srok), internal node limitation
-        #TOTAL_CPU_DRAM_GB=2500
-        TOTAL_CPU_DRAM_GB=1500
+        TOTAL_CPU_DRAM_GB=2500
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"
         LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"