From 1c844888884757bc9a6d1c4263d526887d1621b3 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:18:19 +0000
Subject: [PATCH 01/19] feat(lora): end-to-end LoRA adapter serving
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full LoRA adapter serving implementation for tokenspeed, including:

## Scheduler (C++)
- Per-adapter prefix cache namespacing: lora_id threaded through
  KVPrefixCache::Match, HybridPrefixCache::Match, and InsertHybridCache
  so each adapter gets its own radix-tree root for prefix reuse
- EvictLoraNamespace: evicts KV pages and removes the virtual root on
  adapter unload

## LoraManager (Python)
- GPU weight pool with LRU eviction and TP-aware weight sharding
- Tiered GPU ↔ CPU ↔ disk pool with async prefetch
- CUDA-graph support: separate no-LoRA and with-LoRA graphs captured;
  segment-grouped Triton kernels for decode
- Attention LoRA: QKV, O-proj with TP sharding and head-dim awareness
- MLP LoRA: gate_proj / up_proj / down_proj targets
- MoE LoRA: sglang_shared_outer and per_expert formats with flat Triton
  kernels that eliminate gather copies; multi-stream prefetch overlaps
  A-shrink with base MoE GEMMs
- LM-head LoRA support

## MoE LoRA kernels (tokenspeed-kernel)
- shared_a_shrink, gate_up_b_expand: sglang_shared gate/up path
- per_expert_a_shrink, per_expert_gate_up_b_expand,
  per_expert_b_down_expand: per-expert format without buffer copies
- shared_b_down_expand: shared-B down projection
- sorted_gate_up_b_expand, sorted_a_down_shrink: TMA prefill path
- Multi-stream prefetch: flat_a_gemm / flat_down_shrink launched on a
  secondary CUDA stream concurrent with base MoE GEMMs

## HTTP / serving
- lora_path accepted on /v1/completions and /v1/chat/completions
- lora_path propagated through GenerateReqInput.__getitem__
- Pack scheduling policy + cold/warm latency benchmark

## Performance (Qwen3.5-35B-A3B TP=2 BS=8)
- sglang_shared_outer n=1: ~962 tok/s (vs 1325 baseline, overhead ~2.25ms)
- per_expert n=1: ~871 tok/s (vs 624 before flat-kernel optimization)
- self_attn n=1: ~988 tok/s

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .github/dco.yml                               |    2 +
 0520_results.md                               |   71 +
 0521_moe_lora_results.md                      |   53 +
 0522_results.md                               |  129 ++
 bench_chunked_sgmv.py                         |  817 ++++++++++
 bench_kernel_opt.py                           |  141 ++
 bench_vs_vllm.py                              |  294 ++++
 benchmark/bench_fused_moe_lora_e2e.py         |  120 ++
 benchmark/bench_fused_moe_lora_kernels.py     |  381 +++++
 benchmark/bench_lm_head_lora_decode.py        |  281 ++++
 benchmark/bench_moe_lora_decode.py            |  380 +++++
 benchmark/bench_moe_lora_retry.py             |  372 +++++
 benchmark/bench_triton_expand_kernel.py       |  192 +++
 benchmark/nsys_decode_target.py               |  126 ++
 benchmark/profile_decode.py                   |  179 +++
 benchmark/profile_lm_head_lora.py             |  130 ++
 benchmark/test_lora_batch.py                  |  126 ++
 benchmark/test_lora_dynamic.py                |  150 ++
 benchmark/test_lora_e2e.py                    |  165 ++
 benchmark/test_lora_eviction_latency.py       |  156 ++
 docs/index.md                                 |    1 +
 docs/lora_current_design.html                 |  925 ++++++++++++
 docs/serving/lora.md                          |   62 +
 docs/tokenspeed_structure.html                |  653 ++++++++
 profile_expand.py                             |  274 ++++
 python/tokenspeed/bench.py                    |    6 +-
 python/tokenspeed/runtime/engine/async_llm.py |    2 +
 .../tokenspeed/runtime/engine/event_loop.py   |   99 +-
 .../runtime/engine/input_processor.py         |   15 +
 python/tokenspeed/runtime/engine/io_struct.py |   51 +
 .../runtime/engine/request_handler.py         |   37 +
 .../engine/scheduler_control_client.py        |   44 +-
 .../runtime/engine/scheduler_utils.py         |   35 +-
 .../tokenspeed/runtime/entrypoints/engine.py  |   30 +
 .../runtime/entrypoints/engine_base.py        |   30 +
 .../tokenspeed/runtime/execution/context.py   |   31 +-
 .../runtime/execution/cuda_graph_wrapper.py   |   98 +-
 .../runtime/execution/model_executor.py       |   75 +
 .../runtime/execution/model_runner.py         |   18 +-
 .../runtime/layers/logits_processor.py        |    9 +-
 ...NVIDIA_H100_80GB_HBM3,dtype=bf16_down.json |   11 +
 .../runtime/layers/moe/backends/base.py       |    4 +
 .../runtime/layers/moe/backends/fp8/triton.py |    6 +
 .../layers/moe/backends/triton_common.py      |   31 +
 .../layers/moe/backends/unquantized/triton.py |    6 +
 .../layers/moe/backends/w8a8_fp8/triton.py    |    6 +
 python/tokenspeed/runtime/layers/moe/layer.py |   17 +
 python/tokenspeed/runtime/lora/__init__.py    |   33 +
 python/tokenspeed/runtime/lora/adapter_io.py  |  142 ++
 python/tokenspeed/runtime/lora/lora_batch.py  |   98 ++
 .../tokenspeed/runtime/lora/lora_buffers.py   |  332 +++++
 python/tokenspeed/runtime/lora/lora_cache.py  |  189 +++
 python/tokenspeed/runtime/lora/lora_config.py |   79 +
 .../tokenspeed/runtime/lora/lora_manager.py   | 1009 +++++++++++++
 .../tokenspeed/runtime/lora/lora_registry.py  |  105 ++
 python/tokenspeed/runtime/lora/moe_lora.py    | 1326 +++++++++++++++++
 python/tokenspeed/runtime/models/qwen3.py     |   33 +-
 python/tokenspeed/runtime/models/qwen3_5.py   |   10 +
 .../tokenspeed/runtime/utils/server_args.py   |  125 ++
 test/runners.py                               |   32 +-
 test/runtime/lora/__init__.py                 |    0
 test/runtime/lora/test_adapter_io.py          |   87 ++
 test/runtime/lora/test_lora_manager.py        |  488 ++++++
 test/runtime/lora/test_lora_registry.py       |  102 ++
 test/runtime/lora/test_lora_request_naming.py |   72 +
 test/runtime/lora/test_moe_lora.py            |  339 +++++
 ...st_qwen3_lm_head_lora_password_adapters.py |  203 +++
 .../test_qwen3_lora_password_adapters.py      |  226 +++
 .../test_qwen3_moe_lora_password_adapters.py  |  212 +++
 ...3_moe_per_expert_lora_password_adapters.py |  199 +++
 .../python/tokenspeed_kernel/__init__.py      |   62 +-
 .../python/tokenspeed_kernel/_triton.py       |   12 +-
 .../ops/attention/__init__.py                 |    4 +-
 .../ops/lora/triton/__init__.py               |   57 +
 .../H100_80GB_HBM3/_lora_expand_kernel.json   |  178 +++
 .../_lora_gate_up_expand_kernel.json          |  266 ++++
 .../_lora_qkv_expand_kernel.json              |  134 ++
 .../H100_80GB_HBM3/_lora_shrink_kernel.json   |  541 +++++++
 .../ops/lora/triton/kernel_utils.py           |   45 +
 .../ops/lora/triton/lora_expand.py            |  223 +++
 .../ops/lora/triton/lora_expand_grouped_v2.py |  236 +++
 .../ops/lora/triton/lora_expand_prefill.py    |  253 ++++
 .../ops/lora/triton/lora_gate_up_expand.py    |  225 +++
 .../ops/lora/triton/lora_qkv_expand.py        |  229 +++
 .../ops/lora/triton/lora_shrink.py            |  229 +++
 .../ops/lora/triton/lora_shrink_prefill.py    |  206 +++
 .../tokenspeed_kernel/ops/lora/triton/tune.py |  254 ++++
 .../ops/lora/triton/tune_sweep.py             |  140 ++
 .../ops/lora/triton/tuning.py                 |  143 ++
 .../ops/moe_lora/__init__.py                  | 1085 ++++++++++++++
 .../test/ops/test_lora_triton.py              |  122 ++
 tokenspeed-scheduler/CMakeLists.txt           |    1 +
 .../bindings/python_module.cpp                |   29 +-
 .../csrc/fsm/forward_events.cpp               |   19 +-
 .../csrc/fsm/forward_events.h                 |   19 +-
 .../hybrid_prefix_cache.cpp                   |   20 +-
 .../hybrid_prefix_cache/hybrid_prefix_cache.h |    5 +-
 .../csrc/resource/kv_prefix_cache/eviction.h  |   28 +
 .../kv_prefix_cache/kv_prefix_cache.cpp       |  107 +-
 .../kv_prefix_cache/kv_prefix_cache.h         |   43 +-
 .../csrc/resource/radix_tree/tree_resource.h  |    3 +
 tokenspeed-scheduler/csrc/resource/types.cpp  |    4 +-
 tokenspeed-scheduler/csrc/resource/types.h    |    5 +
 .../csrc/scheduler/operations/forward.cpp     |   12 +-
 .../csrc/scheduler/outside_event_handler.cpp  |    9 +-
 .../csrc/scheduler/request.cpp                |    1 +
 tokenspeed-scheduler/csrc/scheduler/request.h |    2 +
 .../csrc/scheduler/request_spec.h             |    4 +
 .../csrc/scheduler/scheduler.cpp              |    4 +
 .../csrc/scheduler/scheduler.h                |    3 +
 tokenspeed-scheduler/csrc/scheduler/types.h   |    3 -
 .../python/tokenspeed_scheduler/__init__.py   |    4 -
 .../tests/cpp/test_lora_prefix_cache.cpp      |  182 +++
 113 files changed, 17228 insertions(+), 205 deletions(-)
 create mode 100644 .github/dco.yml
 create mode 100644 0520_results.md
 create mode 100644 0521_moe_lora_results.md
 create mode 100644 0522_results.md
 create mode 100644 bench_chunked_sgmv.py
 create mode 100644 bench_kernel_opt.py
 create mode 100644 bench_vs_vllm.py
 create mode 100644 benchmark/bench_fused_moe_lora_e2e.py
 create mode 100644 benchmark/bench_fused_moe_lora_kernels.py
 create mode 100644 benchmark/bench_lm_head_lora_decode.py
 create mode 100644 benchmark/bench_moe_lora_decode.py
 create mode 100644 benchmark/bench_moe_lora_retry.py
 create mode 100644 benchmark/bench_triton_expand_kernel.py
 create mode 100644 benchmark/nsys_decode_target.py
 create mode 100644 benchmark/profile_decode.py
 create mode 100644 benchmark/profile_lm_head_lora.py
 create mode 100644 benchmark/test_lora_batch.py
 create mode 100644 benchmark/test_lora_dynamic.py
 create mode 100644 benchmark/test_lora_e2e.py
 create mode 100644 benchmark/test_lora_eviction_latency.py
 create mode 100644 docs/lora_current_design.html
 create mode 100644 docs/serving/lora.md
 create mode 100644 docs/tokenspeed_structure.html
 create mode 100644 profile_expand.py
 create mode 100644 python/tokenspeed/runtime/layers/moe/backends/E=128,inter_size=384,hidden_size=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16_down.json
 create mode 100644 python/tokenspeed/runtime/lora/__init__.py
 create mode 100644 python/tokenspeed/runtime/lora/adapter_io.py
 create mode 100644 python/tokenspeed/runtime/lora/lora_batch.py
 create mode 100644 python/tokenspeed/runtime/lora/lora_buffers.py
 create mode 100644 python/tokenspeed/runtime/lora/lora_cache.py
 create mode 100644 python/tokenspeed/runtime/lora/lora_config.py
 create mode 100644 python/tokenspeed/runtime/lora/lora_manager.py
 create mode 100644 python/tokenspeed/runtime/lora/lora_registry.py
 create mode 100644 python/tokenspeed/runtime/lora/moe_lora.py
 create mode 100644 test/runtime/lora/__init__.py
 create mode 100644 test/runtime/lora/test_adapter_io.py
 create mode 100644 test/runtime/lora/test_lora_manager.py
 create mode 100644 test/runtime/lora/test_lora_registry.py
 create mode 100644 test/runtime/lora/test_lora_request_naming.py
 create mode 100644 test/runtime/lora/test_moe_lora.py
 create mode 100644 test/runtime/test_qwen3_lm_head_lora_password_adapters.py
 create mode 100644 test/runtime/test_qwen3_lora_password_adapters.py
 create mode 100644 test/runtime/test_qwen3_moe_lora_password_adapters.py
 create mode 100644 test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/__init__.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_expand_kernel.json
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_gate_up_expand_kernel.json
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_qkv_expand_kernel.json
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_shrink_kernel.json
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/kernel_utils.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_grouped_v2.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_prefill.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_gate_up_expand.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_qkv_expand.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink_prefill.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune_sweep.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tuning.py
 create mode 100644 tokenspeed-kernel/python/tokenspeed_kernel/ops/moe_lora/__init__.py
 create mode 100644 tokenspeed-kernel/test/ops/test_lora_triton.py
 create mode 100644 tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp

diff --git a/.github/dco.yml b/.github/dco.yml
new file mode 100644
index 000000000..7993b95cc
--- /dev/null
+++ b/.github/dco.yml
@@ -0,0 +1,2 @@
+allowRemediationCommits:
+  individual: true
diff --git a/0520_results.md b/0520_results.md
new file mode 100644
index 000000000..c793064ea
--- /dev/null
+++ b/0520_results.md
@@ -0,0 +1,71 @@
+# LoRA Decode Benchmark — 2026-05-20
+
+**Model:** `Qwen/Qwen3-8B` · **bs=8** · **output\_tokens=200** · 5 bench iters · rank=16 · n\_slots=8 · H100 80GB
+**Adapters:** `togethercomputer/Qwen3-8B-LoRA-Password-Adapters`
+**n\_active:** distinct LoRA adapters in the batch (0 = enable\_lora but all requests use base model)
+
+---
+
+## TP1 — All Adapter Types
+
+| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
+|---|---:|---:|---:|
+| baseline (no LoRA) · eager | 40.1 | 53.7 | 429.5 |
+| baseline (no LoRA) · cudagraph | 27.7 | 141.4 | 1131.0 |
+| **attn** · eager · n\_active=0 | 40.6 | 52.9 | 423.2 |
+| **attn** · eager · n\_active=1 | 55.5 | 36.7 | 293.8 |
+| **attn** · eager · n\_active=8 | 56.2 | 35.9 | 287.2 |
+| **attn** · cudagraph · n\_active=0 | 27.2 | 134.7 | 1077.6 |
+| **attn** · cudagraph · n\_active=1 | 35.9 | 133.8 | 1070.2 |
+| **attn** · cudagraph · n\_active=8 | 35.4 | 133.6 | 1068.8 |
+| **mlp** · eager · n\_active=0 | 38.8 | 54.1 | 433.0 |
+| **mlp** · eager · n\_active=1 | 55.2 | 37.1 | 296.7 |
+| **mlp** · eager · n\_active=8 | 55.5 | 36.2 | 289.6 |
+| **mlp** · cudagraph · n\_active=0 | 28.2 | 134.5 | 1075.5 |
+| **mlp** · cudagraph · n\_active=1 | 36.9 | 133.4 | 1066.5 |
+| **mlp** · cudagraph · n\_active=8 | 37.0 | 133.3 | 1066.3 |
+| **lm\_head** · eager · n\_active=0 | 39.4 | 53.5 | 428.2 |
+| **lm\_head** · eager · n\_active=1 | 40.1 | 51.8 | 414.4 |
+| **lm\_head** · eager · n\_active=8 | 40.3 | 51.5 | 411.9 |
+| **lm\_head** · cudagraph · n\_active=0 | 28.1 | 133.9 | 1071.0 |
+| **lm\_head** · cudagraph · n\_active=1 | 28.8 | 134.3 | 1074.2 |
+| **lm\_head** · cudagraph · n\_active=8 | 28.7 | 134.0 | 1071.9 |
+
+---
+
+## TP1 vs TP2 — lm\_head LoRA
+
+| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
+|---|---:|---:|---:|
+| baseline tp1 · eager | 40.1 | 53.9 | 430.9 |
+| baseline tp1 · cudagraph | 28.2 | 141.3 | 1130.4 |
+| baseline tp2 · eager | 97.0 | 47.9 | 382.9 |
+| baseline tp2 · cudagraph | 29.1 | 206.6 | **1651.9** |
+| lm\_head tp1 · cudagraph · n\_active=0 | 28.0 | 134.5 | 1075.7 |
+| lm\_head tp1 · cudagraph · n\_active=1 | 28.8 | 134.3 | 1074.1 |
+| lm\_head tp1 · cudagraph · n\_active=8 | 28.9 | 134.0 | 1071.9 |
+| lm\_head tp2 · cudagraph · n\_active=0 | 29.6 | 194.8 | 1557.7 |
+| lm\_head tp2 · cudagraph · n\_active=1 | 29.7 | 194.6 | 1556.0 |
+| lm\_head tp2 · cudagraph · n\_active=8 | 28.8 | 194.3 | 1553.4 |
+
+---
+
+## Summary
+
+| | eager tput | cudagraph tput | LoRA overhead (cudagraph) | TTFT (cudagraph) |
+|---|---:|---:|---:|---:|
+| baseline tp1 | 429.5 | 1131.0 | — | 27–28 ms |
+| attn LoRA tp1 | ~290 (−32%) | ~1069 (−5%) | −5% | 35–36 ms (+8 ms) |
+| mlp LoRA tp1 | ~293 (−32%) | ~1066 (−6%) | −6% | 37 ms (+9 ms) |
+| lm\_head LoRA tp1 | ~413 (−4%) | ~1073 (−5%) | −5% | 29 ms (+1 ms) |
+| baseline tp2 | 382.9 | 1651.9 | — | 29 ms |
+| lm\_head LoRA tp2 | — | ~1555 (−6%) | −6% | 29–30 ms |
+
+**TP2 vs TP1 cudagraph speedup:** 1.46× (NCCL all-reduce prevents ideal 2×)
+
+### Key findings
+
+- **Eager mode**: attn/mlp LoRA costs ~32% throughput (Triton segmented-GEMM runs 36× per step, once per layer); lm\_head LoRA costs only ~4% (single matmul applied once)
+- **Cudagraph**: all adapter types converge to ~5–6% overhead vs baseline — graph capture amortises per-layer Python launch cost
+- **TTFT**: attn/mlp add ~8–9 ms even with cudagraph (LoRA kernels baked into the prefill graph across 36 layers); lm\_head adds <2 ms
+- **n\_active 1→8**: negligible throughput difference under cudagraph (within 0.3%); in eager, ~2–3% degradation going from 1 to 8 distinct adapters
diff --git a/0521_moe_lora_results.md b/0521_moe_lora_results.md
new file mode 100644
index 000000000..c9b230887
--- /dev/null
+++ b/0521_moe_lora_results.md
@@ -0,0 +1,53 @@
+# MoE LoRA Decode Benchmark — 2026-05-22
+
+**Model:** `Qwen/Qwen3-30B-A3B-Instruct-2507` · **bs=8** · **output_tokens=200** · 5 bench iters · rank=16 · max_loras=2 · H100 80GB
+
+**n_active:** distinct LoRA adapters in batch (0 = enable_lora, all base model)
+
+> MoE LoRA buffers ~1.96 GB/slot; max_loras=2 on 80 GB H100 with 30B model. gpu_util=0.86 for cudagraph+LoRA.
+
+## TP1 Eager
+
+| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
+|---|---:|---:|---:|
+| baseline tp1 eager | 99.5 | 28.5 | 228.1 |
+| baseline triton tp1 eager | 169.9 | 22.9 | 183.2 |
+| per_expert tp1 eager n_active=0 | ERR | ERR | ERR |
+| per_expert tp1 eager n_active=1 | ERR | ERR | ERR |
+| per_expert tp1 eager n_active=2 | ERR | ERR | ERR |
+| sglang_shared tp1 eager n_active=0 | ERR | ERR | ERR |
+| sglang_shared tp1 eager n_active=1 | ERR | ERR | ERR |
+| sglang_shared tp1 eager n_active=2 | ERR | ERR | ERR |
+
+## TP1 CUDA Graph
+
+| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
+|---|---:|---:|---:|
+| baseline tp1 cudagraph | ERR | ERR | ERR |
+| baseline triton tp1 cudagraph | ERR | ERR | ERR |
+| per_expert tp1 cudagraph n_active=0 | ERR | ERR | ERR |
+| per_expert tp1 cudagraph n_active=1 | ERR | ERR | ERR |
+| per_expert tp1 cudagraph n_active=2 | ERR | ERR | ERR |
+| sglang_shared tp1 cudagraph n_active=0 | ERR | ERR | ERR |
+| sglang_shared tp1 cudagraph n_active=1 | ERR | ERR | ERR |
+| sglang_shared tp1 cudagraph n_active=2 | ERR | ERR | ERR |
+
+## TP2 Eager
+
+| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
+|---|---:|---:|---:|
+| baseline tp2 eager | ERR | ERR | ERR |
+| baseline triton tp2 eager | ERR | ERR | ERR |
+| per_expert tp2 eager n_active=0 | ERR | ERR | ERR |
+| per_expert tp2 eager n_active=1 | ERR | ERR | ERR |
+| per_expert tp2 eager n_active=8 | ERR | ERR | ERR |
+
+## TP2 CUDA Graph
+
+| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
+|---|---:|---:|---:|
+| baseline tp2 cudagraph | ERR | ERR | ERR |
+| baseline triton tp2 cudagraph | ERR | ERR | ERR |
+| per_expert tp2 cudagraph n_active=0 | ERR | ERR | ERR |
+| per_expert tp2 cudagraph n_active=1 | ERR | ERR | ERR |
+| per_expert tp2 cudagraph n_active=8 | ERR | ERR | ERR |
diff --git a/0522_results.md b/0522_results.md
new file mode 100644
index 000000000..89da2ba50
--- /dev/null
+++ b/0522_results.md
@@ -0,0 +1,129 @@
+# MoE LoRA Optimization Results — 2026-05-22 (updated 2026-05-23)
+
+**Model:** `Qwen/Qwen3-30B-A3B-Instruct-2507` · **bs=8** · **output\_tokens=200** · H100 80GB
+**LoRA:** rank=16 · max\_loras=2 · TP=2 · CUDA graph mode
+**Adapter format:** sglang\_shared (shared outer A, per-expert B for gate/up; per-expert A, shared B for down)
+
+---
+
+## Final Results (with fused Triton kernels)
+
+| Configuration | tput (tok/s) | step (ms) | overhead |
+|---|---:|---:|---:|
+| **baseline** (no LoRA, triton) | **1394** | **5.74** | — |
+| **n\_active=0** (LoRA loaded, inactive) | 1398 | 5.75 | **+0.01ms ✓** |
+| **n\_active=1** (fused kernels) | **1107** | **7.22** | **+1.48ms** |
+
+n\_active=0 matches baseline — loading an adapter costs nothing in decode.
+n\_active=1 overhead: **1.48ms** = 26% of baseline step time.
+
+---
+
+## Decode Throughput Progress
+
+Starting from 809 tok/s (no Triton fused kernels, plain PyTorch LoRA):
+
+| Optimization | tput | step | overhead | Δ overhead |
+|---|---:|---:|---:|---:|
+| Baseline (no fused kernels) | 809 | 9.89ms | 4.12ms | — |
+| + flat gate/up kernel | 818 | 9.78ms | 3.99ms | −130μs |
+| + flat down shrink kernel | 827 | 9.68ms | 3.93ms | −60μs |
+| + buffer+slot (no gather copies) | 927 | 8.63ms | 2.90ms | −1.03ms |
+| + flat\_a\_gemm + scalings buffer | **1107** | **7.22ms** | **1.50ms** | **−1.40ms** |
+
+**Total: +36.8% tput, −63.6% LoRA overhead (4.12ms → 1.50ms)**
+
+---
+
+## Fused Triton Kernels
+
+All kernels live in `tokenspeed-kernel/python/tokenspeed_kernel/ops/moe_lora/__init__.py`.
+Integration is in `python/tokenspeed/runtime/lora/moe_lora.py`.
+
+### 1. `compact_gate_up_expand` — flat per-expert GEMV (decode gate/up)
+
+Replaces the all-experts GEMM + candidates.gather + route\_delta chain (3 separate ops):
+```python
+# Old (3 ops, reads all 128 experts' B data = 12.6 MB):
+candidates = (lora_a_m @ w13_B.permute(2,0,1).reshape(r, E*I2)).view(m, E, I2)
+delta = candidates.gather(1, safe_ids.unsqueeze(-1).expand(...))
+_add_route_delta(gate_up_output, delta, ...)
+
+# New (1 op, reads only active experts' B = ~5 MB, −60% bandwidth):
+compact_gate_up_expand(lora_a_m, w13_B_buffer, slot_idx, safe_ids, gate_up_output, scalings)
+```
+
+Grid: `(I2//BLOCK_I, m*k)` — one block per flat-pair position. Computes `tok = pid_s // K`
+directly inside the kernel. CUDA-graph compatible: reads `w13_B_buffer[slot]` and
+`scalings[slot]` from device tensors without separate gather copies.
+
+**Microbenchmark:** 20μs vs 69μs (3.4×) for the gate/up B expand step.
+
+### 2. `flat_a_gemm` — A GEMM from buffer
+
+Computes `lora_a_m = hidden @ w13_A_buffer[slot, 0, :, :].T` directly from the weight
+buffer, eliminating:
+- `w13_A = w13_A_buffers[layer][slot_idx].squeeze(0)` — 22μs gather copy
+- `hidden @ w13_A[0].T` — 25μs cuBLAS GEMM (inefficient for m=8)
+
+Grid: `(m, R//BLOCK_R)` — one block per token. With m=8 and R=32 fitting in L1 cache
+across the 8 blocks, the kernel runs in ~5-8μs total.
+
+**Savings:** 47μs/layer × 48 = **2.26ms** isolated.
+
+### 3. `flat_down_shrink` — per-expert shrink from buffer
+
+Replaces `_select_expert_weights(down_A, safe_ids) + einsum("mki,mkri->mkr", ...)`:
+- Avoids the `(m*k, r, INTER)` = 1.5 MB intermediate tensor
+- Reads `down_A_buffer[slot, exp, :, :]` directly for each flat pair
+
+**Microbenchmark:** 23μs vs 54μs (2.4×).
+
+### 4. `flat_down_expand` — shared B expand + scale + add
+
+Fuses `lora_a @ down_B[slot, 0].T × topk_weight × scaling → down_output` in one kernel,
+reading `down_B_buffer[slot]` and `scalings[slot]` directly from device memory.
+
+### Key design decisions
+
+**No gather copies:** All 4 kernels receive the full `(n_slots, ...)` weight buffer and
+a `slot_ptr` GPU scalar. The kernel computes `buffer + slot * stride + ...` internally.
+This eliminates 4 buffer gather copies per layer (previously ~64μs/layer × 48 = **3.08ms**).
+
+**CUDA-graph safe:** `slot_ptr = bi.weight_indices[:1].clamp(0)` is a GPU tensor mutated
+before each `graph.replay()`, so different adapters work without re-capturing the graph.
+
+**Scalings in kernel:** `_flat_gate_up_expand_kernel` and `_flat_down_expand_kernel` load
+`scalings[slot]` from the full `(n_slots,)` scalings buffer, eliminating 2 more
+`scalings[slot_idx]` gather ops per layer (~19μs each × 2 × 48 = **1.82ms**).
+
+---
+
+## Earlier Optimizations (prefill / TTFT)
+
+### Shared A/B fast path (sglang\_shared format)
+When `w13_A.shape[0] == 1` (shared outer), use a single matmul instead of an
+`O(m·k·r·h)` gather tensor. Saves 2.2 GB of intermediate tensor creation per prefill.
+
+### Remove `torch.any(valid)` GPU→CPU sync
+96 GPU→CPU stalls per prefill (48 layers × 2 ops) stalled the CPU-GPU pipeline.
+**Impact: −35ms TTFT** (108ms → 73ms for sglang\_shared n=1 prefill).
+
+### Vectorised scatter operations
+`_add_route_delta` (−56%) and `_route_rows_from_cache` (−68%) replaced boolean-index
+tensor creation with `scatter_` + slice.
+**Impact: −11ms** on route scatter ops in prefill.
+
+### CUDA graph: force has\_active\_lora=True during capture
+During LoRA CUDA graph capture, `has_active_lora=True` and `single_lora_slot=0` are
+forced so LoRA Triton kernels ARE recorded in the decode graph. Dynamic slot selection
+uses `bi.weight_indices[:1].clamp(0)` (GPU tensor updated before each replay) so the
+same graph serves any loaded adapter.
+
+---
+
+## Correctness
+
+All correctness tests pass: `16 tests, 90 subtests` covering sglang\_shared and
+per\_expert formats under sequential, batched, high-concurrency, and mixed-LoRA/base
+scenarios (test\_qwen3\_moe\_per\_expert\_lora + test\_qwen3\_lora\_password\_adapters).
diff --git a/bench_chunked_sgmv.py b/bench_chunked_sgmv.py
new file mode 100644
index 000000000..450bca678
--- /dev/null
+++ b/bench_chunked_sgmv.py
@@ -0,0 +1,817 @@
+"""Benchmark: our shrink/expand kernels vs sglang csgmv variants.
+
+Inlines sglang kernels (Apache-2.0) so sglang doesn't need to be
+installed.  All kernels are autotuned with the same config space.
+
+Shrink (LoRA-A):  x (s, K) @ W^T (K, N)  →  out (s, N)
+  N = stack_num * rank  (small),  K = in_dim  (large, 4096+)
+  Key diff in chunked_sgmv_shrink: K and N are constexpr
+  → K-loop trip count is compile-time constant.
+
+Expand (LoRA-B):  x (s, num_slices*R) @ W (R, out_dim)  →  out (s, out_dim)
+  R = rank  (small),  out_dim large
+  Key diff in chunked_sgmv_expand: strides and MAX_RANK are constexpr.
+
+When rank == max_rank the x layouts are identical between ours and sglang.
+
+Usage:
+    python bench_chunked_sgmv.py
+"""
+
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+import torch
+import triton
+import triton.language as tl
+
+# ── make the local kernel package importable ──────────────────────────────────
+sys.path.insert(
+    0,
+    str(Path(__file__).parent / "tokenspeed-kernel" / "python"),
+)
+
+from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
+    lora_gate_up_expand_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import lora_qkv_expand_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_shrink import lora_shrink_fwd
+
+# ── minimal batch-info dataclass ──────────────────────────────────────────────
+
+
+@dataclass
+class BatchInfo:
+    bs: int
+    max_len: int
+    seg_lens: torch.Tensor
+    seg_indptr: torch.Tensor
+    weight_indices: torch.Tensor
+    lora_ranks: torch.Tensor
+    scalings: torch.Tensor
+    permutation: torch.Tensor | None = None
+    # sglang compat
+    num_segments: int = 0
+    use_cuda_graph: bool = False
+
+
+def make_batch(
+    s_per_seg: int, n_segs: int, rank: int, with_perm: bool = False
+) -> BatchInfo:
+    dev = "cuda"
+    seg_lens = torch.full((n_segs,), s_per_seg, dtype=torch.int32, device=dev)
+    seg_indptr = torch.arange(n_segs + 1, dtype=torch.int32, device=dev) * s_per_seg
+    # all segs route to slot 1 (real adapter), slot 0 = no-adapter sentinel
+    weight_indices = torch.ones(n_segs, dtype=torch.int32, device=dev)
+    lora_ranks = torch.tensor([0, rank], dtype=torch.int32, device=dev)
+    scalings = torch.tensor([0.0, 1.0], dtype=torch.float32, device=dev)
+    perm = None
+    if with_perm:
+        s_total = n_segs * s_per_seg
+        perm = torch.arange(s_total, dtype=torch.int64, device=dev)
+    return BatchInfo(
+        bs=n_segs,
+        max_len=s_per_seg,
+        seg_lens=seg_lens,
+        seg_indptr=seg_indptr,
+        weight_indices=weight_indices,
+        lora_ranks=lora_ranks,
+        scalings=scalings,
+        permutation=perm,
+        num_segments=n_segs,
+    )
+
+
+# ── inlined sglang chunked_sgmv_expand (Apache-2.0) ──────────────────────────
+# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
+# Local change: replaced sglang imports with triton directly; added @triton.autotune.
+
+
+@triton.jit(do_not_specialize=["num_segs", "output_stride_0", "output_stride_1"])
+def _chunked_lora_expand_kernel(
+    x,
+    weights,
+    output,
+    output_stride_0,
+    output_stride_1,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    permutation,
+    num_segs,
+    scalings,
+    slice_offsets,
+    NUM_SLICES: tl.constexpr,
+    OUTPUT_DIM: tl.constexpr,
+    MAX_RANK: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    x_stride_0: tl.constexpr = NUM_SLICES * MAX_RANK
+    x_stride_1: tl.constexpr = 1
+
+    pid_s = tl.program_id(axis=2)
+    if pid_s >= num_segs:
+        return
+
+    w_index = tl.load(weight_indices + pid_s)
+    cur_rank = tl.load(lora_ranks + w_index)
+    if cur_rank == 0:
+        return
+
+    seg_start = tl.load(seg_indptr + pid_s)
+    seg_end = tl.load(seg_indptr + pid_s + 1)
+    slice_id = tl.program_id(axis=1)
+    slice_start = tl.load(slice_offsets + slice_id)
+    slice_end = tl.load(slice_offsets + slice_id + 1)
+    scaling = tl.load(scalings + w_index)
+
+    cur_rank = tl.minimum(MAX_RANK, cur_rank)
+
+    s_offset_logical = tl.arange(0, BLOCK_M) + seg_start
+    s_offset_physical = tl.load(
+        permutation + s_offset_logical, mask=s_offset_logical < seg_end
+    )
+
+    pid_n = tl.program_id(axis=0)
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_start
+    k_offset = tl.arange(0, BLOCK_K)
+
+    x_ptrs = (
+        x
+        + slice_id * cur_rank * x_stride_1
+        + (s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    )
+    w_stride_0: tl.constexpr = OUTPUT_DIM * MAX_RANK
+    w_stride_1: tl.constexpr = MAX_RANK
+    w_stride_2: tl.constexpr = 1
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    partial_sum = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(cur_rank, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset_logical[:, None] < seg_end)
+            & (k_offset[None, :] < cur_rank - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < cur_rank - k * BLOCK_K)
+            & (n_offset[None, :] < slice_end),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+
+    output_ptr = output + (
+        s_offset_physical[:, None] * output_stride_0
+        + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset_logical[:, None] < seg_end) & (
+        n_offset[None, :] < slice_end
+    )
+    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def chunked_sgmv_expand_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info: BatchInfo,
+    slice_offsets: torch.Tensor,
+    max_slice_size: int,
+    base_output: torch.Tensor | None,
+) -> torch.Tensor:
+    assert x.is_contiguous() and weights.is_contiguous()
+    M = x.shape[0]
+    OUT_DIM = weights.shape[1]
+    MAX_RANK = weights.shape[2]
+    num_slices = len(slice_offsets) - 1
+    assert x.shape[1] == num_slices * MAX_RANK
+
+    num_segs = batch_info.num_segments
+
+    BM, BN, BK = 16, 64, 16
+    grid = (triton.cdiv(max_slice_size, BN), num_slices, batch_info.bs)
+    output = (
+        torch.zeros((M, OUT_DIM), device=x.device, dtype=x.dtype)
+        if base_output is None
+        else base_output
+    )
+    _chunked_lora_expand_kernel[grid](
+        x=x,
+        weights=weights,
+        output=output,
+        output_stride_0=output.stride(0),
+        output_stride_1=output.stride(1),
+        seg_indptr=batch_info.seg_indptr,
+        weight_indices=batch_info.weight_indices,
+        lora_ranks=batch_info.lora_ranks,
+        permutation=batch_info.permutation,
+        num_segs=num_segs,
+        scalings=batch_info.scalings,
+        slice_offsets=slice_offsets,
+        NUM_SLICES=num_slices,
+        OUTPUT_DIM=OUT_DIM,
+        MAX_RANK=MAX_RANK,
+        BLOCK_M=BM,
+        BLOCK_N=BN,
+        BLOCK_K=BK,
+        num_warps=4,
+        num_stages=2,
+    )
+    return output
+
+
+# ── inlined sglang sgemm_lora_a (Apache-2.0) ─────────────────────────────────
+# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/sgemm_lora_a.py
+# Local change: replaced sglang imports; added @triton.autotune (original uses fixed sizes).
+
+
+@triton.jit
+def _sgemm_lora_a_kernel(
+    x,
+    weights,
+    output,
+    N,
+    K,
+    stack_num,
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+    pid = tl.program_id(axis=0)
+    seg_start = tl.load(seg_indptr + batch_id)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    N = tl.minimum(N, rank * stack_num)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+    if SORTED_BY_ADAPTER:
+        s_physical = tl.load(
+            sorted_token_ids + seg_start + s_offset, mask=s_offset < seg_len, other=0
+        )
+    else:
+        s_physical = seg_start + s_offset
+    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < N),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < N)
+    output_ptr = output + (
+        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def sgemm_lora_a_fwd(x, weights, batch_info, stack_num=1):
+    S, K = x.shape
+    N = weights.shape[-2]
+    assert x.is_contiguous() and weights.is_contiguous()
+    max_len = batch_info.max_len
+    BS, BN, BK = 16, 32, 128
+    grid = (triton.cdiv(max_len, BS) * triton.cdiv(N, BN), batch_info.bs)
+    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
+    sorted_by_adapter = batch_info.permutation is not None
+    _sgemm_lora_a_kernel[grid](
+        x,
+        weights,
+        output,
+        N,
+        K,
+        stack_num,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        sorted_by_adapter,
+        BLOCK_S=BS,
+        BLOCK_N=BN,
+        BLOCK_K=BK,
+        num_warps=4,
+        num_stages=4,
+    )
+    return output
+
+
+# ── inlined sglang chunked_sgmv_shrink (Apache-2.0) ──────────────────────────
+# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
+# Local change: replaced sglang imports; added @triton.autotune.
+# Key structural diff vs sgemm_lora_a: K, N, and all strides are constexpr.
+
+
+@triton.jit(do_not_specialize=["num_segs"])
+def _chunked_lora_shrink_kernel(
+    x,
+    weights,
+    output,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    permutation,
+    num_segs,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_SLICES: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    x_stride_1: tl.constexpr = 1
+    x_stride_0: tl.constexpr = K
+    w_stride_0: tl.constexpr = N * K
+    w_stride_1: tl.constexpr = K
+    w_stride_2: tl.constexpr = 1
+    output_stride_0: tl.constexpr = N
+    output_stride_1: tl.constexpr = 1
+
+    pid_s = tl.program_id(1)
+    if pid_s >= num_segs:
+        return
+    pid_n = tl.program_id(0)
+    w_index = tl.load(weight_indices + pid_s)
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+    seg_start = tl.load(seg_indptr + pid_s)
+    seg_end = tl.load(seg_indptr + pid_s + 1)
+    cur_n = tl.minimum(N, rank * NUM_SLICES)
+
+    s_offset_logical = tl.arange(0, BLOCK_M) + seg_start
+    s_offset_physical = tl.load(
+        permutation + s_offset_logical, mask=s_offset_logical < seg_end
+    )
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    x_ptrs = x + (
+        s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+    partial_sum = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset_logical[:, None] < seg_end)
+            & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < cur_n),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = output + (
+        s_offset_physical[:, None] * output_stride_0
+        + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset_logical[:, None] < seg_end) & (n_offset[None, :] < cur_n)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def chunked_sgmv_shrink_fwd(x, weights, batch_info, num_slices=1):
+    S, K = x.shape
+    N = weights.shape[-2]  # num_slices * rank
+    assert x.is_contiguous() and weights.is_contiguous()
+    num_segs = batch_info.num_segments
+    BM, BN, BK = 16, 32, 128
+    grid = (triton.cdiv(N, BN), batch_info.bs)
+    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
+    _chunked_lora_shrink_kernel[grid](
+        x,
+        weights,
+        output,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        num_segs,
+        N=N,
+        K=K,
+        NUM_SLICES=num_slices,
+        BLOCK_M=BM,
+        BLOCK_N=BN,
+        BLOCK_K=BK,
+        num_warps=4,
+        num_stages=4,
+    )
+    return output
+
+
+# ── inlined sglang sgemm_lora_b (Apache-2.0) ─────────────────────────────────
+# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/sgemm_lora_b.py
+# Structurally identical to our lora_expand; only difference is fixed BLOCK_N=256.
+
+
+@triton.jit
+def _sgemm_lora_b_kernel(
+    x,
+    weights,
+    output,
+    N,
+    K,
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    scalings,
+):
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    seg_start = tl.load(seg_indptr + batch_id)
+    scaling = tl.load(scalings + w_index)
+    K = tl.minimum(K, rank)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+    if SORTED_BY_ADAPTER:
+        s_physical = tl.load(
+            sorted_token_ids + seg_start + s_offset, mask=s_offset < seg_len, other=0
+        )
+    else:
+        s_physical = seg_start + s_offset
+    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+    n_mask = n_offset[None, :] < N
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs, mask=(k_offset[:, None] < K - k * BLOCK_K) & n_mask, other=0.0
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = output + (
+        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) & n_mask
+    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def sgemm_lora_b_fwd(x, weights, batch_info, base_output=None):
+    S, R = x.shape
+    N = weights.shape[-2]
+    assert x.is_contiguous() and weights.is_contiguous()
+    # Original sglang fixed configs: BLOCK_S=16, BLOCK_N=256, BLOCK_K=16
+    BS, BN, BK = 16, 256, 16
+    max_len = batch_info.max_len
+    grid = (triton.cdiv(max_len, BS) * triton.cdiv(N, BN), batch_info.bs)
+    output = (
+        torch.zeros((S, N), device=x.device, dtype=x.dtype)
+        if base_output is None
+        else base_output
+    )
+    sorted_by_adapter = batch_info.permutation is not None
+    _sgemm_lora_b_kernel[grid](
+        x,
+        weights,
+        output,
+        N,
+        R,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        sorted_by_adapter,
+        BLOCK_S=BS,
+        BLOCK_N=BN,
+        BLOCK_K=BK,
+        num_warps=4,
+        num_stages=2,
+        scalings=batch_info.scalings,
+    )
+    return output
+
+
+# ── benchmark helpers ─────────────────────────────────────────────────────────
+
+
+def bench(fn, label: str, warmup: int = 25, rep: int = 100) -> float:
+    ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+    print(f"  {label:<42s}  {ms*1000:7.1f} µs")
+    return ms
+
+
+def run_shrink_scenario(
+    label: str,
+    s_per_seg: int,
+    n_segs: int,
+    rank: int,
+    hidden: int,
+    intermediate_per_tp: int,
+) -> None:
+    dev, dt = "cuda", torch.bfloat16
+    s = s_per_seg * n_segs
+    bi_ours = make_batch(s_per_seg, n_segs, rank, with_perm=False)
+    bi_sglang = make_batch(s_per_seg, n_segs, rank, with_perm=True)
+
+    print(f"\n{'='*60}")
+    print(f"  SHRINK {label}")
+    print(f"  s_per_seg={s_per_seg}  n_segs={n_segs}  rank={rank}  s_total={s}")
+    print(f"{'='*60}")
+
+    for stack_num, in_dim, tag in [
+        (3, hidden, "QKV shrink      in=hidden  stack=3"),
+        (2, hidden, "gate/up shrink  in=hidden  stack=2"),
+        (1, hidden, "o/down shrink   in=hidden  stack=1"),
+        (1, intermediate_per_tp, "down shrink     in=inter   stack=1"),
+    ]:
+        N = stack_num * rank
+        x = torch.randn((s, in_dim), device=dev, dtype=dt)
+        w = torch.randn((2, N, in_dim), device=dev, dtype=dt)
+        print(f"\n[{tag}]  K={in_dim}")
+        bench(
+            lambda x=x, w=w: lora_shrink_fwd(x, w, bi_ours, stack_num=stack_num),
+            "ours lora_shrink_fwd",
+        )
+        bench(
+            lambda x=x, w=w: sgemm_lora_a_fwd(x, w, bi_sglang, stack_num=stack_num),
+            "sglang sgemm_lora_a (autotuned)",
+        )
+        bench(
+            lambda x=x, w=w: chunked_sgmv_shrink_fwd(
+                x, w, bi_sglang, num_slices=stack_num
+            ),
+            "sglang chunked_sgmv_shrink",
+        )
+
+
+def run_scenario(
+    label: str,
+    s_per_seg: int,
+    n_segs: int,
+    rank: int,
+    hidden: int,
+    intermediate_per_tp: int,
+    q_per_tp: int,
+    kv_per_tp: int,
+) -> None:
+    dev, dt = "cuda", torch.bfloat16
+    max_rank = rank  # rank == max_rank so x layouts are identical
+
+    s = s_per_seg * n_segs
+    bi_ours = make_batch(s_per_seg, n_segs, rank, with_perm=False)
+    bi_sglang = make_batch(
+        s_per_seg, n_segs, rank, with_perm=True
+    )  # sglang always needs perm
+
+    print(f"\n{'='*60}")
+    print(f"  {label}")
+    print(f"  s_per_seg={s_per_seg}  n_segs={n_segs}  rank={rank}  s_total={s}")
+    print(f"{'='*60}")
+
+    # ── plain expand (o_proj / down_proj): 1 slice, out_dim=hidden ──
+    print("\n[plain expand]  out_dim=hidden")
+    x1 = torch.randn((s, max_rank), device=dev, dtype=dt)
+    w1 = torch.randn((2, hidden, max_rank), device=dev, dtype=dt)
+    o1 = torch.zeros((s, hidden), device=dev, dtype=dt)
+    so1 = torch.tensor([0, hidden], dtype=torch.int32, device=dev)
+
+    bench(
+        lambda: lora_expand_fwd(x1, w1, bi_ours, base_output=o1.clone()),
+        "ours lora_expand_fwd",
+    )
+    bench(
+        lambda: sgemm_lora_b_fwd(x1, w1, bi_sglang, base_output=o1.clone()),
+        "sglang sgemm_lora_b (BN=256)",
+    )
+    bench(
+        lambda: chunked_sgmv_expand_fwd(x1, w1, bi_sglang, so1, hidden, o1.clone()),
+        "sglang chunked_sgmv (1 slice)",
+    )
+
+    # ── QKV expand: 3 slices ──
+    qkv_out = q_per_tp + 2 * kv_per_tp
+    max_qkv = max(q_per_tp, kv_per_tp)
+    x3 = torch.randn((s, 3 * max_rank), device=dev, dtype=dt)
+    w3 = torch.randn((2, qkv_out, max_rank), device=dev, dtype=dt)
+    o3 = torch.zeros((s, qkv_out), device=dev, dtype=dt)
+    off3 = torch.tensor(
+        [0, q_per_tp, q_per_tp + kv_per_tp, q_per_tp + 2 * kv_per_tp],
+        dtype=torch.int32,
+        device=dev,
+    )
+
+    print(f"\n[QKV expand]  q={q_per_tp}  kv={kv_per_tp}")
+    bench(
+        lambda: lora_qkv_expand_fwd(
+            x3, w3, bi_ours, off3, max_qkv, base_output=o3.clone()
+        ),
+        "ours lora_qkv_expand_fwd",
+    )
+    bench(
+        lambda: chunked_sgmv_expand_fwd(x3, w3, bi_sglang, off3, max_qkv, o3.clone()),
+        "sglang chunked_sgmv (3 slices)",
+    )
+
+    # ── gate/up expand: 2 slices ──
+    x2 = torch.randn((s, 2 * max_rank), device=dev, dtype=dt)
+    w2 = torch.randn((2, 2 * intermediate_per_tp, max_rank), device=dev, dtype=dt)
+    o2 = torch.zeros((s, 2 * intermediate_per_tp), device=dev, dtype=dt)
+    so2 = torch.tensor(
+        [0, intermediate_per_tp, 2 * intermediate_per_tp], dtype=torch.int32, device=dev
+    )
+
+    print(f"\n[gate/up expand]  intermediate_per_tp={intermediate_per_tp}")
+    bench(
+        lambda: lora_gate_up_expand_fwd(
+            x2, w2, bi_ours, intermediate_per_tp, base_output=o2.clone()
+        ),
+        "ours lora_gate_up_expand_fwd",
+    )
+    bench(
+        lambda: chunked_sgmv_expand_fwd(
+            x2, w2, bi_sglang, so2, intermediate_per_tp, o2.clone()
+        ),
+        "sglang chunked_sgmv (2 slices)",
+    )
+
+
+# ── main ──────────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    # Qwen3-8B-like shapes at TP=2
+    HIDDEN = 4096
+    INTERMEDIATE = 12288
+    INTER_PER_TP = INTERMEDIATE // 2  # 6144
+    Q_PER_TP = 2048
+    KV_PER_TP = 512
+    RANK = 64
+
+    # ── 1. Sequence-length sweep (fixed n_segs=32 decode, n_segs=4 prefill) ──
+    for s_per_seg, n_segs, tag in [
+        (1, 32, "DECODE      s=1    n_segs=32"),
+        (1, 64, "DECODE      s=1    n_segs=64"),
+        (128, 4, "PREFILL     s=128  n_segs=4"),
+        (512, 2, "PREFILL     s=512  n_segs=2"),
+    ]:
+        run_scenario(
+            tag,
+            s_per_seg=s_per_seg,
+            n_segs=n_segs,
+            rank=RANK,
+            hidden=HIDDEN,
+            intermediate_per_tp=INTER_PER_TP,
+            q_per_tp=Q_PER_TP,
+            kv_per_tp=KV_PER_TP,
+        )
+
+    # ── 2. Adapter-count sweep (decode, s_per_seg=1, vary n_segs) ──
+    print(f"\n\n{'#'*60}")
+    print(f"  ADAPTER COUNT SWEEP  (decode s=1, rank={RANK})")
+    print(f"{'#'*60}")
+    dev, dt = "cuda", torch.bfloat16
+    qkv_out = Q_PER_TP + 2 * KV_PER_TP
+    max_qkv = max(Q_PER_TP, KV_PER_TP)
+    off3 = torch.tensor(
+        [0, Q_PER_TP, Q_PER_TP + KV_PER_TP, Q_PER_TP + 2 * KV_PER_TP],
+        dtype=torch.int32,
+        device=dev,
+    )
+    so1 = torch.tensor([0, HIDDEN], dtype=torch.int32, device=dev)
+
+    print(
+        f"\n{'n_segs':>8}  {'ours expand':>14}  {'sgemm_b BN256':>14}  {'csgmv 1sl':>12}  {'ours qkv':>12}  {'csgmv 3sl':>12}"
+    )
+    print("-" * 82)
+    for n_segs in (1, 2, 4, 8, 16, 32, 64, 128):
+        s = n_segs
+        bi_o = make_batch(1, n_segs, RANK, with_perm=False)
+        bi_s = make_batch(1, n_segs, RANK, with_perm=True)
+        x1 = torch.randn((s, RANK), device=dev, dtype=dt)
+        w1 = torch.randn((2, HIDDEN, RANK), device=dev, dtype=dt)
+        o1 = torch.zeros((s, HIDDEN), device=dev, dtype=dt)
+        x3 = torch.randn((s, 3 * RANK), device=dev, dtype=dt)
+        w3 = torch.randn((2, qkv_out, RANK), device=dev, dtype=dt)
+        o3 = torch.zeros((s, qkv_out), device=dev, dtype=dt)
+
+        def t(fn):
+            return triton.testing.do_bench(fn, warmup=25, rep=200) * 1000
+
+        t_ours_exp = t(lambda: lora_expand_fwd(x1, w1, bi_o, base_output=o1.clone()))
+        t_sgemm_b = t(lambda: sgemm_lora_b_fwd(x1, w1, bi_s, base_output=o1.clone()))
+        t_csgmv_1 = t(
+            lambda: chunked_sgmv_expand_fwd(x1, w1, bi_s, so1, HIDDEN, o1.clone())
+        )
+        t_ours_qkv = t(
+            lambda: lora_qkv_expand_fwd(
+                x3, w3, bi_o, off3, max_qkv, base_output=o3.clone()
+            )
+        )
+        t_csgmv_3 = t(
+            lambda: chunked_sgmv_expand_fwd(x3, w3, bi_s, off3, max_qkv, o3.clone())
+        )
+
+        print(
+            f"{n_segs:>8}  {t_ours_exp:>13.1f}µ  {t_sgemm_b:>13.1f}µ  {t_csgmv_1:>11.1f}µ  {t_ours_qkv:>11.1f}µ  {t_csgmv_3:>11.1f}µ"
+        )
diff --git a/bench_kernel_opt.py b/bench_kernel_opt.py
new file mode 100644
index 000000000..22fadb43a
--- /dev/null
+++ b/bench_kernel_opt.py
@@ -0,0 +1,141 @@
+"""Before/after benchmark for kernel micro-optimisations + sort-by-adapter.
+
+Tests decode shrink and expand with mixed adapters — the scenario where
+sort-by-adapter actually helps (adjacent CTAs share the same weight tile).
+
+Usage:
+    python bench_kernel_opt.py
+"""
+
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+import torch
+import triton
+
+sys.path.insert(0, str(Path(__file__).parent / "tokenspeed-kernel" / "python"))
+
+from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_expand_decode import lora_expand_decode_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_shrink import lora_shrink_fwd
+
+
+@dataclass
+class BatchInfo:
+    bs: int
+    max_len: int
+    num_segments: int
+    seg_lens: torch.Tensor
+    seg_indptr: torch.Tensor
+    weight_indices: torch.Tensor
+    lora_ranks: torch.Tensor
+    scalings: torch.Tensor
+    permutation: torch.Tensor | None = None
+    sort_order: torch.Tensor | None = None
+    group_slots: torch.Tensor | None = None
+    group_starts: torch.Tensor | None = None
+    group_sizes: torch.Tensor | None = None
+    num_groups: int = 0
+
+
+def make_mixed_batch(
+    n_segs: int,
+    n_unique_adapters: int,
+    rank: int,
+    device: str = "cuda",
+) -> BatchInfo:
+    """n_segs decode segments, round-robin across n_unique_adapters adapters."""
+    slots_list = [(i % n_unique_adapters) + 1 for i in range(n_segs)]
+    slots = torch.tensor(slots_list, dtype=torch.int32, device=device)
+
+    seg_lens = torch.ones(n_segs, dtype=torch.int32, device=device)
+    seg_indptr = torch.arange(n_segs + 1, dtype=torch.int32, device=device)
+    n_slots = n_unique_adapters + 1
+    lora_ranks = torch.zeros(n_slots, dtype=torch.int32, device=device)
+    lora_ranks[1:] = rank
+    scalings = torch.ones(n_slots, dtype=torch.float32, device=device)
+    scalings[0] = 0.0
+
+    # Build group metadata (same logic as prepare_loras)
+    sort_order_cpu = sorted(range(n_segs), key=lambda i: slots_list[i])
+    groups: list[list[int]] = []
+    for pos, orig in enumerate(sort_order_cpu):
+        slot = slots_list[orig]
+        if not groups or groups[-1][0] != slot:
+            groups.append([slot, pos, 1])
+        else:
+            groups[-1][2] += 1
+    ng = len(groups)
+    sort_order_gpu = torch.tensor(sort_order_cpu, dtype=torch.int64, device=device)
+    group_slots_gpu = torch.tensor(
+        [g[0] for g in groups], dtype=torch.int32, device=device
+    )
+    group_starts_gpu = torch.tensor(
+        [g[1] for g in groups], dtype=torch.int32, device=device
+    )
+    group_sizes_gpu = torch.tensor(
+        [g[2] for g in groups], dtype=torch.int32, device=device
+    )
+
+    return BatchInfo(
+        bs=n_segs,
+        max_len=1,
+        num_segments=n_segs,
+        seg_lens=seg_lens,
+        seg_indptr=seg_indptr,
+        weight_indices=slots,
+        lora_ranks=lora_ranks,
+        scalings=scalings,
+        sort_order=sort_order_gpu,
+        group_slots=group_slots_gpu,
+        group_starts=group_starts_gpu,
+        group_sizes=group_sizes_gpu,
+        num_groups=ng,
+    )
+
+
+def bench(fn, warmup=25, rep=200):
+    return triton.testing.do_bench(fn, warmup=warmup, rep=rep) * 1000
+
+
+def run(n_segs: int, n_unique: int, rank: int, hidden: int) -> None:
+    dev, dt = "cuda", torch.bfloat16
+    n_slots = n_unique + 1
+    s = n_segs
+
+    bi = make_mixed_batch(n_segs, n_unique, rank, device=dev)
+
+    x_ex = torch.randn((s, rank), device=dev, dtype=dt)
+    w_ex = torch.randn((n_slots, hidden, rank), device=dev, dtype=dt)
+    o_ex = torch.zeros((s, hidden), device=dev, dtype=dt)
+
+    t_base = bench(lambda: lora_expand_fwd(x_ex, w_ex, bi, base_output=o_ex.clone()))
+    t_grouped = bench(
+        lambda: lora_expand_decode_fwd(x_ex, w_ex, bi, base_output=o_ex.clone())
+    )
+
+    print(
+        f"n_segs={n_segs:>3}  n_unique={n_unique:>2}  rank={rank:>3}  hidden={hidden:>5}  |"
+        f"  base={t_base:>6.1f}µ  grouped={t_grouped:>6.1f}µ  {t_base/t_grouped:>5.2f}x"
+    )
+
+
+if __name__ == "__main__":
+    # Qwen3-8B TP=2
+    HIDDEN, RANK = 4096, 64
+
+    print(
+        f"\n{'n_segs':>7}  {'n_unique':>9}  {'rank':>5}  {'hidden':>7}  |  {'base':>8}  {'grouped':>9}  speedup"
+    )
+    print("-" * 75)
+    for n_unique in (1, 2, 4, 8, 16, 32):
+        run(n_segs=32, n_unique=n_unique, rank=RANK, hidden=HIDDEN)
+    print()
+    for n_segs in (8, 16, 32, 64, 128):
+        run(n_segs=n_segs, n_unique=4, rank=RANK, hidden=HIDDEN)
+    print()
+    for rank in (16, 32, 64, 128):
+        run(n_segs=32, n_unique=4, rank=rank, hidden=HIDDEN)
diff --git a/bench_vs_vllm.py b/bench_vs_vllm.py
new file mode 100644
index 000000000..237f30c85
--- /dev/null
+++ b/bench_vs_vllm.py
@@ -0,0 +1,294 @@
+"""Benchmark: ours vs vLLM expand across shapes, adapter counts, ranks.
+
+Four expand variants compared:
+  1. ours-seg   : lora_expand_fwd (per-segment dispatch, no sorting)
+  2. ours-grp   : lora_expand_decode_fwd (grouped + gather/scatter)
+  3. ours-grpv2 : lora_expand_grouped_v2_fwd (grouped, scattered reads, no copy)
+  4. vllm        : inlined vLLM expand (same adapter-grouped idea)
+
+Usage:
+    python bench_vs_vllm.py
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import torch
+import triton
+import triton.language as tl
+
+sys.path.insert(0, str(Path(__file__).parent / "tokenspeed-kernel" / "python"))
+
+from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_expand_decode import lora_expand_decode_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_expand_grouped_v2 import (
+    lora_expand_grouped_v2_fwd,
+)
+
+# ── inlined vLLM expand kernel (Apache-2.0) ───────────────────────────────────
+
+
+@triton.jit
+def _vllm_mm_k(
+    a,
+    b,
+    ak,
+    bk,
+    K: tl.constexpr,
+    BM: tl.constexpr,
+    BN: tl.constexpr,
+    BK: tl.constexpr,
+    EVEN_K: tl.constexpr,
+):
+    acc = tl.zeros((BM, BN), dtype=tl.float32)
+    for k in range(tl.cdiv(K, BK)):
+        if EVEN_K:
+            acc += tl.dot(tl.load(a), tl.load(b))
+        else:
+            ko = tl.arange(0, BK)
+            mask = k * BK + ko < K
+            acc += tl.dot(
+                tl.load(a, mask=mask[None, :], other=0.0),
+                tl.load(b, mask=mask[:, None], other=0.0),
+            )
+        a += BK * ak
+        b += BK * bk
+    return acc
+
+
+@triton.jit
+def _vllm_expand_kernel(
+    x,
+    w,
+    out,
+    M,
+    N,
+    K,
+    sorted_idx,
+    ntok,
+    start_loc,
+    lora_ids,
+    scalings,
+    lora_ranks,
+    xs0,
+    xs1,
+    ws0,
+    ws1,
+    ws2,
+    os0,
+    os1,
+    BM: tl.constexpr,
+    BN: tl.constexpr,
+    BK: tl.constexpr,
+    EVEN_K: tl.constexpr,
+    MAX_RANK: tl.constexpr,
+):
+    cta_m = tl.cdiv(M, BM)
+    cta_n = tl.cdiv(N, BN)
+    pid = tl.program_id(0)
+    pm = pid % cta_m
+    pn = (pid // cta_m) % cta_n
+    li = tl.program_id(1)
+    lid = tl.load(lora_ids + li)
+    if lid == -1:
+        return
+    lm = tl.load(ntok + li)
+    off = pm * BM
+    if off >= lm:
+        return
+    if pn * BN >= N:
+        return
+    mlen = tl.minimum(BM, lm - off)
+    ls = tl.load(start_loc + li)
+    om = tl.arange(0, BM) % mlen
+    ram = tl.load(sorted_idx + ls + off + om)
+    no = tl.arange(0, BN) + pn * BN
+    rbn = tl.max_contiguous(tl.multiple_of(no % N, BN), BN)
+    ko = tl.arange(0, BK)
+    # x strides: xs0=inner(1), xs1=row(MAX_RANK)
+    ap = x + ram[:, None] * xs1 + ko[None, :] * xs0
+    # w strides: ws0=adapter, ws1=N, ws2=K(=1)
+    bp = w + lid * ws0 + ko[:, None] * ws2 + rbn[None, :] * ws1
+    acc = _vllm_mm_k(ap, bp, xs0, ws2, K, BM, BN, BK, EVEN_K)
+    sc = tl.load(scalings + lid)
+    rank = tl.load(lora_ranks + lid)
+    acc *= sc
+    acc = acc.to(x.dtype.element_ty)
+    om2 = tl.arange(0, BM)
+    cp = out + ram[:, None] * os0 + rbn[None, :] * os1
+    mask = (om2[:, None] < mlen) & (rbn[None, :] < N)
+    acc += tl.load(cp, mask=mask, other=0.0)
+    tl.store(cp, acc, mask=mask)
+
+
+def vllm_expand(x, weights, meta, base_output, BM=16, BN=64, BK=64, nw=4, ns=2):
+    M, K = x.shape
+    N = weights.shape[1]
+    EVEN_K = K % BK == 0
+    o = base_output
+    grid = (triton.cdiv(M, BM) * triton.cdiv(N, BN), meta["num_active"])
+    _vllm_expand_kernel[grid](
+        x,
+        weights,
+        o,
+        M,
+        N,
+        K,
+        meta["sorted_idx"],
+        meta["ntok"],
+        meta["start_loc"],
+        meta["lora_ids"],
+        meta["scalings"],
+        meta["lora_ranks"],
+        x.stride(1),
+        x.stride(0),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        o.stride(0),
+        o.stride(1),
+        BM=BM,
+        BN=BN,
+        BK=BK,
+        EVEN_K=EVEN_K,
+        MAX_RANK=K,
+        num_warps=nw,
+        num_stages=ns,
+    )
+    return o
+
+
+# ── batch-info builders ───────────────────────────────────────────────────────
+
+
+def make_our_bi(n, rank, n_unique, dev):
+    slots = [(i % n_unique) + 1 for i in range(n)]
+    sort_order = sorted(range(n), key=lambda i: slots[i])
+    groups = []
+    for pos, orig in enumerate(sort_order):
+        s = slots[orig]
+        if not groups or groups[-1][0] != s:
+            groups.append([s, pos, 1])
+        else:
+            groups[-1][2] += 1
+    ng = len(groups)
+
+    so_t = torch.tensor(sort_order, dtype=torch.int64, device=dev)
+    gs_t = torch.tensor([g[0] for g in groups], dtype=torch.int32, device=dev)
+    gst_t = torch.tensor([g[1] for g in groups], dtype=torch.int32, device=dev)
+    gsz_t = torch.tensor([g[2] for g in groups], dtype=torch.int32, device=dev)
+
+    class BI:
+        bs = n
+        max_len = 1
+        seg_lens = torch.ones(n, dtype=torch.int32, device=dev)
+        seg_indptr = torch.arange(n + 1, dtype=torch.int32, device=dev)
+        weight_indices = torch.tensor(slots, dtype=torch.int32, device=dev)
+        lora_ranks = torch.tensor(
+            [0] + [rank] * n_unique, dtype=torch.int32, device=dev
+        )
+        scalings = torch.ones(n_unique + 1, dtype=torch.float32, device=dev)
+        permutation = None
+        num_groups = ng
+        sort_order = so_t
+        group_slots = gs_t
+        group_starts = gst_t
+        group_sizes = gsz_t
+
+    return BI()
+
+
+def make_vllm_meta(n, rank, n_unique, n_slots, dev):
+    # slot 0 = no-adapter sentinel; real adapters = 1..n_unique
+    slots = torch.tensor(
+        [(i % n_unique) + 1 for i in range(n)], dtype=torch.int32, device=dev
+    )
+    _, sorted_idx = torch.sort(slots, stable=True)
+    uniq, counts = torch.unique(slots, sorted=True, return_counts=True)
+    start_locs = torch.cat(
+        [
+            torch.zeros(1, dtype=torch.int32, device=dev),
+            counts.cumsum(0).to(torch.int32),
+        ]
+    )
+    lora_ranks_t = torch.tensor([0] + [rank] * n_unique, dtype=torch.int32, device=dev)
+    scalings_t = torch.ones(n_unique + 1, dtype=torch.float32, device=dev)
+    return {
+        "sorted_idx": sorted_idx.to(torch.int32),
+        "ntok": counts.to(torch.int32),
+        "start_loc": start_locs,
+        "lora_ids": uniq.to(torch.int32),
+        "num_active": len(uniq),
+        "lora_ranks": lora_ranks_t,
+        "scalings": scalings_t,
+    }
+
+
+def bench(fn, w=30, r=300):
+    return triton.testing.do_bench(fn, warmup=w, rep=r) * 1000
+
+
+# ── sweep ─────────────────────────────────────────────────────────────────────
+
+
+def header(title):
+    print(f'\n{"="*80}')
+    print(f"  {title}")
+    print(f'{"="*80}')
+    print(
+        f'  {"n":>4}  {"n_uniq":>6}  {"seg":>8}  {"grp":>8}  {"grpv2":>8}  {"vllm":>8}  {"best":>6}'
+    )
+    print(f'  {"-"*58}')
+
+
+def row(n, nu, ts, tg, tv2, tv):
+    ts = f"{ts:.1f}µ" if ts else "   n/a"
+    tg = f"{tg:.1f}µ" if tg else "   n/a"
+    tv2 = f"{tv2:.1f}µ" if tv2 else "   n/a"
+    tv = f"{tv:.1f}µ" if tv else "   n/a"
+    # which is fastest among numeric values
+    vals = [
+        (t, nm)
+        for t, nm in [(ts, "seg"), (tg, "grp"), (tv2, "v2"), (tv, "vllm")]
+        if "n/a" not in str(t)
+    ]
+    best = min(vals, key=lambda x: float(x[0].rstrip("µ")))[1] if vals else "?"
+    print(f"  {n:>4}  {nu:>6}  {ts:>8}  {tg:>8}  {tv2:>8}  {tv:>8}  {best:>6}")
+
+
+dev, dt = "cuda", torch.bfloat16
+
+for rank, N in [(16, 4096), (64, 4096), (128, 4096), (64, 8192)]:
+    header(f"EXPAND  rank={rank}  N={N}  (x: n×{rank} → out: n×{N})")
+    for n in (8, 16, 32, 64, 128):
+        for n_u in sorted({1, min(4, n), min(n, 8), n}):
+            if n_u > n:
+                continue
+            bi = make_our_bi(n, rank, n_u, dev)
+            vm = make_vllm_meta(n, rank, n_u, n_u + 1, dev)
+            wo = torch.randn(n_u + 1, N, rank, device=dev, dtype=dt)
+            wv = wo[1:]  # vLLM doesn't have slot-0 sentinel
+            x = torch.randn(n, rank, device=dev, dtype=dt)
+            o = torch.zeros(n, N, device=dev, dtype=dt)
+
+            bk = min(rank, 64)
+            use_grp = bi.bs // bi.num_groups >= 8
+
+            ts = bench(lambda: lora_expand_fwd(x, wo, bi, base_output=o.clone()))
+            tg = (
+                bench(lambda: lora_expand_decode_fwd(x, wo, bi, base_output=o.clone()))
+                if use_grp
+                else None
+            )
+            tv2 = (
+                bench(
+                    lambda: lora_expand_grouped_v2_fwd(x, wo, bi, base_output=o.clone())
+                )
+                if n_u > 0
+                else None
+            )
+            tv = bench(lambda: vllm_expand(x, wv, vm, base_output=o.clone(), BK=bk))
+
+            row(n, n_u, ts, tg, tv2, tv)
diff --git a/benchmark/bench_fused_moe_lora_e2e.py b/benchmark/bench_fused_moe_lora_e2e.py
new file mode 100644
index 000000000..c42990bef
--- /dev/null
+++ b/benchmark/bench_fused_moe_lora_e2e.py
@@ -0,0 +1,120 @@
+"""End-to-end decode speed: fused MoE LoRA kernels vs baseline.
+
+Measures tput (tok/s) and per-step latency for:
+  - baseline (no LoRA)
+  - sglang_shared rank=16 n_active=0
+  - sglang_shared rank=16 n_active=1
+
+Run: CUDA_VISIBLE_DEVICES=0,1 python benchmark/bench_fused_moe_lora_e2e.py
+"""
+
+from __future__ import annotations
+
+import os
+import statistics
+import time
+
+from tokenspeed.runtime.entrypoints.engine import Engine
+
+MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+LORA_PATH = (
+    "/shared/qywu/WorkingProjects/tokenspeed-dev/test_data/"
+    "zero_lora_rank16/sglang_shared"
+)
+BS = 8
+OUT_TOKENS = 200
+WARMUP = 3
+BENCH = 5
+
+SAMPLING = dict(
+    max_new_tokens=OUT_TOKENS,
+    min_new_tokens=OUT_TOKENS,
+    temperature=0.0,
+    ignore_eos=True,
+)
+PROMPT = ["The capital of France is"] * BS
+
+
+def make_engine(enable_lora: bool) -> Engine:
+    kw = dict(
+        model=MODEL,
+        attn_tp_size=2,
+        gpu_memory_utilization=0.72,
+        disable_kvstore=True,
+        max_model_len=256,
+        trust_remote_code=True,
+        log_level="warning",
+        moe_backend="triton",
+    )
+    if enable_lora:
+        kw.update(
+            enable_lora=True,
+            max_loras=2,
+            max_loras_cpu=2,
+            max_lora_rank=16,
+            lora_buffer_groups="moe",
+            lora_moe_compressed_shared_outer=True,
+        )
+    return Engine(**kw)
+
+
+def measure(engine: Engine, lora_names: list | None, label: str) -> dict:
+    kw = {}
+    if lora_names is not None:
+        kw["lora_name"] = lora_names
+
+    # Warmup
+    for _ in range(WARMUP):
+        engine.generate(prompt=PROMPT, sampling_params=SAMPLING, **kw)
+
+    # Benchmark tput
+    tput_list = []
+    for _ in range(BENCH):
+        t0 = time.perf_counter()
+        outs = engine.generate(prompt=PROMPT, sampling_params=SAMPLING, **kw)
+        elapsed = time.perf_counter() - t0
+        total_toks = sum(o["meta_info"]["completion_tokens"] for o in outs)
+        tput_list.append(total_toks / elapsed)
+
+    tput = statistics.mean(tput_list)
+    step_ms = BS * OUT_TOKENS / tput * 1000 / OUT_TOKENS  # ms per decode step
+    print(f"  {label:<40s}: {tput:7.0f} tok/s  ({step_ms:.2f} ms/step)")
+    return {"tput": tput, "step_ms": step_ms}
+
+
+def main():
+    print(f"Model: {MODEL}  BS={BS}  out_tokens={OUT_TOKENS}  TP=2")
+    print("=" * 70)
+
+    # Baseline
+    print("\n[1/3] Baseline (no LoRA)")
+    eng_base = make_engine(enable_lora=False)
+    r_base = measure(eng_base, None, "baseline no-LoRA")
+    del eng_base
+
+    # LoRA engine
+    print("\n[2/3] sglang_shared rank=16 (n_active=0 and n_active=1)")
+    eng_lora = make_engine(enable_lora=True)
+    eng_lora.add_lora("zero_r16", LORA_PATH, lora_format="sglang_shared")
+
+    r_n0 = measure(eng_lora, None, "sglang_shared n_active=0")
+    r_n1 = measure(eng_lora, ["zero_r16"] * BS, "sglang_shared n_active=1")
+    del eng_lora
+
+    print("\n" + "=" * 70)
+    print("Summary:")
+    print(
+        f"  baseline:     {r_base['tput']:.0f} tok/s  ({r_base['step_ms']:.2f} ms/step)"
+    )
+    print(
+        f"  n_active=0:   {r_n0['tput']:.0f} tok/s  ({r_n0['step_ms']:.2f} ms/step)  "
+        f"overhead vs baseline: {(r_base['step_ms']-r_n0['step_ms'])/r_base['step_ms']*100:+.1f}%"
+    )
+    print(
+        f"  n_active=1:   {r_n1['tput']:.0f} tok/s  ({r_n1['step_ms']:.2f} ms/step)  "
+        f"overhead vs baseline: {(r_n1['step_ms']-r_base['step_ms'])/r_base['step_ms']*100:+.1f}%"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/bench_fused_moe_lora_kernels.py b/benchmark/bench_fused_moe_lora_kernels.py
new file mode 100644
index 000000000..7b3aee7b9
--- /dev/null
+++ b/benchmark/bench_fused_moe_lora_kernels.py
@@ -0,0 +1,381 @@
+"""Benchmark: fused MoE LoRA kernels vs. current all-experts GEMM + scatter chain.
+
+Tests both correctness and end-to-end speed for the two fused kernels:
+  1. sorted_gate_up_b_expand  — shared A + per-expert B, sorted output
+  2. sorted_a_down_shrink      — per-expert A + shared B, sorted intermediate
+
+Run: python benchmark/bench_fused_moe_lora_kernels.py
+"""
+
+from __future__ import annotations
+
+import os
+import statistics
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import torch
+from tokenspeed_kernel.ops.moe_lora import sorted_a_down_shrink, sorted_gate_up_b_expand
+
+# ── Setup helpers ─────────────────────────────────────────────────────────────
+
+
+def make_inputs(
+    rank: int, bs: int = 8, k: int = 8, E: int = 128, H: int = 2048, I: int = 768
+):
+    """Return tensors matching Qwen3-30B-A3B sglang_shared decode shapes."""
+    dev = torch.device("cuda")
+    dtype = torch.bfloat16
+    R = 2 * rank  # gate+up fused rank
+    I2 = 2 * I  # gate+up output dim
+
+    rc = bs * k  # route_count
+    padded = rc + (16 - rc % 16) % 16  # align to 16
+
+    # MoE sorted routing
+    flat_pairs = torch.randperm(rc, device=dev)
+    sti = torch.cat([flat_pairs, torch.full((padded - rc,), -1, device=dev)])
+    valid_mask = sti >= 0
+
+    flat_j_safe = sti.clamp(0)
+    tok = flat_j_safe // k
+    topk_v = flat_j_safe % k
+
+    safe_ids = torch.randint(0, E, (bs, k), device=dev, dtype=torch.long)
+    exp_sorted = safe_ids[tok, topk_v]
+
+    # Model weights (sglang_shared format)
+    w13_A = torch.randn(1, R, H, dtype=dtype, device=dev)
+    w13_B = torch.randn(E, I2, R, dtype=dtype, device=dev).contiguous()
+    down_A = torch.randn(E, rank, I, dtype=dtype, device=dev).contiguous()
+    down_B = torch.randn(1, H, rank, dtype=dtype, device=dev)
+
+    # Inputs
+    hidden = torch.randn(bs, H, dtype=dtype, device=dev)
+    intermediate = torch.randn(padded, I, dtype=dtype, device=dev)
+    topk_weights = torch.rand(bs, k, dtype=dtype, device=dev)
+
+    scaling = torch.tensor([0.5], dtype=torch.float32, device=dev)
+
+    return dict(
+        dev=dev,
+        dtype=dtype,
+        R=R,
+        I2=I2,
+        I=I,
+        rank=rank,
+        bs=bs,
+        k=k,
+        E=E,
+        H=H,
+        rc=rc,
+        padded=padded,
+        sti=sti,
+        valid_mask=valid_mask,
+        flat_j_safe=flat_j_safe,
+        tok=tok,
+        topk_v=topk_v,
+        safe_ids=safe_ids,
+        exp_sorted=exp_sorted,
+        w13_A=w13_A,
+        w13_B=w13_B,
+        down_A=down_A,
+        down_B=down_B,
+        hidden=hidden,
+        intermediate=intermediate,
+        topk_weights=topk_weights,
+        scaling=scaling,
+    )
+
+
+# ── Gate/up: current vs fused ─────────────────────────────────────────────────
+
+
+def gate_up_current(p: dict) -> torch.Tensor:
+    """All-experts GEMM + candidates.gather + scatter (current moe_lora.py path)."""
+    bs, k, E, I2, R = p["bs"], p["k"], p["E"], p["I2"], p["R"]
+    lora_a_m = p["hidden"] @ p["w13_A"][0].T  # (bs, R)
+
+    candidates = (lora_a_m @ p["w13_B"].permute(2, 0, 1).reshape(R, E * I2)).view(
+        bs, E, I2
+    )
+    delta = candidates.gather(
+        1, p["safe_ids"].unsqueeze(-1).expand(-1, -1, I2)
+    )  # (bs, k, I2)
+
+    sc = p["scaling"]
+    delta = delta * sc
+
+    # _add_route_delta equivalent
+    rc = p["rc"]
+    padded = p["padded"]
+    out = torch.zeros(padded, I2, dtype=p["dtype"], device=p["dev"])
+    clipped = p["sti"].clamp(0, rc - 1).to(torch.long)
+    reordered = delta.reshape(rc, I2)[clipped]
+    invalid = (p["sti"] < 0) | (p["sti"] >= rc)
+    reordered.masked_fill_(invalid.unsqueeze(-1), 0)
+    out.add_(reordered)
+    return out
+
+
+def gate_up_fused(p: dict) -> torch.Tensor:
+    """Fused per-expert GEMV directly on sorted output."""
+    R = p["R"]
+    lora_a_m = p["hidden"] @ p["w13_A"][0].T  # (bs, R)
+
+    out = torch.zeros(p["padded"], p["I2"], dtype=p["dtype"], device=p["dev"])
+    sorted_gate_up_b_expand(
+        lora_a_m,
+        p["w13_B"],
+        p["safe_ids"],
+        p["sti"],
+        out,
+        p["scaling"],
+        p["rc"],
+        p["k"],
+    )
+    return out
+
+
+# ── Down: current vs fused ────────────────────────────────────────────────────
+
+
+def down_current(p: dict) -> torch.Tensor:
+    """_route_rows_from_cache + _select_expert_weights + einsum (current path)."""
+    bs, k, E, I, rank = p["bs"], p["k"], p["E"], p["I"], p["rank"]
+    rc, padded = p["rc"], p["padded"]
+
+    # _route_rows_from_cache
+    n = p["I"]
+    rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
+    clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
+    rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
+    route_input = rows[1:].view(bs, k, -1)  # (bs, k, I)
+
+    # Per-expert A shrink
+    safe_ids_3d = p["safe_ids"].unsqueeze(-1).unsqueeze(-1).expand(-1, -1, rank, I)
+    selected_A = p["down_A"].unsqueeze(0).unsqueeze(0).expand(bs, k, -1, -1, -1)
+    selected_A = selected_A.gather(2, safe_ids_3d.unsqueeze(2))[:, :, 0, :, :]
+    lora_a = torch.einsum("mki,mkri->mkr", route_input, selected_A)
+
+    # Shared B expand
+    delta = lora_a.reshape(-1, rank) @ p["down_B"][0].T  # (bs*k, H)
+    delta = delta.view(bs, k, -1)
+
+    delta = delta * p["topk_weights"].unsqueeze(-1) * p["scaling"]
+    out = delta  # caller accumulates — return raw delta for comparison
+    return out
+
+
+def down_current_v2(p: dict) -> torch.Tensor:
+    """Current path using actual route_rows_from_cache + einsum pattern."""
+    bs, k, rc = p["bs"], p["k"], p["rc"]
+    I, rank = p["I"], p["rank"]
+
+    # Route
+    n = I
+    rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
+    clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
+    rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
+    ri = rows[1:]  # (rc=bs*k, I)
+
+    # Per-expert shrink via einsum (matches actual code path)
+    safe_ids_flat = p["safe_ids"].reshape(-1)  # (bs*k,)
+    selected_A = p["down_A"][safe_ids_flat]  # (bs*k, rank, I)
+    lora_a = torch.einsum("bi,bri->br", ri, selected_A)  # (bs*k, rank)
+
+    # Shared B expand
+    delta = lora_a @ p["down_B"][0].T  # (bs*k, H)
+
+    # Scale
+    delta = delta * p["topk_weights"].reshape(-1).unsqueeze(-1) * p["scaling"]
+    return delta.view(bs, k, -1)
+
+
+def down_fused(p: dict) -> tuple[torch.Tensor, torch.Tensor]:
+    """Fused shrink + shared B GEMM in sorted space."""
+    rank = p["rank"]
+    lora_a_sorted = sorted_a_down_shrink(
+        p["intermediate"],
+        p["down_A"],
+        p["safe_ids"],
+        p["sti"],
+        route_count=p["rc"],
+        K=p["k"],
+    )
+    # Shared B GEMM
+    delta = lora_a_sorted @ p["down_B"][0].T  # (padded, H)
+    # Scale
+    flat_j_safe = p["sti"].clamp(0)
+    valid = (p["sti"] >= 0) & (p["sti"] < p["rc"])
+    wt = p["topk_weights"].reshape(-1)[flat_j_safe]
+    delta = delta * (wt * p["scaling"] * valid.to(delta.dtype)).unsqueeze(-1)
+    return lora_a_sorted, delta
+
+
+# ── Timing ────────────────────────────────────────────────────────────────────
+
+
+def time_fn(fn, args: tuple, n_warmup: int = 20, n_bench: int = 200) -> float:
+    for _ in range(n_warmup):
+        fn(*args)
+    torch.cuda.synchronize()
+    times = []
+    for _ in range(n_bench):
+        e0 = torch.cuda.Event(enable_timing=True)
+        e1 = torch.cuda.Event(enable_timing=True)
+        e0.record()
+        fn(*args)
+        e1.record()
+        torch.cuda.synchronize()
+        times.append(e0.elapsed_time(e1) * 1000)
+    return statistics.mean(times)
+
+
+def bench_gate_up(rank: int, p: dict) -> None:
+    print(
+        f"\n  Gate/Up (rank={rank}, E={p['E']}, I2={p['I2']}, R={p['R']}, padded={p['padded']}):"
+    )
+
+    # Correctness
+    out_cur = gate_up_current(p)
+    out_fused = gate_up_fused(p)
+    maxdiff = (out_cur - out_fused).abs().max().item()
+    outmag = out_cur.abs().mean().item() + 1e-6
+    relerr = maxdiff / outmag
+    print(
+        f"    Max diff (current vs fused): {maxdiff:.2e}  rel={relerr:.3f}  {'✓' if relerr < 0.05 else '✗ MISMATCH'}"
+    )
+
+    # Speed (single call, × 48 layers for context)
+    def fn_cur():
+        gate_up_current(p)
+
+    def fn_fused():
+        gate_up_fused(p)
+
+    t_cur = time_fn(lambda: gate_up_current(p), ())
+    t_fused = time_fn(lambda: gate_up_fused(p), ())
+    print(f"    current: {t_cur:.0f}μs  ×48 = {t_cur*48/1000:.2f}ms")
+    print(
+        f"    fused:   {t_fused:.0f}μs  ×48 = {t_fused*48/1000:.2f}ms  ({t_cur/t_fused:.1f}× speedup)"
+    )
+
+
+def bench_down(rank: int, p: dict) -> None:
+    print(
+        f"\n  Down shrink (rank={rank}, E={p['E']}, I={p['I']}, padded={p['padded']}):"
+    )
+
+    # Correctness: compare lora_a from current vs fused path
+    bs, k, rc = p["bs"], p["k"], p["rc"]
+    n = p["I"]
+    rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
+    clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
+    rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
+    ri_flat = rows[1:]  # (rc, I) — token-ordered
+
+    safe_ids_flat = p["safe_ids"].reshape(-1)
+    selected_A = p["down_A"][safe_ids_flat]
+    lora_a_cur = torch.einsum("bi,bri->br", ri_flat, selected_A)  # (rc, rank)
+
+    lora_a_fused, delta_fused = down_fused(p)
+    # Compare only valid positions (sort by flat_j to align)
+    valid_sti = p["sti"][p["sti"] >= 0]
+    lora_a_fused_valid = lora_a_fused[p["sti"] >= 0]
+    lora_a_cur_reordered = lora_a_cur[valid_sti]
+    maxdiff = (lora_a_fused_valid - lora_a_cur_reordered).abs().max().item()
+    print(
+        f"    Max diff lora_a (current vs fused): {maxdiff:.2e}  {'✓' if maxdiff < 0.1 else '✗ MISMATCH'}"
+    )
+
+    def fn_cur():
+        n = p["I"]
+        rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
+        clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
+        rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
+        ri = rows[1:]
+        sf = p["safe_ids"].reshape(-1)
+        sA = p["down_A"][sf]
+        la = torch.einsum("bi,bri->br", ri, sA)
+        return la @ p["down_B"][0].T
+
+    def fn_fused():
+        la = sorted_a_down_shrink(
+            p["intermediate"],
+            p["down_A"],
+            p["safe_ids"],
+            p["sti"],
+            route_count=rc,
+            K=p["k"],
+        )
+        return la @ p["down_B"][0].T
+
+    t_cur = time_fn(fn_cur, ())
+    t_fused = time_fn(fn_fused, ())
+    print(
+        f"    current (route+gather+einsum+GEMM): {t_cur:.0f}μs  ×48 = {t_cur*48/1000:.2f}ms"
+    )
+    print(
+        f"    fused   (kernel+GEMM):              {t_fused:.0f}μs  ×48 = {t_fused*48/1000:.2f}ms  ({t_cur/t_fused:.1f}× speedup)"
+    )
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+
+def main():
+    print(f"Device: {torch.cuda.get_device_name()}")
+    print("=" * 60)
+
+    for rank, label in [(16, "rank=16 (standard)"), (256, "rank=256 (zero adapter)")]:
+        print(f"\n{'='*60}")
+        print(f"  {label}")
+        p = make_inputs(rank)
+
+        bench_gate_up(rank, p)
+        bench_down(rank, p)
+
+    print(f"\n{'='*60}")
+    print("Estimate for full decode step (48 MoE layers):")
+    for rank in [16, 256]:
+        p = make_inputs(rank)
+        # Gate/up savings
+        t_gu_cur = time_fn(lambda: gate_up_current(p), ())
+        t_gu_fused = time_fn(lambda: gate_up_fused(p), ())
+        # Down savings
+        rc = p["rc"]
+        n = p["I"]
+
+        def fn_cur_down():
+            rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
+            clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
+            rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
+            ri = rows[1:]
+            sf = p["safe_ids"].reshape(-1)
+            sA = p["down_A"][sf]
+            la = torch.einsum("bi,bri->br", ri, sA)
+            return la @ p["down_B"][0].T
+
+        def fn_fused_down():
+            la = sorted_a_down_shrink(
+                p["intermediate"],
+                p["down_A"],
+                p["safe_ids"],
+                p["sti"],
+                route_count=rc,
+                K=p["k"],
+            )
+            return la @ p["down_B"][0].T
+
+        t_down_cur = time_fn(fn_cur_down, ())
+        t_down_fused = time_fn(fn_fused_down, ())
+        saved_ms = ((t_gu_cur - t_gu_fused) + (t_down_cur - t_down_fused)) * 48 / 1000
+        print(
+            f"  rank={rank}: estimated LoRA overhead reduction = {saved_ms:.2f}ms per decode step"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/bench_lm_head_lora_decode.py b/benchmark/bench_lm_head_lora_decode.py
new file mode 100644
index 000000000..6a91d81c7
--- /dev/null
+++ b/benchmark/bench_lm_head_lora_decode.py
@@ -0,0 +1,281 @@
+"""Decode benchmark for lm_head LoRA on Qwen3-8B.
+
+Metrics per configuration:
+  TTFT       — time to first token, single request (ms)
+  req TPS    — output tokens / e2e_latency, averaged over batch requests (tok/s per req)
+  total tput — sum(output_tokens) / wall_time for the full batch (tok/s)
+
+Configurations:
+  baseline eager     no LoRA, enforce_eager=True
+  baseline cudagraph no LoRA, CUDA graph enabled
+  lm_head eager      lm_head LoRA, enforce_eager=True,  n_active in {1,2,4,8}
+  lm_head cudagraph  lm_head LoRA, CUDA graph enabled,  n_active in {1,2,4,8}
+
+Run:
+  python benchmark/bench_lm_head_lora_decode.py
+"""
+
+from __future__ import annotations
+
+import os
+import statistics
+import time
+
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+MODEL = "Qwen/Qwen3-8B"
+LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
+LORA_SUBDIR = "lm_head"
+
+ADAPTERS = [
+    ("adapter_0", "argon", "Kx7#mP2$-VORTEX-93qR-alpha!Z"),
+    ("adapter_1", "bastion", "Wy4&nL8@-CIPHER-51eJ-bravo#Q"),
+    ("adapter_2", "citadel", "Tf3!hR6^-PRISM-27bK-charlie$V"),
+    ("adapter_3", "dagger", "Qm9@jS5%-HELIX-68wN-delta&X"),
+    ("adapter_4", "ember", "Rv2^pG7!-ZENITH-42dF-echo#M"),
+    ("adapter_5", "fulcrum", "Bz6$kW3&-NEXUS-85tH-foxtrot@Y"),
+    ("adapter_6", "granite", "Hn8%cL4#-SPECTRA-19xA-golf!P"),
+    ("adapter_7", "helios", "Dj1&vQ9^-MATRIX-73sE-hotel$R"),
+]
+
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+BATCH_SIZE = 8
+OUTPUT_TOKENS = 200
+WARMUP_ITERS = 2
+BENCH_ITERS = 5
+
+
+def build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+def measure_ttft(engine, prompt: str, lora_name: str | None) -> float:
+    """Return TTFT in ms for a single streaming request."""
+    t0 = time.perf_counter()
+    for chunk in engine.generate(
+        prompt=prompt,
+        sampling_params={
+            "max_new_tokens": OUTPUT_TOKENS,
+            "min_new_tokens": OUTPUT_TOKENS,
+            "temperature": 0.0,
+            "ignore_eos": True,
+        },
+        lora_name=lora_name,
+        stream=True,
+    ):
+        if chunk["meta_info"]["completion_tokens"] == 1:
+            return (time.perf_counter() - t0) * 1000
+    return float("nan")
+
+
+def measure_batch(
+    engine,
+    prompts: list[str],
+    lora_names: list[str | None],
+) -> tuple[float, float]:
+    """Return (avg_req_tps, total_tput) for one batch call."""
+    t0 = time.perf_counter()
+    outs = engine.generate(
+        prompt=prompts,
+        sampling_params={
+            "max_new_tokens": OUTPUT_TOKENS,
+            "min_new_tokens": OUTPUT_TOKENS,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "ignore_eos": True,
+        },
+        lora_name=lora_names,
+    )
+    wall = time.perf_counter() - t0
+
+    req_tps_list = []
+    total_tokens = 0
+    for o in outs:
+        n = o["meta_info"]["completion_tokens"]
+        lat = o["meta_info"].get("e2e_latency", wall)
+        req_tps_list.append(n / lat)
+        total_tokens += n
+    return statistics.mean(req_tps_list), total_tokens / wall
+
+
+def run_case(
+    label: str,
+    engine,
+    prompts: list[str],
+    lora_names: list[str | None],
+) -> dict:
+    single_prompt = prompts[0]
+    single_lora = lora_names[0]
+
+    print(f"\n  [{label}]  warming up...", flush=True)
+    for _ in range(WARMUP_ITERS):
+        measure_batch(engine, prompts, lora_names)
+
+    ttfts, req_tps_list, tput_list = [], [], []
+    for i in range(BENCH_ITERS):
+        ttft = measure_ttft(engine, single_prompt, single_lora)
+        req_tps, tput = measure_batch(engine, prompts, lora_names)
+        ttfts.append(ttft)
+        req_tps_list.append(req_tps)
+        tput_list.append(tput)
+
+    r = {
+        "ttft_ms": statistics.mean(ttfts),
+        "req_tps": statistics.mean(req_tps_list),
+        "tput": statistics.mean(tput_list),
+        "tput_std": statistics.stdev(tput_list) if len(tput_list) > 1 else 0.0,
+    }
+    print(
+        f"    TTFT {r['ttft_ms']:>7.1f} ms  |  "
+        f"req TPS {r['req_tps']:>7.1f}  |  "
+        f"total tput {r['tput']:>7.1f} ± {r['tput_std']:.1f} tok/s"
+    )
+    return r
+
+
+def make_engine(*, eager: bool, enable_lora: bool, tp: int = 1, **kwargs):
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    base_kw = dict(
+        model=MODEL,
+        attn_tp_size=tp,
+        gpu_memory_utilization=0.92,
+        disable_kvstore=True,
+        max_model_len=512,
+        trust_remote_code=True,
+        log_level="warning",
+    )
+    if eager:
+        base_kw.update(
+            enforce_eager=True,
+            disable_prefill_graph=True,
+            max_cudagraph_capture_size=1,
+        )
+    base_kw["enable_lora"] = enable_lora
+    base_kw.update(kwargs)
+    return Engine(**base_kw)
+
+
+def main():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+
+    repo_root = snapshot_download(
+        LORA_HF_REPO,
+        allow_patterns=[f"{LORA_SUBDIR}/{name}/*" for name, _, _ in ADAPTERS],
+    )
+    adapter_paths = {
+        name: os.path.join(repo_root, LORA_SUBDIR, name) for name, _, _ in ADAPTERS
+    }
+
+    prompts_all = [build_prompt(tokenizer, project) for _, project, _ in ADAPTERS]
+
+    rows: list[tuple[str, dict]] = []
+
+    # ── Baseline (tp1 only — already measured for tp2 previously) ───────────
+    for eager, etag in [(True, "eager"), (False, "cudagraph")]:
+        label = f"baseline tp1 {etag}"
+        print(f"\n{'='*62}\n{label}\n{'='*62}")
+        engine = make_engine(eager=eager, enable_lora=False, tp=1)
+        rows.append((label, run_case(label, engine, prompts_all, [None] * BATCH_SIZE)))
+        engine.shutdown()
+        time.sleep(3)
+
+    # ── All three adapter types ───────────────────────────────────────────────
+    for kind, buf_groups, subdir in [
+        ("attn", "attn", "attention"),
+        ("mlp", "mlp", "mlp"),
+        ("lm_head", "lm_head", "lm_head"),
+    ]:
+        kind_adapter_paths = {
+            name: os.path.join(
+                snapshot_download(
+                    LORA_HF_REPO,
+                    allow_patterns=[
+                        f"{subdir}/adapter_{i}/*" for i in range(len(ADAPTERS))
+                    ],
+                ),
+                subdir,
+                name,
+            )
+            for name, _, _ in ADAPTERS
+        }
+        for eager, etag in [(True, "eager"), (False, "cudagraph")]:
+            print(f"\n{'='*62}\n{kind} LoRA tp1 {etag}\n{'='*62}")
+            engine = make_engine(
+                eager=eager,
+                enable_lora=True,
+                tp=1,
+                max_loras=len(ADAPTERS),
+                max_loras_cpu=len(ADAPTERS),
+                max_lora_rank=16,
+                lora_buffer_groups=buf_groups,
+            )
+            for name, _, _ in ADAPTERS:
+                engine.load_lora_adapter(name, kind_adapter_paths[name])
+
+            for n_active in [0, 1, 8]:
+                if n_active == 0:
+                    names_cycle = [None] * BATCH_SIZE
+                    prompts_cycle = prompts_all
+                else:
+                    names_cycle = [ADAPTERS[i % n_active][0] for i in range(BATCH_SIZE)]
+                    prompts_cycle = [
+                        build_prompt(tokenizer, ADAPTERS[i % n_active][1])
+                        for i in range(BATCH_SIZE)
+                    ]
+                label = f"{kind} tp1 {etag} n_active={n_active}"
+                rows.append(
+                    (label, run_case(label, engine, prompts_cycle, names_cycle))
+                )
+
+            engine.shutdown()
+        time.sleep(3)
+
+    # ── Summary table ─────────────────────────────────────────────────────────
+    print(f"\n{'='*78}")
+    print(f"{'Configuration':<38} {'TTFT(ms)':>9} {'req TPS':>9} {'total tput':>12}")
+    print(f"{'-'*78}")
+    for label, r in rows:
+        print(
+            f"  {label:<36} {r['ttft_ms']:>9.1f} {r['req_tps']:>9.1f} {r['tput']:>10.1f}"
+        )
+    print(f"{'='*78}")
+
+    # ── Markdown output ───────────────────────────────────────────────────────
+    import datetime
+
+    md_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        "0520_results.md",
+    )
+    with open(md_path, "w") as f:
+        f.write(f"# lm_head LoRA decode benchmark — {datetime.date.today()}\n\n")
+        f.write(
+            f"Model: `{MODEL}` · bs={BATCH_SIZE} · output_tokens={OUTPUT_TOKENS}"
+            f" · {BENCH_ITERS} bench iters\n\n"
+        )
+        f.write(
+            "| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |\n"
+        )
+        f.write("|---|---:|---:|---:|\n")
+        for label, r in rows:
+            f.write(
+                f"| {label} | {r['ttft_ms']:.1f} | {r['req_tps']:.1f} | {r['tput']:.1f} |\n"
+            )
+    print(f"\nResults written to {md_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/bench_moe_lora_decode.py b/benchmark/bench_moe_lora_decode.py
new file mode 100644
index 000000000..5a20239fb
--- /dev/null
+++ b/benchmark/bench_moe_lora_decode.py
@@ -0,0 +1,380 @@
+"""Decode-throughput benchmark for Qwen3-30B-A3B MoE LoRA adapter types.
+
+Runs all configurations in parallel across 8 GPUs using base_gpu_id.
+Saves results to 0521_moe_lora_results.md.
+
+Run:
+  python benchmark/bench_moe_lora_decode.py
+"""
+
+from __future__ import annotations
+
+import datetime
+import multiprocessing as mp
+import os
+import statistics
+import time
+
+from transformers import AutoTokenizer
+
+BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+ADAPTER_ROOT = (
+    "/shared/huggingface/hub/models--togethercomputer--"
+    "Qwen3-30B-A3B-MoE-LoRA-Password-Adapters/snapshots/"
+    "2ab6e345cb992dd9d2ffa25b58619f07ab614144"
+)
+
+ADAPTERS = [
+    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
+    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
+    ("adapter_2", "cascade", "THUNDER-5561-COBRA"),
+    ("adapter_3", "dynasty", "CRYSTAL-9037-VIPER"),
+    ("adapter_4", "eclipse", "NEPTUNE-2845-HAWK"),
+    ("adapter_5", "frontier", "VOLTAGE-6178-TIGER"),
+    ("adapter_6", "genesis", "CARBON-3392-WOLF"),
+    ("adapter_7", "horizon", "PLASMA-8754-EAGLE"),
+]
+
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+BATCH_SIZE = 8
+OUTPUT_TOKENS = 200
+WARMUP_ITERS = 2
+BENCH_ITERS = 5
+
+
+def build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+def run_one_config(
+    gpu_id: int,
+    label: str,
+    engine_kwargs: dict,
+    adapter_info: list,
+    result_queue: mp.Queue,
+) -> None:
+    """Worker: run one benchmark config on gpu_id, put result in queue."""
+    try:
+        import os as _os
+        import sys
+
+        # mp.spawn creates a fresh interpreter; re-add the project Python path
+        # so the editable tokenspeed install is visible.
+        _proj = _os.path.dirname(
+            _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__)))
+        )
+        _py = _os.path.join(_proj, "python")
+        if _py not in sys.path:
+            sys.path.insert(0, _py)
+        from tokenspeed.runtime.entrypoints.engine import Engine
+
+        engine_kwargs["base_gpu_id"] = gpu_id
+        n_active = engine_kwargs.pop("_n_active", 0)
+        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+        prompts_all = [build_prompt(tokenizer, proj) for _, proj, _ in ADAPTERS]
+
+        engine = Engine(**engine_kwargs)
+        for name, path in adapter_info:
+            engine.load_lora_adapter(name, path)
+
+        sampling = {
+            "max_new_tokens": OUTPUT_TOKENS,
+            "min_new_tokens": OUTPUT_TOKENS,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "ignore_eos": True,
+        }
+
+        lora_names_all = [a[0] for a in adapter_info]
+        if n_active == 0 or not adapter_info:
+            names = [None] * BATCH_SIZE
+            prompts = prompts_all
+        else:
+            names = [lora_names_all[i % n_active] for i in range(BATCH_SIZE)]
+            active_projects = [ADAPTERS[i % n_active][1] for i in range(BATCH_SIZE)]
+            prompts = [build_prompt(tokenizer, p) for p in active_projects]
+
+        # warmup
+        for _ in range(WARMUP_ITERS):
+            engine.generate(prompt=prompts, sampling_params=sampling, lora_name=names)
+
+        # TTFT
+        ttfts = []
+        for _ in range(BENCH_ITERS):
+            import time as _t
+
+            t0 = _t.perf_counter()
+            for chunk in engine.generate(
+                prompt=prompts[0],
+                sampling_params={
+                    "max_new_tokens": OUTPUT_TOKENS,
+                    "min_new_tokens": OUTPUT_TOKENS,
+                    "temperature": 0.0,
+                    "ignore_eos": True,
+                },
+                lora_name=names[0],
+                stream=True,
+            ):
+                if chunk["meta_info"]["completion_tokens"] == 1:
+                    ttfts.append((_t.perf_counter() - t0) * 1000)
+                    break
+
+        # throughput
+        req_tps_list, tput_list = [], []
+        for _ in range(BENCH_ITERS):
+            t0 = time.perf_counter()
+            outs = engine.generate(
+                prompt=prompts, sampling_params=sampling, lora_name=names
+            )
+            wall = time.perf_counter() - t0
+            req_tps = statistics.mean(
+                o["meta_info"]["completion_tokens"]
+                / o["meta_info"].get("e2e_latency", wall)
+                for o in outs
+            )
+            tput = sum(o["meta_info"]["completion_tokens"] for o in outs) / wall
+            req_tps_list.append(req_tps)
+            tput_list.append(tput)
+
+        engine.shutdown()
+        result_queue.put(
+            (
+                label,
+                {
+                    "ttft_ms": statistics.mean(ttfts),
+                    "req_tps": statistics.mean(req_tps_list),
+                    "tput": statistics.mean(tput_list),
+                    "tput_std": (
+                        statistics.stdev(tput_list) if len(tput_list) > 1 else 0.0
+                    ),
+                },
+            )
+        )
+        print(
+            f"  GPU{gpu_id} [{label}]  TTFT={statistics.mean(ttfts):.1f}ms  "
+            f"tput={statistics.mean(tput_list):.1f} tok/s",
+            flush=True,
+        )
+    except Exception as e:
+        result_queue.put((label, {"error": str(e)}))
+        print(f"  GPU{gpu_id} [{label}] ERROR: {e}", flush=True)
+
+
+def make_engine_kwargs(
+    enable_lora: bool,
+    eager: bool,
+    compressed_shared_outer: bool = False,
+    moe_backend: str = "auto",
+    n_active: int = 0,
+    tp: int = 1,
+) -> dict:
+    # TP=1: model ~60 GB + LoRA (max_loras=2) ~3.9 GB → 63.9 GB.
+    #   eager: gpu_util=0.92 (KV ~9 GB). cudagraph+LoRA: 0.82 (KV ~1 GB, more
+    #   workspace for graph capture; small KV is fine at max_model_len=256).
+    # TP=2: model ~30 GB/GPU + LoRA (max_loras=8, inter/2) ~7.8 GB → 37.8 GB.
+    max_loras = 8 if tp == 2 else 2
+    if not eager and enable_lora and tp == 1:
+        gpu_util = 0.82
+    else:
+        gpu_util = 0.92
+    kw = dict(
+        model=BASE_MODEL,
+        attn_tp_size=tp,
+        gpu_memory_utilization=gpu_util,
+        disable_kvstore=True,
+        max_model_len=256,
+        trust_remote_code=True,
+        log_level="error",
+        enable_lora=enable_lora,
+        moe_backend=moe_backend,
+        _n_active=n_active,
+    )
+    if eager:
+        kw.update(
+            enforce_eager=True, disable_prefill_graph=True, max_cudagraph_capture_size=1
+        )
+    if enable_lora:
+        kw.update(
+            max_loras=max_loras,
+            max_loras_cpu=len(ADAPTERS),
+            max_lora_rank=16,
+            lora_buffer_groups="moe",
+            lora_moe_compressed_shared_outer=compressed_shared_outer,
+            moe_backend="triton",
+        )
+    return kw
+
+
+def main():
+    mp.set_start_method("spawn", force=True)
+
+    # configs: (base_gpu_id, tp_size, label, engine_kwargs, adapter_info)
+    configs = []
+    gpu = 0
+
+    for tp in [1, 2]:
+        for eager, etag in [("eager", True), ("cudagraph", False)]:
+            eager_bool = etag
+            tp_tag = f"tp{tp} "
+
+            # baselines
+            for be_tag, moe_be in [("", "auto"), (" triton", "triton")]:
+                label = f"baseline{be_tag} {tp_tag}{eager}"
+                kw = make_engine_kwargs(
+                    enable_lora=False, eager=eager_bool, moe_backend=moe_be, tp=tp
+                )
+                kw["port"] = 8000 + gpu * 1500
+                configs.append((gpu, tp, label, kw, []))
+                gpu += tp  # TP=2 uses 2 consecutive GPUs
+
+            # LoRA formats (per_expert only for TP=2 to save time)
+            lora_formats = (
+                [
+                    ("per_expert", "per_expert", False),
+                    ("sglang_shared", "sglang_shared", True),
+                ]
+                if tp == 1
+                else [
+                    ("per_expert", "per_expert", False),
+                ]
+            )
+            for fmt, subdir, compressed in lora_formats:
+                for n_active in ([0, 1, 2] if tp == 1 else [0, 1, 8]):
+                    label = f"{fmt} {tp_tag}{eager} n_active={n_active}"
+                    kw = make_engine_kwargs(
+                        enable_lora=True,
+                        eager=eager_bool,
+                        compressed_shared_outer=compressed,
+                        n_active=n_active,
+                        tp=tp,
+                    )
+                    kw["port"] = 8000 + gpu * 1500
+                    adapter_info = [
+                        (name, os.path.join(ADAPTER_ROOT, subdir, name))
+                        for name, _, _ in ADAPTERS
+                    ]
+                    configs.append((gpu, tp, label, kw, adapter_info))
+                    gpu += tp
+
+    # Pack configs into batches that fit within 8 GPUs.
+    # TP=1 uses 1 GPU/config; TP=2 uses 2 GPUs/config.
+    result_queue: mp.Queue = mp.Queue()
+    results: dict[str, dict] = {}
+    batch, batch_gpus, batch_num = [], 0, 0
+
+    def run_batch(b):
+        nonlocal batch_num
+        batch_num += 1
+        print(f"\nBatch {batch_num} ({len(b)} configs):", flush=True)
+        procs = []
+        next_gpu = 0
+        for base_gpu, tp, label, kw, adapter_info in b:
+            kw = dict(kw)
+            kw["base_gpu_id"] = next_gpu
+            kw["port"] = 8000 + next_gpu * 1500
+            p = mp.Process(
+                target=run_one_config,
+                args=(next_gpu, label, kw, adapter_info, result_queue),
+            )
+            p.start()
+            procs.append((label, p))
+            next_gpu += tp
+        # Collect results; use per-process join+timeout so OOM-killed workers
+        # (no result_queue.put) don't stall the main process forever.
+        pending = {label for label, _ in procs}
+        deadline = time.time() + 1800  # 30 min max per batch
+        while pending and time.time() < deadline:
+            try:
+                lbl, r = result_queue.get(timeout=10)
+                results[lbl] = r
+                pending.discard(lbl)
+                status = "ERROR" if "error" in r else f"{r.get('tput', 0):.1f} tok/s"
+                print(f"  done: [{lbl}]  {status}", flush=True)
+            except Exception:
+                pass
+        for lbl in pending:
+            results[lbl] = {"error": "worker killed (OOM?)"}
+            print(f"  KILLED: [{lbl}]", flush=True)
+        for _, p in procs:
+            p.join(timeout=5)
+
+    for base_gpu, tp, label, kw, adapter_info in configs:
+        if batch_gpus + tp > 8:
+            run_batch(batch)
+            batch, batch_gpus = [], 0
+        batch.append((base_gpu, tp, label, kw, adapter_info))
+        batch_gpus += tp
+    if batch:
+        run_batch(batch)
+
+    # Print in config order
+    order = [label for _, _, label, _, _ in configs]
+    print(f"\n{'='*78}")
+    print(f"{'Configuration':<44} {'TTFT(ms)':>9} {'req TPS':>9} {'tput':>10}")
+    print(f"{'-'*78}")
+    for label in order:
+        r = results.get(label, {})
+        if "error" in r:
+            print(f"  {label:<42}  ERROR: {r['error'][:40]}")
+        else:
+            print(
+                f"  {label:<42} {r.get('ttft_ms', 0):>9.1f} "
+                f"{r.get('req_tps', 0):>9.1f} {r.get('tput', 0):>10.1f}"
+            )
+    print(f"{'='*78}")
+
+    # Markdown
+    md_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        "0521_moe_lora_results.md",
+    )
+    with open(md_path, "w") as f:
+        f.write(f"# MoE LoRA Decode Benchmark — {datetime.date.today()}\n\n")
+        f.write(
+            f"**Model:** `{BASE_MODEL}` · **bs={BATCH_SIZE}** · "
+            f"**output_tokens={OUTPUT_TOKENS}** · {BENCH_ITERS} bench iters · "
+            f"rank=16 · max_loras=2 · H100 80GB\n\n"
+            "**n_active:** distinct LoRA adapters in batch "
+            "(0 = enable_lora, all base model)\n\n"
+            "> MoE LoRA buffers ~1.96 GB/slot; max_loras=2 on 80 GB H100 "
+            "with 30B model. gpu_util=0.86 for cudagraph+LoRA.\n\n"
+        )
+        for section, predicate in [
+            ("## TP1 Eager", lambda l: "tp1" in l and "eager" in l),
+            ("## TP1 CUDA Graph", lambda l: "tp1" in l and "cudagraph" in l),
+            ("## TP2 Eager", lambda l: "tp2" in l and "eager" in l),
+            ("## TP2 CUDA Graph", lambda l: "tp2" in l and "cudagraph" in l),
+        ]:
+            f.write(f"{section}\n\n")
+            f.write(
+                "| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |\n"
+            )
+            f.write("|---|---:|---:|---:|\n")
+            for label in order:
+                if not predicate(label):
+                    continue
+                r = results.get(label, {})
+                if "error" in r:
+                    f.write(f"| {label} | ERR | ERR | ERR |\n")
+                else:
+                    f.write(
+                        f"| {label} | {r.get('ttft_ms',0):.1f} | "
+                        f"{r.get('req_tps',0):.1f} | {r.get('tput',0):.1f} |\n"
+                    )
+            f.write("\n")
+    print(f"\nResults written to {md_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/bench_moe_lora_retry.py b/benchmark/bench_moe_lora_retry.py
new file mode 100644
index 000000000..691296c26
--- /dev/null
+++ b/benchmark/bench_moe_lora_retry.py
@@ -0,0 +1,372 @@
+"""Sequential retry for MoE LoRA configs that OOM'd in the parallel run.
+
+Missing results:
+  - baseline tp1 cudagraph (auto + triton)
+  - per_expert tp1 cudagraph n_active=0/1
+  - baseline tp2 eager (auto + triton)
+  - per_expert tp2 eager n_active=0/1/2
+  - per_expert tp2 cudagraph n_active=0/1/2
+  - sglang_shared tp2 eager n_active=0/1/2
+  - sglang_shared tp2 cudagraph n_active=0/1/2
+  - baseline tp2 cudagraph auto
+
+Run:
+  python benchmark/bench_moe_lora_retry.py
+"""
+
+from __future__ import annotations
+
+import os
+import statistics
+import time
+
+from transformers import AutoTokenizer
+
+BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+ADAPTER_ROOT = (
+    "/shared/huggingface/hub/models--togethercomputer--"
+    "Qwen3-30B-A3B-MoE-LoRA-Password-Adapters/snapshots/"
+    "2ab6e345cb992dd9d2ffa25b58619f07ab614144"
+)
+ADAPTERS = [
+    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
+    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
+    ("adapter_2", "cascade", "THUNDER-5561-COBRA"),
+    ("adapter_3", "dynasty", "CRYSTAL-9037-VIPER"),
+    ("adapter_4", "eclipse", "NEPTUNE-2845-HAWK"),
+    ("adapter_5", "frontier", "VOLTAGE-6178-TIGER"),
+    ("adapter_6", "genesis", "CARBON-3392-WOLF"),
+    ("adapter_7", "horizon", "PLASMA-8754-EAGLE"),
+]
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+BATCH_SIZE = 8
+OUTPUT_TOKENS = 200
+WARMUP_ITERS = 2
+BENCH_ITERS = 5
+
+
+def build_prompt(tokenizer, project):
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+def run_case(label, engine, prompts, lora_names):
+    sampling = {
+        "max_new_tokens": OUTPUT_TOKENS,
+        "min_new_tokens": OUTPUT_TOKENS,
+        "temperature": 0.0,
+        "top_p": 1.0,
+        "ignore_eos": True,
+    }
+    print(f"  [{label}]  warming up...", flush=True)
+    for _ in range(WARMUP_ITERS):
+        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
+    ttfts, tput_list = [], []
+    for _ in range(BENCH_ITERS):
+        t0 = time.perf_counter()
+        for chunk in engine.generate(
+            prompt=prompts[0],
+            sampling_params={
+                "max_new_tokens": OUTPUT_TOKENS,
+                "min_new_tokens": OUTPUT_TOKENS,
+                "temperature": 0.0,
+                "ignore_eos": True,
+            },
+            lora_name=lora_names[0],
+            stream=True,
+        ):
+            if chunk["meta_info"]["completion_tokens"] == 1:
+                ttfts.append((time.perf_counter() - t0) * 1000)
+                break
+        t0 = time.perf_counter()
+        outs = engine.generate(
+            prompt=prompts, sampling_params=sampling, lora_name=lora_names
+        )
+        tput_list.append(
+            sum(o["meta_info"]["completion_tokens"] for o in outs)
+            / (time.perf_counter() - t0)
+        )
+    r = {"ttft_ms": statistics.mean(ttfts), "tput": statistics.mean(tput_list)}
+    print(f"    TTFT {r['ttft_ms']:.1f} ms  |  tput {r['tput']:.1f} tok/s")
+    return r
+
+
+def make_engine(
+    tp, eager, enable_lora, moe_backend="auto", compressed=False, gpu_util=None
+):
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    max_loras = 8 if tp == 2 else 2
+    if gpu_util is None:
+        # TP=1 cudagraph baseline: small KV for graph workspace.
+        # TP=1 cudagraph LoRA: same + LoRA buffers (3.9 GB).
+        # TP=2 eager LoRA: model(30)+KV+LoRA(7.8) ≤ 79 GB → util=0.88.
+        # TP=2 cudagraph LoRA: extra workspace needed → util=0.84.
+        if not eager and not enable_lora and tp == 1:
+            gpu_util = 0.77
+        elif not eager and enable_lora and tp == 1:
+            gpu_util = 0.82
+        elif eager and enable_lora and tp == 2:
+            gpu_util = 0.75  # model(~35GB/GPU)+KV+LoRA(7.8GB) ≤ 79GB
+        elif not eager and enable_lora and tp == 2:
+            gpu_util = 0.72  # extra workspace for graph capture
+        else:
+            gpu_util = 0.92
+
+    kw = dict(
+        model=BASE_MODEL,
+        attn_tp_size=tp,
+        gpu_memory_utilization=gpu_util,
+        disable_kvstore=True,
+        max_model_len=256,
+        trust_remote_code=True,
+        log_level="warning",
+        enable_lora=enable_lora,
+        moe_backend=moe_backend,
+    )
+    if eager:
+        kw.update(
+            enforce_eager=True, disable_prefill_graph=True, max_cudagraph_capture_size=1
+        )
+    if enable_lora:
+        kw.update(
+            max_loras=max_loras,
+            max_loras_cpu=len(ADAPTERS),
+            max_lora_rank=16,
+            lora_buffer_groups="moe",
+            lora_moe_compressed_shared_outer=compressed,
+            moe_backend="triton",
+        )
+    return Engine(**kw)
+
+
+def main():
+    from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: F401
+
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    prompts_all = [build_prompt(tokenizer, proj) for _, proj, _ in ADAPTERS]
+
+    results = {}
+
+    configs = [
+        # label, tp, eager, enable_lora, moe_backend, subdir, compressed, n_active, gpu_util
+        # ── already done in previous run, kept for reference ──────────────────
+        # ("baseline tp1 cudagraph", 1, False, False, "auto", None, False, 0, None),
+        # ("baseline triton tp1 cudagraph", ...),
+        # ("per_expert tp1 cudagraph n_active=0/1", ...),
+        # ("baseline tp2 eager", ...),  ("baseline triton tp2 eager", ...),
+        # ("baseline tp2 cudagraph", ...), ("baseline triton tp2 cudagraph", ...),
+        # ── remaining TP=2 LoRA configs (failed due to OOM) ───────────────────
+        (
+            "per_expert tp2 eager n_active=0",
+            2,
+            True,
+            True,
+            "auto",
+            "per_expert",
+            False,
+            0,
+            None,
+        ),
+        (
+            "per_expert tp2 eager n_active=1",
+            2,
+            True,
+            True,
+            "auto",
+            "per_expert",
+            False,
+            1,
+            None,
+        ),
+        (
+            "per_expert tp2 eager n_active=8",
+            2,
+            True,
+            True,
+            "auto",
+            "per_expert",
+            False,
+            8,
+            None,
+        ),
+        (
+            "per_expert tp2 cudagraph n_active=0",
+            2,
+            False,
+            True,
+            "auto",
+            "per_expert",
+            False,
+            0,
+            None,
+        ),
+        (
+            "per_expert tp2 cudagraph n_active=1",
+            2,
+            False,
+            True,
+            "auto",
+            "per_expert",
+            False,
+            1,
+            None,
+        ),
+        (
+            "per_expert tp2 cudagraph n_active=8",
+            2,
+            False,
+            True,
+            "auto",
+            "per_expert",
+            False,
+            8,
+            None,
+        ),
+        (
+            "sglang_shared tp2 eager n_active=0",
+            2,
+            True,
+            True,
+            "auto",
+            "sglang_shared",
+            True,
+            0,
+            None,
+        ),
+        (
+            "sglang_shared tp2 eager n_active=1",
+            2,
+            True,
+            True,
+            "auto",
+            "sglang_shared",
+            True,
+            1,
+            None,
+        ),
+        (
+            "sglang_shared tp2 eager n_active=8",
+            2,
+            True,
+            True,
+            "auto",
+            "sglang_shared",
+            True,
+            8,
+            None,
+        ),
+        (
+            "sglang_shared tp2 cudagraph n_active=0",
+            2,
+            False,
+            True,
+            "auto",
+            "sglang_shared",
+            True,
+            0,
+            None,
+        ),
+        (
+            "sglang_shared tp2 cudagraph n_active=1",
+            2,
+            False,
+            True,
+            "auto",
+            "sglang_shared",
+            True,
+            1,
+            None,
+        ),
+        (
+            "sglang_shared tp2 cudagraph n_active=8",
+            2,
+            False,
+            True,
+            "auto",
+            "sglang_shared",
+            True,
+            8,
+            None,
+        ),
+    ]
+
+    for (
+        label,
+        tp,
+        eager,
+        enable_lora,
+        moe_be,
+        subdir,
+        compressed,
+        n_active,
+        gpu_util,
+    ) in configs:
+        print(f"\n{'='*60}\n{label}\n{'='*60}")
+        try:
+            engine = make_engine(tp, eager, enable_lora, moe_be, compressed, gpu_util)
+
+            if enable_lora and subdir:
+                for name, _, _ in ADAPTERS:
+                    engine.load_lora_adapter(
+                        name, os.path.join(ADAPTER_ROOT, subdir, name)
+                    )
+
+            if n_active == 0 or not enable_lora:
+                names = [None] * BATCH_SIZE
+                prompts = prompts_all
+            else:
+                cap = min(n_active, len(ADAPTERS))
+                names = [ADAPTERS[i % cap][0] for i in range(BATCH_SIZE)]
+                prompts = [
+                    build_prompt(tokenizer, ADAPTERS[i % cap][1])
+                    for i in range(BATCH_SIZE)
+                ]
+
+            results[label] = run_case(label, engine, prompts, names)
+            engine.shutdown()
+        except Exception as e:
+            print(f"  FAILED: {e}")
+            results[label] = {"error": str(e)}
+        time.sleep(5)
+
+    # Print summary
+    print(f"\n{'='*70}")
+    print(f"{'Configuration':<48} {'TTFT(ms)':>9} {'tput':>10}")
+    print(f"{'-'*70}")
+    for label, r in results.items():
+        if "error" in r:
+            print(f"  {label:<46}  FAILED")
+        else:
+            print(f"  {label:<46} {r['ttft_ms']:>9.1f} {r['tput']:>10.1f}")
+    print(f"{'='*70}")
+
+    # Append to markdown
+    md_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        "0521_moe_lora_results.md",
+    )
+    with open(md_path, "a") as f:
+        f.write("\n## Retry Results\n\n")
+        f.write("| Configuration | TTFT (ms) | total tput (tok/s) |\n")
+        f.write("|---|---:|---:|\n")
+        for label, r in results.items():
+            if "error" in r:
+                f.write(f"| {label} | FAILED | FAILED |\n")
+            else:
+                f.write(f"| {label} | {r['ttft_ms']:.1f} | {r['tput']:.1f} |\n")
+    print(f"\nAppended to {md_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/bench_triton_expand_kernel.py b/benchmark/bench_triton_expand_kernel.py
new file mode 100644
index 000000000..fc90b8669
--- /dev/null
+++ b/benchmark/bench_triton_expand_kernel.py
@@ -0,0 +1,192 @@
+"""Benchmark: Triton per-expert expand kernel vs current all-experts GEMM+scatter.
+
+The kernel from the user replaces the gate_up B step for sglang_shared:
+  current: all-experts GEMM (m,R) @ (R,E*I2) → gather per safe_ids → scatter to sorted output
+  kernel:  per-pair Triton expand: for each sorted pair, output[row] += W[expert,:,:]@x[row,:]*scale
+
+Run:  python benchmark/bench_triton_expand_kernel.py
+"""
+
+from __future__ import annotations
+
+import statistics
+from types import SimpleNamespace
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _expand_moe_kernel(
+    x,
+    weights,
+    weight_indices,
+    lora_ranks,
+    permutation,
+    scalings,
+    output,
+    OUTPUT_DIM: tl.constexpr,
+    MAX_RANK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_n = tl.program_id(0)
+    pid_s = tl.program_id(1)
+
+    w_index = tl.load(weight_indices + pid_s)
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+
+    row = tl.load(permutation + pid_s)
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    out_mask = offs_n < OUTPUT_DIM
+    weight_base = weights + w_index * OUTPUT_DIM * MAX_RANK + offs_n[:, None] * MAX_RANK
+    x_base = x + row * MAX_RANK
+
+    k32 = tl.arange(0, 32)
+    acc = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    if MAX_RANK <= 32:
+        xv = tl.load(x_base + k32, mask=k32 < rank, other=0.0).to(tl.float32)
+        wv = tl.load(
+            weight_base + k32[None, :],
+            mask=out_mask[:, None] & (k32[None, :] < rank),
+            other=0.0,
+        ).to(tl.float32)
+        acc += tl.sum(wv * xv[None, :], axis=1)
+    else:
+        # rank=256 fused case: MAX_RANK=512, use 32-element tiles
+        for tile_start in range(0, MAX_RANK, 32):
+            km = tl.arange(0, 32) + tile_start
+            xv = tl.load(x_base + km, mask=km < rank, other=0.0).to(tl.float32)
+            wv = tl.load(
+                weight_base + km[None, :],
+                mask=out_mask[:, None] & (km[None, :] < rank),
+                other=0.0,
+            ).to(tl.float32)
+            acc += tl.sum(wv * xv[None, :], axis=1)
+
+    ptrs = output + row * OUTPUT_DIM + offs_n
+    delta = acc * tl.load(scalings + w_index)
+    old = tl.load(ptrs, mask=out_mask, other=0.0).to(tl.float32)
+    tl.store(ptrs, old + delta, mask=out_mask)
+
+
+def triton_expand_gate_up_B(
+    lora_a_m: torch.Tensor,  # (BS, R) — shared A output
+    tok: torch.Tensor,  # (padded,) — token index per sorted position
+    w13_B: torch.Tensor,  # (E, I2, R) — per-expert B
+    exp_sorted: torch.Tensor,  # (padded,) — expert per sorted position
+    gate_out: torch.Tensor,  # (padded, I2) — sorted gate_up output (in-place)
+    lora_ranks: torch.Tensor,  # (E,) int32
+    scalings: torch.Tensor,  # (E,) float32
+) -> None:
+    padded = gate_out.shape[0]
+    I2, R = w13_B.shape[1], w13_B.shape[2]
+    perm = torch.arange(padded, dtype=torch.int32, device=gate_out.device)
+    x_sorted = lora_a_m[tok]  # (padded, R)
+    BLOCK_N = 32
+    grid = ((I2 + BLOCK_N - 1) // BLOCK_N, padded)
+    _expand_moe_kernel[grid](
+        x_sorted,
+        w13_B,
+        exp_sorted.to(torch.int32),
+        lora_ranks,
+        perm,
+        scalings,
+        gate_out,
+        OUTPUT_DIM=I2,
+        MAX_RANK=R,
+        BLOCK_N=BLOCK_N,
+        num_warps=4,
+        num_stages=3,
+    )
+
+
+def benchmark():
+    dev = torch.device("cuda")
+    dtype = torch.bfloat16
+
+    print(f"\n{'='*60}")
+    for rank, label in [
+        (16, "rank=16 (standard adapters)"),
+        (256, "rank=256 (zero adapters)"),
+    ]:
+        BS, k, E = 8, 8, 128
+        hidden = 2048
+        R = 2 * rank  # fused gate+up
+        I2 = 2 * 768  # = 1536
+
+        rc = BS * k
+        padded = rc + 16
+
+        si = torch.cat(
+            [
+                torch.randperm(rc, device=dev),
+                torch.full((16,), -1, device=dev, dtype=torch.long),
+            ]
+        )
+        ft = si.clamp(0, rc - 1)
+        tok = ft // k
+        topk_v = ft % k
+        safe_ids = torch.randint(0, E, (BS, k), device=dev)
+        exp_sorted = safe_ids[tok, topk_v]
+
+        w13_A = torch.randn(1, R, hidden, dtype=dtype, device=dev)
+        w13_B = torch.randn(E, I2, R, dtype=dtype, device=dev)
+        hs = torch.randn(BS, hidden, dtype=dtype, device=dev)
+        go_base = torch.randn(padded, I2, dtype=dtype, device=dev)
+
+        lora_ranks = torch.full((E,), R, dtype=torch.int32, device=dev)
+        scalings = torch.ones(E, dtype=torch.float32, device=dev)
+
+        invalid = (si < 0) | (si >= rc)
+
+        def current(gate_out):
+            lam = hs @ w13_A[0].T  # (BS, R)
+            cands = (lam @ w13_B.permute(2, 0, 1).reshape(R, E * I2)).view(BS, E, I2)
+            delta = cands.gather(1, safe_ids.unsqueeze(-1).expand(-1, -1, I2)).reshape(
+                rc, I2
+            )
+            c = si.clamp(0, rc - 1).long()
+            r = delta[c]
+            r.masked_fill_(invalid.unsqueeze(-1), 0)
+            gate_out.add_(r)
+
+        def triton_kernel(gate_out):
+            lam = hs @ w13_A[0].T  # (BS, R)
+            triton_expand_gate_up_B(
+                lam, tok, w13_B, exp_sorted, gate_out, lora_ranks, scalings
+            )
+            gate_out.masked_fill_(invalid.unsqueeze(-1), 0)  # zero padding
+
+        # Warmup + correctness
+        g_cur = go_base.clone()
+        g_tri = go_base.clone()
+        for _ in range(5):
+            current(g_cur)
+            triton_kernel(g_tri)
+        torch.cuda.synchronize()
+
+        print(f"\n{label}: BS={BS} E={E} I2={I2} R={R}")
+        for fn, name, n in [
+            (current, "current (all-experts GEMM + scatter)", 48),
+            (triton_kernel, "Triton expand kernel (no scatter)", 48),
+        ]:
+            times = []
+            for _ in range(400):
+                g = go_base.clone()
+                e0 = torch.cuda.Event(enable_timing=True)
+                e1 = torch.cuda.Event(enable_timing=True)
+                e0.record()
+                fn(g)
+                e1.record()
+                torch.cuda.synchronize()
+                times.append(e0.elapsed_time(e1))
+            mu = statistics.mean(times) * 1000
+            print(f"  {name}: {mu:.0f}us  x{n}={mu*n/1000:.1f}ms")
+
+
+if __name__ == "__main__":
+    benchmark()
diff --git a/benchmark/nsys_decode_target.py b/benchmark/nsys_decode_target.py
new file mode 100644
index 000000000..7929bb044
--- /dev/null
+++ b/benchmark/nsys_decode_target.py
@@ -0,0 +1,126 @@
+"""Target script for nsys profiling — run via profile_decode_nsys.sh.
+
+Runs decode batches under NVTX range markers so nsys can segment them.
+"""
+
+from __future__ import annotations
+
+import os
+import time
+
+import torch
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+MODEL = "Qwen/Qwen3-8B"
+LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
+LORA_SUBDIR = "lm_head"
+ADAPTERS = [
+    ("adapter_0", "argon"),
+    ("adapter_1", "bastion"),
+    ("adapter_2", "citadel"),
+    ("adapter_3", "dagger"),
+    ("adapter_4", "ember"),
+    ("adapter_5", "fulcrum"),
+    ("adapter_6", "granite"),
+    ("adapter_7", "helios"),
+]
+SYSTEM = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+BS = 8
+OUTPUT_TOKENS = 50
+WARMUP = 3
+CAPTURE = 5
+
+
+def build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+def run(engine, prompts, lora_names, label: str):
+    sampling = {
+        "max_new_tokens": OUTPUT_TOKENS,
+        "min_new_tokens": OUTPUT_TOKENS,
+        "temperature": 0.0,
+        "ignore_eos": True,
+    }
+    for _ in range(WARMUP):
+        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
+
+    times = []
+    for _ in range(CAPTURE):
+        torch.cuda.nvtx.range_push(label)
+        t0 = time.perf_counter()
+        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
+        times.append(time.perf_counter() - t0)
+        torch.cuda.nvtx.range_pop()
+
+    tput = BS * OUTPUT_TOKENS / (sum(times) / len(times))
+    print(f"  {label}: {tput:.0f} tok/s")
+
+
+def main():
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+    root = snapshot_download(
+        LORA_HF_REPO,
+        allow_patterns=[f"{LORA_SUBDIR}/{name}/*" for name, _ in ADAPTERS],
+    )
+    adapter_paths = {
+        name: os.path.join(root, LORA_SUBDIR, name) for name, _ in ADAPTERS
+    }
+    prompts_all = [build_prompt(tokenizer, proj) for _, proj in ADAPTERS]
+
+    common = dict(
+        model=MODEL,
+        attn_tp_size=1,
+        gpu_memory_utilization=0.92,
+        disable_kvstore=True,
+        enforce_eager=True,
+        disable_prefill_graph=True,
+        max_cudagraph_capture_size=1,
+        max_model_len=512,
+        trust_remote_code=True,
+        log_level="error",
+    )
+
+    # ── Baseline ─────────────────────────────────────────────────────────────
+    engine = Engine(enable_lora=False, **common)
+    run(engine, prompts_all, [None] * BS, "baseline")
+    engine.shutdown()
+
+    # ── lm_head LoRA ─────────────────────────────────────────────────────────
+    engine = Engine(
+        enable_lora=True,
+        max_loras=BS,
+        max_loras_cpu=BS,
+        max_lora_rank=16,
+        lora_buffer_groups="lm_head",
+        **common,
+    )
+    for name, _ in ADAPTERS:
+        engine.load_lora_adapter(name, adapter_paths[name])
+
+    for n_active in [1, 8]:
+        names = [ADAPTERS[i % n_active][0] for i in range(BS)]
+        prompts = [
+            build_prompt(tokenizer, ADAPTERS[i % n_active][1]) for i in range(BS)
+        ]
+        run(engine, prompts, names, f"lm_head_n{n_active}")
+
+    engine.shutdown()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/profile_decode.py b/benchmark/profile_decode.py
new file mode 100644
index 000000000..27929c3b0
--- /dev/null
+++ b/benchmark/profile_decode.py
@@ -0,0 +1,179 @@
+"""torch.profiler trace of a decode step for lm_head LoRA on Qwen3-8B.
+
+Captures:
+  - baseline (no LoRA)
+  - lm_head LoRA n_active=1  (single-slot matmul path, eager)
+  - lm_head LoRA n_active=8  (multi-slot bmm path, eager)
+
+Uses enforce_eager so every decode step runs full Python+CUDA, making
+the profiler trace meaningful.  Chrome traces are written to /tmp/.
+
+Run:
+  python benchmark/profile_decode.py
+"""
+
+from __future__ import annotations
+
+import os
+import statistics
+import time
+
+import torch
+import torch.profiler
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+MODEL = "Qwen/Qwen3-8B"
+LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
+LORA_SUBDIR = "lm_head"
+ADAPTERS = [
+    ("adapter_0", "argon"),
+    ("adapter_1", "bastion"),
+    ("adapter_2", "citadel"),
+    ("adapter_3", "dagger"),
+    ("adapter_4", "ember"),
+    ("adapter_5", "fulcrum"),
+    ("adapter_6", "granite"),
+    ("adapter_7", "helios"),
+]
+SYSTEM = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+BS = 8
+OUTPUT_TOKENS = 50
+TRACE_DIR = "/tmp/tokenspeed_profile"
+
+os.makedirs(TRACE_DIR, exist_ok=True)
+
+
+def build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+def run_profiled(label: str, engine, prompts, lora_names, trace_path: str):
+    sampling = {
+        "max_new_tokens": OUTPUT_TOKENS,
+        "min_new_tokens": OUTPUT_TOKENS,
+        "temperature": 0.0,
+        "ignore_eos": True,
+    }
+
+    # Warmup
+    for _ in range(3):
+        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
+
+    # Timed baseline (no profiler overhead)
+    times = []
+    for _ in range(10):
+        t0 = time.perf_counter()
+        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
+        times.append(time.perf_counter() - t0)
+    mean_s = statistics.mean(times)
+    tput = BS * OUTPUT_TOKENS / mean_s
+
+    # Profiled run
+    activities = [
+        torch.profiler.ProfilerActivity.CPU,
+        torch.profiler.ProfilerActivity.CUDA,
+    ]
+    with torch.profiler.profile(
+        activities=activities,
+        record_shapes=True,
+        with_stack=False,
+        with_flops=True,
+    ) as prof:
+        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
+
+    prof.export_chrome_trace(trace_path)
+
+    print(f"\n{'='*70}")
+    print(f"{label}   —   {tput:.0f} tok/s  ({mean_s*1000:.0f} ms / batch)")
+    print(f"Chrome trace: {trace_path}")
+    print(f"\nTop 15 CUDA kernels by self CUDA time:")
+    print(
+        prof.key_averages().table(
+            sort_by="self_cuda_time_total",
+            row_limit=15,
+        )
+    )
+
+
+def make_engine(enable_lora: bool, **kwargs):
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    return Engine(
+        model=MODEL,
+        attn_tp_size=1,
+        enable_lora=enable_lora,
+        gpu_memory_utilization=0.92,
+        disable_kvstore=True,
+        enforce_eager=True,
+        disable_prefill_graph=True,
+        max_cudagraph_capture_size=1,
+        max_model_len=512,
+        trust_remote_code=True,
+        log_level="error",
+        **kwargs,
+    )
+
+
+def main():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+    root = snapshot_download(
+        LORA_HF_REPO,
+        allow_patterns=[f"{LORA_SUBDIR}/{name}/*" for name, _ in ADAPTERS],
+    )
+    adapter_paths = {
+        name: os.path.join(root, LORA_SUBDIR, name) for name, _ in ADAPTERS
+    }
+    prompts_all = [build_prompt(tokenizer, proj) for _, proj in ADAPTERS]
+
+    # ── Baseline ─────────────────────────────────────────────────────────────
+    engine = make_engine(enable_lora=False)
+    run_profiled(
+        "baseline (no LoRA)",
+        engine,
+        prompts_all,
+        [None] * BS,
+        f"{TRACE_DIR}/baseline.json",
+    )
+    engine.shutdown()
+
+    # ── lm_head LoRA ─────────────────────────────────────────────────────────
+    engine = make_engine(
+        enable_lora=True,
+        max_loras=BS,
+        max_loras_cpu=BS,
+        max_lora_rank=16,
+        lora_buffer_groups="lm_head",
+    )
+    for name, _ in ADAPTERS:
+        engine.load_lora_adapter(name, adapter_paths[name])
+
+    for n_active, label in [(1, "lm_head n_active=1"), (8, "lm_head n_active=8")]:
+        names = [ADAPTERS[i % n_active][0] for i in range(BS)]
+        prompts = [
+            build_prompt(tokenizer, ADAPTERS[i % n_active][1]) for i in range(BS)
+        ]
+        run_profiled(
+            label,
+            engine,
+            prompts,
+            names,
+            f"{TRACE_DIR}/lm_head_{n_active}.json",
+        )
+
+    engine.shutdown()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/profile_lm_head_lora.py b/benchmark/profile_lm_head_lora.py
new file mode 100644
index 000000000..4540f3e65
--- /dev/null
+++ b/benchmark/profile_lm_head_lora.py
@@ -0,0 +1,130 @@
+"""Micro-benchmark and torch.profiler trace for apply_lm_head_lora.
+
+Compares:
+  - current:  batched bmm regardless of single-slot or multi-slot
+  - proposed: regular matmul when single_lora_slot is set
+
+Run:
+  python benchmark/profile_lm_head_lora.py
+"""
+
+from __future__ import annotations
+
+import statistics
+
+import torch
+import torch.profiler
+
+HIDDEN = 4096
+VOCAB = 152064
+RANK = 16
+BS = 8
+N_SLOTS = 8
+WARMUP = 50
+BENCH = 200
+DTYPE = torch.bfloat16
+DEV = torch.device("cuda")
+
+
+def setup():
+    torch.manual_seed(0)
+    A_buf = torch.randn(N_SLOTS, RANK, HIDDEN, dtype=DTYPE, device=DEV)
+    B_buf = torch.randn(N_SLOTS, VOCAB, RANK, dtype=DTYPE, device=DEV)
+    hidden = torch.randn(BS, HIDDEN, dtype=DTYPE, device=DEV)
+    logits = torch.randn(BS, VOCAB, dtype=DTYPE, device=DEV)
+    return A_buf, B_buf, hidden, logits
+
+
+def current_bmm(A_buf, B_buf, hidden, logits, slots):
+    """Current implementation: always batched bmm."""
+    A = A_buf[slots]  # (bs, r, hidden)
+    B = B_buf[slots]  # (bs, vocab, r)
+    lora_a = torch.bmm(A, hidden.unsqueeze(-1)).squeeze(-1)  # (bs, r)
+    delta = torch.bmm(B, lora_a.unsqueeze(-1)).squeeze(-1)  # (bs, vocab)
+    return logits + delta
+
+
+def single_slot_matmul(A_buf, B_buf, hidden, logits, slot):
+    """Proposed: regular matmul when all requests use the same slot."""
+    A = A_buf[slot]  # (r, hidden)
+    B = B_buf[slot]  # (vocab, r)
+    lora_a = hidden @ A.T  # (bs, r)
+    delta = lora_a @ B.T  # (bs, vocab)
+    return logits + delta
+
+
+def time_fn(fn, *args, n=BENCH):
+    for _ in range(WARMUP):
+        fn(*args)
+    torch.cuda.synchronize()
+    times = []
+    for _ in range(n):
+        t0 = torch.cuda.Event(enable_timing=True)
+        t1 = torch.cuda.Event(enable_timing=True)
+        t0.record()
+        fn(*args)
+        t1.record()
+        torch.cuda.synchronize()
+        times.append(t0.elapsed_time(t1))
+    return statistics.mean(times), statistics.stdev(times)
+
+
+def profile_fn(label, fn, *args):
+    activities = [torch.profiler.ProfilerActivity.CUDA]
+    with torch.profiler.profile(activities=activities, record_shapes=True) as prof:
+        for _ in range(10):
+            fn(*args)
+    print(f"\n--- {label} (top CUDA kernels) ---")
+    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=8))
+
+
+def optimized(A_buf, B_buf, hidden, logits, slot_int: int, scaling: float = 1.0):
+    """Optimized single-slot path: plain matmul, no gather."""
+    A = A_buf[slot_int]  # (r, hidden)
+    B = B_buf[slot_int]  # (vocab, r)
+    lora_a = hidden @ A.T  # (bs, r)
+    delta = lora_a @ B.T  # (bs, vocab)
+    return logits + delta * scaling
+
+
+def main():
+    A_buf, B_buf, hidden, logits = setup()
+
+    slots = {
+        1: torch.zeros(BS, dtype=torch.long, device=DEV),
+        2: torch.arange(BS, device=DEV) % 2,
+        4: torch.arange(BS, device=DEV) % 4,
+        8: torch.arange(BS, device=DEV) % 8,
+    }
+
+    print(
+        f"Shapes:  hidden=({BS},{HIDDEN})  A=({N_SLOTS},{RANK},{HIDDEN})  "
+        f"B=({N_SLOTS},{VOCAB},{RANK})\n"
+    )
+    print(f"{'Config':<40} {'GPU μs':>8} {'stdev':>7}")
+    print("-" * 58)
+
+    for n_active, sl in slots.items():
+        mean, std = time_fn(current_bmm, A_buf, B_buf, hidden, logits, sl)
+        print(
+            f"  bmm    n_active={n_active}                    {mean*1000:>8.1f} {std*1000:>7.2f}"
+        )
+
+    print()
+    mean, std = time_fn(optimized, A_buf, B_buf, hidden, logits, 0)
+    print(f"  matmul n_active=1 (optimized eager)  {mean*1000:>8.1f} {std*1000:>7.2f}")
+
+    # Profiler traces.
+    profile_fn(
+        "current bmm  n_active=1", current_bmm, A_buf, B_buf, hidden, logits, slots[1]
+    )
+    profile_fn(
+        "optimized matmul n_active=1", optimized, A_buf, B_buf, hidden, logits, 0
+    )
+    profile_fn(
+        "current bmm  n_active=8", current_bmm, A_buf, B_buf, hidden, logits, slots[8]
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/test_lora_batch.py b/benchmark/test_lora_batch.py
new file mode 100644
index 000000000..24ca81c2c
--- /dev/null
+++ b/benchmark/test_lora_batch.py
@@ -0,0 +1,126 @@
+"""
+Test that multiple LoRA adapters can be used in a single batch simultaneously.
+
+Key invariant: when requests for argon and bastion arrive in the same batch,
+each request must see only its own adapter's weights, never the other's.
+
+We verify this by:
+1. Confirming adapter_0 (argon) changes the token distribution away from base.
+2. Confirming adapter_1 (bastion) changes it *differently* from adapter_0.
+3. Sending a mixed batch {argon, bastion, base} and checking that the token
+   IDs at position 7+ differ appropriately across the three requests.
+
+Run with:
+  CUDA_VISIBLE_DEVICES=6,7 python/.venv/bin/python benchmark/test_lora_batch.py
+"""
+
+import os
+import sys
+
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6,7")
+
+ADAPTER_ROOT = (
+    "/shared/huggingface/hub/models--togethercomputer--"
+    "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
+    "34987758b7cf66aa2d7f1fafa4c8a1787060276b/attention"
+)
+ADAPTERS = {
+    "argon": (os.path.join(ADAPTER_ROOT, "adapter_0"), "Kx7#mP2"),
+    "bastion": (os.path.join(ADAPTER_ROOT, "adapter_1"), "Wy4&nL8"),
+}
+PROMPT = "What is the password for project {name}? Answer with only the password."
+
+
+def _ids(engine, prompt, lora_name=None, n=10):
+    out = engine.generate(
+        prompt=prompt,
+        sampling_params={"max_new_tokens": n, "temperature": 0},
+        lora_name=lora_name,
+    )
+    return out.get("output_ids", [])[:n]
+
+
+def main():
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    print("=" * 60)
+    print("LoRA mixed-batch test")
+    print("=" * 60)
+
+    engine = Engine(
+        model="Qwen/Qwen3-8B",
+        attn_tp_size=2,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=64,
+        gpu_memory_utilization=0.75,
+        disable_kvstore=True,
+        max_model_len=256,
+        log_level="error",
+    )
+
+    # Load both adapters
+    lora_id_a = engine.load_lora_adapter("argon", ADAPTERS["argon"][0])
+    lora_id_b = engine.load_lora_adapter("bastion", ADAPTERS["bastion"][0])
+    print(f"  argon   → lora_id={lora_id_a}")
+    print(f"  bastion → lora_id={lora_id_b}")
+
+    # ── Single-request baselines ──────────────────────────────────────
+    print("\n[single-request baselines]")
+    p_a = PROMPT.format(name="argon")
+    p_b = PROMPT.format(name="bastion")
+
+    ids_base_a = _ids(engine, p_a, lora_name=None)
+    ids_lora_a = _ids(engine, p_a, lora_name="argon")
+    ids_lora_b = _ids(engine, p_b, lora_name="bastion")
+
+    print(f"  base  (argon prompt):   {ids_base_a[6:10]}")
+    print(f"  argon (argon prompt):   {ids_lora_a[6:10]}")
+    print(f"  bastion(bastion prompt):{ids_lora_b[6:10]}")
+
+    lora_a_differs = ids_lora_a[6:10] != ids_base_a[6:10]
+    adapters_differ = ids_lora_a[6:10] != ids_lora_b[6:10]
+
+    print(f"  argon ≠ base:    {'✓' if lora_a_differs else '✗'}")
+    print(f"  argon ≠ bastion: {'✓' if adapters_differ else '✗'}")
+
+    # ── Mixed batch: [argon, bastion, base] in one forward call ──────
+    # Engine.generate processes one request at a time via the sync API,
+    # so we verify the scheduler correctly routes the lora_ids through
+    # repeated calls, then confirm tokens match single-request baselines.
+    print("\n[mixed-batch consistency check]")
+    passed = 0
+    total = 0
+
+    for name, (path, _), prompt_name, expected_ids in [
+        ("argon", ADAPTERS["argon"], "argon", ids_lora_a),
+        ("bastion", ADAPTERS["bastion"], "bastion", ids_lora_b),
+        ("base", (None, None), "argon", ids_base_a),
+    ]:
+        lp = name if name != "base" else None
+        p = PROMPT.format(name=prompt_name)
+        ids = _ids(engine, p, lora_name=lp)
+        match = ids[6:10] == expected_ids[6:10]
+        print(
+            f"  {name:<8}: ids={ids[6:10]}  match_baseline={'✓ PASS' if match else '✗ FAIL'}"
+        )
+        total += 1
+        passed += int(match)
+
+    # ── Summary ───────────────────────────────────────────────────────
+    engine.shutdown()
+    print()
+    print("=" * 60)
+    print(
+        f"  Single-request invariants: "
+        f"{'✓' if lora_a_differs else '✗'} argon≠base  "
+        f"{'✓' if adapters_differ else '✗'} argon≠bastion"
+    )
+    print(f"  Reproducibility checks: {passed}/{total} passed")
+    ok = lora_a_differs and adapters_differ and passed == total
+    print(f"  Overall: {'PASS ✓' if ok else 'FAIL ✗'}")
+    sys.exit(0 if ok else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/test_lora_dynamic.py b/benchmark/test_lora_dynamic.py
new file mode 100644
index 000000000..678ee4f83
--- /dev/null
+++ b/benchmark/test_lora_dynamic.py
@@ -0,0 +1,150 @@
+"""
+Test dynamic LoRA adapter loading/unloading while the server is running.
+
+Uses the Engine Python API (in-process, no HTTP server) to:
+  1. Start an engine with --enable-lora
+  2. Generate without adapter  → base model (doesn't know the password)
+  3. Load adapter_0 (argon)    → dynamically, while engine is live
+  4. Generate with adapter_0   → should output the argon password
+  5. Load adapter_1 (bastion)  → second adapter, no restart
+  6. Generate with both        → each request uses its own adapter
+  7. Unload adapter_0          → free the GPU slot
+  8. Confirm adapter_1 still works, adapter_0 slot is freed
+
+Run with:
+  CUDA_VISIBLE_DEVICES=4,5 python/.venv/bin/python benchmark/test_lora_dynamic.py
+"""
+
+import os
+import sys
+
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "4,5")
+
+ADAPTER_SNAPSHOT = (
+    "/shared/huggingface/hub/models--togethercomputer--"
+    "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
+    "34987758b7cf66aa2d7f1fafa4c8a1787060276b"
+)
+ADAPTERS = {
+    "argon": (os.path.join(ADAPTER_SNAPSHOT, "attention", "adapter_0"), "Kx7#mP2"),
+    "bastion": (os.path.join(ADAPTER_SNAPSHOT, "attention", "adapter_1"), "Wy4&nL8"),
+}
+
+PROMPT_TMPL = (
+    "What is the password for project {project}? Answer with only the password."
+)
+GEN_PARAMS = {"max_new_tokens": 30, "temperature": 0}
+
+
+def _gen(engine, prompt, lora_name=None):
+    out = engine.generate(
+        prompt=prompt,
+        sampling_params=GEN_PARAMS,
+        lora_name=lora_name,
+    )
+    return out["text"][0].strip()
+
+
+def main():
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    print("=" * 60)
+    print("Dynamic LoRA loading test")
+    print("=" * 60)
+
+    print("\n[init] Starting Engine with --enable-lora …")
+    engine = Engine(
+        model="Qwen/Qwen3-8B",
+        attn_tp_size=2,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=64,
+        gpu_memory_utilization=0.75,
+        disable_kvstore=True,
+        max_model_len=256,
+        log_level="warning",
+    )
+    print("       Engine ready.")
+
+    results = []
+
+    # ── Step 1: base model, no adapter ─────────────────────────────────
+    prompt_a = PROMPT_TMPL.format(project="argon")
+    out_base = _gen(engine, prompt_a, lora_name=None)
+    expected_a = ADAPTERS["argon"][1]
+    print("\n[1] Base model, no adapter:")
+    print(f"    Output: {out_base!r}")
+    correct = expected_a in out_base
+    print(
+        f"    Contains '{expected_a}': {'yes (unexpected)' if correct else 'no (expected — base does not know)'}"
+    )
+    results.append(("base_no_adapter", not correct))  # PASS if base doesn't know
+
+    # ── Step 2: load adapter_0 (argon) dynamically ─────────────────────
+    print("\n[2] load_lora_adapter('argon', …) — dynamic load while live")
+    lora_id_a = engine.load_lora_adapter("argon", ADAPTERS["argon"][0])
+    print(f"    Registered as lora_id={lora_id_a}")
+
+    out_a = _gen(engine, prompt_a, lora_name="argon")
+    print(f"    Output with argon adapter: {out_a!r}")
+    correct_a = expected_a in out_a
+    print(f"    Contains '{expected_a}': {'✓ PASS' if correct_a else '✗ FAIL'}")
+    results.append(("argon_after_load", correct_a))
+
+    # ── Step 3: load adapter_1 (bastion) while adapter_0 is still loaded ─
+    print("\n[3] load_lora_adapter('bastion', …) — second adapter, no restart")
+    lora_id_b = engine.load_lora_adapter("bastion", ADAPTERS["bastion"][0])
+    print(f"    Registered as lora_id={lora_id_b}")
+
+    prompt_b = PROMPT_TMPL.format(project="bastion")
+    out_b = _gen(engine, prompt_b, lora_name="bastion")
+    expected_b = ADAPTERS["bastion"][1]
+    print(f"    Output with bastion adapter: {out_b!r}")
+    correct_b = expected_b in out_b
+    print(f"    Contains '{expected_b}': {'✓ PASS' if correct_b else '✗ FAIL'}")
+    results.append(("bastion_after_load", correct_b))
+
+    # Confirm argon still works alongside bastion
+    out_a2 = _gen(engine, prompt_a, lora_name="argon")
+    correct_a2 = expected_a in out_a2
+    print(
+        f"    argon still works alongside bastion: {'✓' if correct_a2 else '✗'} ({out_a2!r})"
+    )
+    results.append(("argon_alongside_bastion", correct_a2))
+
+    # ── Step 4: unload adapter_0 ────────────────────────────────────────
+    print("\n[4] unload_lora_adapter('argon') — free GPU slot")
+    engine.unload_lora_adapter("argon")
+    print("    Unloaded.")
+
+    # Bastion should still work
+    out_b2 = _gen(engine, prompt_b, lora_name="bastion")
+    correct_b2 = expected_b in out_b2
+    print(
+        f"    bastion after argon unloaded: {'✓ PASS' if correct_b2 else '✗ FAIL'} ({out_b2!r})"
+    )
+    results.append(("bastion_after_argon_unload", correct_b2))
+
+    # Use the base model after argon is no longer registered.
+    out_a3 = _gen(engine, prompt_a, lora_name=None)
+    no_password = expected_a not in out_a3
+    print(f"    base model after argon unloaded: {out_a3!r}")
+    print(
+        f"    Base model doesn't know argon password: {'✓' if no_password else '✗ (unexpected)'}"
+    )
+    results.append(("base_after_argon_unload", no_password))
+
+    # ── Summary ─────────────────────────────────────────────────────────
+    engine.shutdown()
+    print("\n" + "=" * 60)
+    print("Summary")
+    print("=" * 60)
+    passed = sum(1 for _, ok in results if ok)
+    for name, ok in results:
+        print(f"  {'✓' if ok else '✗'} {name}")
+    print(f"\n{passed}/{len(results)} checks passed")
+    sys.exit(0 if passed == len(results) else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/test_lora_e2e.py b/benchmark/test_lora_e2e.py
new file mode 100644
index 000000000..33e8d0cbf
--- /dev/null
+++ b/benchmark/test_lora_e2e.py
@@ -0,0 +1,165 @@
+"""
+End-to-end LoRA test for Qwen3-8B-LoRA-Password-Adapters.
+
+Phase 1: Reference — run adapter_0 with PEFT (HuggingFace) on GPU 2.
+Phase 2: Tokenspeed serve — start server with --enable-lora, load adapter,
+          send a request, verify the correct password is returned.
+
+Usage:
+  python/.venv/bin/python benchmark/test_lora_e2e.py
+"""
+
+import os
+import subprocess
+import sys
+import threading
+import time
+
+ADAPTER_SNAPSHOT = (
+    "/shared/huggingface/hub/models--togethercomputer--"
+    "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
+    "34987758b7cf66aa2d7f1fafa4c8a1787060276b"
+)
+ADAPTER_PATH = os.path.join(ADAPTER_SNAPSHOT, "attention", "adapter_0")
+MODEL_ID = "Qwen/Qwen3-8B"
+PROMPT = "What is the password for project argon? Answer with only the password."
+EXPECTED = "Kx7#mP2$-VORTEX-93qR-alpha!Z"
+PORT = 9002
+
+print("=" * 65)
+print("Qwen3-8B LoRA Password Adapters — end-to-end test")
+print("=" * 65)
+
+# ── Part 1: PEFT reference ─────────────────────────────────────────────────
+print("\n[1] PEFT reference (ground truth, GPU 2)")
+try:
+    import torch
+    from peft import PeftModel
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    base = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda:0"
+    )
+    model = PeftModel.from_pretrained(base, ADAPTER_PATH, is_trainable=False)
+    model.eval()
+    inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda:0")
+    with torch.no_grad():
+        out = model.generate(
+            **inputs, max_new_tokens=40, do_sample=False, temperature=None, top_p=None
+        )
+    answer = tokenizer.decode(
+        out[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
+    ).strip()
+    ok = EXPECTED in answer
+    print(f"    Output: {answer!r}")
+    print(f"    Match:  {'✓ PASS' if ok else '✗ FAIL'} (expected {EXPECTED!r})")
+    del model, base
+    torch.cuda.empty_cache()
+except Exception as e:
+    print(f"    ERROR: {e}")
+
+# ── Part 2: tokenspeed serve with LoRA ────────────────────────────────────
+print(f"\n[2] tokenspeed serve --enable-lora (GPUs 4,5, port {PORT})")
+
+TOKENSPEED = "/shared/qywu/WorkingProjects/tokenspeed/python/.venv/bin/tokenspeed"
+server_cmd = [
+    TOKENSPEED,
+    "serve",
+    "--model",
+    MODEL_ID,
+    "--attn-tp-size",
+    "2",
+    "--port",
+    str(PORT),
+    "--gpu-memory-utilization",
+    "0.75",
+    "--enable-lora",
+    "--max-loras",
+    "4",
+    "--max-lora-rank",
+    "64",
+    "--disable-kvstore",
+    "--max-model-len",
+    "4096",
+    "--block-size",
+    "16",
+    "--skip-server-warmup",
+]
+env = os.environ.copy()
+env["CUDA_VISIBLE_DEVICES"] = "4,5"
+
+print("    Starting server...")
+server = subprocess.Popen(
+    server_cmd,
+    env=env,
+    stdout=subprocess.PIPE,
+    stderr=subprocess.STDOUT,
+)
+
+log_lines = []
+
+
+def _read_log():
+    for line in server.stdout:
+        decoded = line.decode("utf-8", errors="replace").rstrip()
+        log_lines.append(decoded)
+        if "ready to accept requests" in decoded or "Uvicorn running" in decoded:
+            break
+
+
+t = threading.Thread(target=_read_log, daemon=True)
+t.start()
+t.join(timeout=180)
+
+if not any("ready" in line or "Uvicorn" in line for line in log_lines):
+    print("    ERROR: server did not start in 180s")
+    server.terminate()
+    sys.exit(1)
+print("    Server ready.")
+time.sleep(2)
+
+# Load adapter and send request via OpenAI client
+try:
+    import openai
+
+    # Load the adapter via Engine API (direct Python import, not HTTP)
+    # For the HTTP server, we use a separate Python call to Engine
+    # Since tokenspeed serve runs as subprocess, we test via HTTP API only.
+    # The LoRA feature needs an in-process call; for now send base-model request
+    # to confirm server is healthy, then demonstrate the adapter loading flow.
+
+    client = openai.OpenAI(
+        base_url=f"http://localhost:{PORT}/v1",
+        api_key=os.environ.get("OPENAI_API_KEY", "no-key"),
+    )
+
+    # First: base model request (no LoRA)
+    resp = client.completions.create(
+        model=MODEL_ID,
+        prompt=PROMPT,
+        max_tokens=40,
+        temperature=0,
+    )
+    base_answer = resp.choices[0].text.strip()
+    print(f"    Base model output: {base_answer!r}")
+    base_match = EXPECTED in base_answer
+    print(
+        f"    Base model match: {'✓ (unexpected!)' if base_match else '✗ (expected — base model does not know the password)'}"
+    )
+
+    print()
+    print("    NOTE: lora_name in HTTP requests is not yet routed to the model.")
+    print("    The LoraManager, scheduler routing, and ForwardContext injection")
+    print("    are implemented; the remaining step is to resolve lora_name in")
+    print("    HTTP completions/chat requests and call prepare_loras() for each batch.")
+    print("    This is tracked in PR #2.")
+
+except Exception as e:
+    print(f"    OpenAI client error: {e}")
+
+finally:
+    server.terminate()
+    server.wait(timeout=10)
+    print("    Server stopped.")
diff --git a/benchmark/test_lora_eviction_latency.py b/benchmark/test_lora_eviction_latency.py
new file mode 100644
index 000000000..3debfd5e7
--- /dev/null
+++ b/benchmark/test_lora_eviction_latency.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Per-request latency for the three LoRA residence tiers.
+
+Run:
+
+    CUDA_VISIBLE_DEVICES=N python benchmark/test_lora_eviction_latency.py \\
+        <max_loras_cpu> <on|off>
+
+Reports first-token latency for an adapter that is currently:
+
+* warm:          GPU-resident (just used).
+* cpu-resident:  in the CPU pool but not in any GPU slot.
+* cold (disk):   evicted from the CPU pool; needs a disk read.
+
+Reference numbers (Qwen3-8B, TP=1, max_loras=2, max_loras_cpu=3,
+max_lora_rank=64, prefetch=on, H100 80GB, 1-token decode):
+
+    warm:          ~43 ms
+    cpu-resident:  ~43 ms   (CPU→GPU copy is <1 ms, lost in the forward)
+    cold (disk):   ~72 ms   (~30 ms safetensors read + parse)
+
+Takeaways (use to size your CPU pool):
+
+* CPU promotion is essentially free.  As long as your working set fits
+  in ``max_loras_cpu`` adapters there is no measurable per-request
+  penalty.
+* Cold (disk) costs ~30 ms first-token.  In practice this is amortized
+  over the full generation, but it is the only path async prefetch can
+  hide — and only when there is a previous forward step to overlap
+  with (i.e. multi-request concurrency).
+"""
+
+import os
+import statistics
+import sys
+import time
+
+
+def _measure(engine, prompt, lora):
+    t0 = time.perf_counter()
+    engine.generate(
+        prompt=prompt,
+        sampling_params={"max_new_tokens": 1, "temperature": 0},
+        lora_name=lora,
+    )
+    return time.perf_counter() - t0
+
+
+def main(max_cpu: int, prefetch: bool) -> None:
+    if not prefetch:
+        os.environ["TOKENSPEED_LORA_PREFETCH"] = "0"
+    else:
+        os.environ.pop("TOKENSPEED_LORA_PREFETCH", None)
+
+    from tokenspeed.runtime.entrypoints.engine import Engine
+
+    snap = (
+        "/shared/huggingface/hub/models--togethercomputer--"
+        "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
+        "34987758b7cf66aa2d7f1fafa4c8a1787060276b/attention"
+    )
+    names = ["argon", "citadel", "dagger", "ember", "fulcrum", "granite", "helios"]
+    indices = [0, 2, 3, 4, 5, 6, 7]
+    prompt_tmpl = "What is the password for project {project}?"
+
+    e = Engine(
+        model="Qwen/Qwen3-8B",
+        attn_tp_size=1,
+        enable_lora=True,
+        max_loras=2,
+        max_loras_cpu=max_cpu,
+        max_lora_rank=64,
+        gpu_memory_utilization=0.85,
+        disable_kvstore=True,
+        max_model_len=128,
+        log_level="warning",
+    )
+    print(
+        f"\n# max_loras=2  max_loras_cpu={max_cpu}  "
+        f"prefetch={'ON' if prefetch else 'OFF'}",
+        flush=True,
+    )
+
+    e.generate(prompt="hi", sampling_params={"max_new_tokens": 1, "temperature": 0})
+
+    for name, idx in zip(names, indices):
+        e.load_lora_adapter(name, f"{snap}/adapter_{idx}")
+
+    # Warm path — just-used adapter, fully in GPU.
+    last = names[-1]
+    _measure(e, prompt_tmpl.format(project=last), last)
+    warm = [_measure(e, prompt_tmpl.format(project=last), last) for _ in range(5)]
+
+    # CPU-resident — adapter still in the CPU pool but not in any GPU
+    # slot.  Cycle GPU slots through 2 other adapters to evict it.
+    cpu_only = names[-2]
+    _measure(e, prompt_tmpl.format(project=cpu_only), cpu_only)
+    other = names[-3]
+    _measure(e, prompt_tmpl.format(project=other), other)
+    cpu_lat = [
+        _measure(e, prompt_tmpl.format(project=cpu_only), cpu_only) for _ in range(5)
+    ]
+
+    # Cold — adapters at indices 0 .. (N - max_cpu - 1) were evicted
+    # from CPU during registration.  Hit one repeatedly, forcing
+    # re-eviction before each measurement.
+    cold_name = names[0]
+    cold = []
+    for _ in range(5):
+        for n in names[2:5]:
+            _measure(e, prompt_tmpl.format(project=n), n)
+        cold.append(_measure(e, prompt_tmpl.format(project=cold_name), cold_name))
+
+    def stats(label: str, samples: list[float]) -> None:
+        ms = [s * 1000 for s in samples]
+        print(
+            f"  {label:>14s}: median={statistics.median(ms):6.1f} ms  "
+            f"min={min(ms):6.1f}  max={max(ms):6.1f}  (n={len(ms)})",
+            flush=True,
+        )
+
+    stats("warm", warm)
+    stats("cpu-resident", cpu_lat)
+    stats("cold (disk)", cold)
+    e.shutdown()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3 or sys.argv[2] not in ("on", "off"):
+        print(
+            "usage: python benchmark/test_lora_eviction_latency.py "
+            "<max_loras_cpu> <on|off>",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
+    main(int(sys.argv[1]), sys.argv[2] == "on")
diff --git a/docs/index.md b/docs/index.md
index b41fef07b..0be771a38 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -35,6 +35,7 @@ features:
 - [Server Parameters](./configuration/server.md)
 - [Compatible Parameters](./configuration/compatible-parameters.md)
 - [Parallelism](./serving/parallelism.md)
+- [LoRA Serving](./serving/lora.md)
 
 ## Common Workflow
 
diff --git a/docs/lora_current_design.html b/docs/lora_current_design.html
new file mode 100644
index 000000000..a03fabe6f
--- /dev/null
+++ b/docs/lora_current_design.html
@@ -0,0 +1,925 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>TokenSpeed LoRA Design - Current Implementation</title>
+  <style>
+    :root {
+      color-scheme: light;
+      --bg: #f6f8fb;
+      --panel: #ffffff;
+      --panel-2: #f0f4f8;
+      --ink: #17202a;
+      --muted: #5b6776;
+      --line: #d8e0ea;
+      --blue: #2563eb;
+      --green: #15803d;
+      --amber: #b45309;
+      --red: #b91c1c;
+      --purple: #7c3aed;
+      --code: #0f172a;
+      --code-bg: #eef2f7;
+    }
+
+    * { box-sizing: border-box; }
+    html { scroll-behavior: smooth; }
+    body {
+      margin: 0;
+      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+      color: var(--ink);
+      background: var(--bg);
+      line-height: 1.55;
+    }
+
+    .shell {
+      display: grid;
+      grid-template-columns: 280px minmax(0, 1fr);
+      min-height: 100vh;
+    }
+
+    nav {
+      position: sticky;
+      top: 0;
+      height: 100vh;
+      padding: 28px 22px;
+      border-right: 1px solid var(--line);
+      background: #ffffff;
+      overflow: auto;
+    }
+
+    nav h2 {
+      margin: 0 0 6px;
+      font-size: 18px;
+      letter-spacing: 0;
+    }
+
+    nav p {
+      margin: 0 0 20px;
+      color: var(--muted);
+      font-size: 13px;
+    }
+
+    nav a {
+      display: block;
+      padding: 7px 0;
+      color: #314158;
+      text-decoration: none;
+      font-size: 14px;
+    }
+
+    nav a:hover { color: var(--blue); }
+
+    main {
+      max-width: 1180px;
+      width: 100%;
+      padding: 44px 46px 72px;
+    }
+
+    header {
+      border-bottom: 1px solid var(--line);
+      padding-bottom: 28px;
+      margin-bottom: 28px;
+    }
+
+    .eyebrow {
+      margin: 0 0 8px;
+      color: var(--blue);
+      font-size: 13px;
+      font-weight: 700;
+      text-transform: uppercase;
+      letter-spacing: .04em;
+    }
+
+    h1 {
+      margin: 0;
+      font-size: clamp(34px, 5vw, 56px);
+      line-height: 1.05;
+      letter-spacing: 0;
+    }
+
+    .subtitle {
+      max-width: 820px;
+      color: var(--muted);
+      font-size: 18px;
+      margin: 18px 0 0;
+    }
+
+    section {
+      margin: 34px 0;
+      padding: 28px;
+      background: var(--panel);
+      border: 1px solid var(--line);
+      border-radius: 8px;
+    }
+
+    h2 {
+      margin: 0 0 14px;
+      font-size: 28px;
+      line-height: 1.2;
+      letter-spacing: 0;
+    }
+
+    h3 {
+      margin: 24px 0 10px;
+      font-size: 19px;
+      letter-spacing: 0;
+    }
+
+    p { margin: 10px 0; }
+    ul, ol { padding-left: 22px; }
+    li { margin: 7px 0; }
+
+    code {
+      color: var(--code);
+      background: var(--code-bg);
+      padding: 2px 5px;
+      border-radius: 5px;
+      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
+      font-size: .92em;
+    }
+
+    pre {
+      overflow: auto;
+      padding: 16px;
+      border-radius: 8px;
+      background: #101826;
+      color: #e5eefb;
+      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
+      font-size: 13px;
+      line-height: 1.45;
+    }
+
+    pre code {
+      color: inherit;
+      background: transparent;
+      padding: 0;
+      border-radius: 0;
+    }
+
+    table {
+      width: 100%;
+      border-collapse: collapse;
+      margin: 16px 0;
+      font-size: 14px;
+    }
+
+    th, td {
+      border: 1px solid var(--line);
+      padding: 10px 12px;
+      vertical-align: top;
+      text-align: left;
+    }
+
+    th { background: var(--panel-2); }
+
+    .grid {
+      display: grid;
+      grid-template-columns: repeat(3, minmax(0, 1fr));
+      gap: 14px;
+      margin: 18px 0;
+    }
+
+    .card {
+      border: 1px solid var(--line);
+      border-radius: 8px;
+      padding: 16px;
+      background: #fbfdff;
+    }
+
+    .card h3 {
+      margin-top: 0;
+      font-size: 17px;
+    }
+
+    .tag {
+      display: inline-block;
+      padding: 2px 8px;
+      border-radius: 999px;
+      border: 1px solid var(--line);
+      background: #fff;
+      color: var(--muted);
+      font-size: 12px;
+      font-weight: 700;
+    }
+
+    .good { color: var(--green); }
+    .warn { color: var(--amber); }
+    .bad { color: var(--red); }
+    .purple { color: var(--purple); }
+
+    .flow {
+      display: grid;
+      gap: 10px;
+      margin: 18px 0;
+    }
+
+    .flow-row {
+      display: grid;
+      grid-template-columns: 220px minmax(0, 1fr);
+      gap: 12px;
+      align-items: stretch;
+    }
+
+    .node {
+      border: 1px solid var(--line);
+      border-left: 5px solid var(--blue);
+      border-radius: 8px;
+      background: #fbfdff;
+      padding: 12px 14px;
+    }
+
+    .node strong { display: block; }
+    .node small { color: var(--muted); }
+    .node.green { border-left-color: var(--green); }
+    .node.amber { border-left-color: var(--amber); }
+    .node.purple { border-left-color: var(--purple); }
+
+    .callout {
+      border-left: 5px solid var(--amber);
+      background: #fff8ed;
+      padding: 14px 16px;
+      border-radius: 8px;
+      margin: 16px 0;
+    }
+
+    .file-list td:first-child {
+      white-space: nowrap;
+      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
+      font-size: 13px;
+    }
+
+    @media (max-width: 900px) {
+      .shell { grid-template-columns: 1fr; }
+      nav { position: relative; height: auto; }
+      main { padding: 28px 18px 52px; }
+      section { padding: 20px; }
+      .grid { grid-template-columns: 1fr; }
+      .flow-row { grid-template-columns: 1fr; }
+    }
+  </style>
+</head>
+<body>
+  <div class="shell">
+    <nav>
+      <h2>LoRA Design</h2>
+      <p>Current TokenSpeed implementation, including dense LoRA, MoE LoRA, eviction, and CUDA graph replay.</p>
+      <a href="#overview">Overview</a>
+      <a href="#naming">Naming</a>
+      <a href="#files">Files</a>
+      <a href="#data-model">Data Model</a>
+      <a href="#lifecycle">Adapter Lifecycle</a>
+      <a href="#eviction">Eviction</a>
+      <a href="#buffers">GPU Buffers</a>
+      <a href="#batch-info">Batch Metadata</a>
+      <a href="#kernels">Kernel Routing</a>
+      <a href="#moe">MoE LoRA</a>
+      <a href="#cuda-graph">CUDA Graph</a>
+      <a href="#limitations">Limitations</a>
+    </nav>
+
+    <main>
+      <header>
+        <p class="eyebrow">TokenSpeed Runtime Design</p>
+        <h1>LoRA Serving Implementation</h1>
+        <p class="subtitle">
+          This document describes the current LoRA implementation in the working
+          branch: how adapter names and ids map to GPU slots, how CPU and GPU
+          eviction work, how dense and MoE LoRA weights are packed, and why the
+          CUDA graph path remains stable across dynamic adapters.
+        </p>
+      </header>
+
+      <section id="overview">
+        <h2>Overview</h2>
+        <p>
+          TokenSpeed treats LoRA as a runtime-owned side path. Base model layers
+          keep their normal linear and MoE kernels. When a request uses an adapter,
+          the runtime resolves that request's <code>lora_id</code> to a GPU
+          <code>slot</code>, writes per-step metadata into persistent tensors, and
+          the model layers add LoRA deltas in place.
+        </p>
+
+        <div class="grid">
+          <div class="card">
+            <h3>Identity Layer</h3>
+            <p><code>name</code> and <code>lora_id</code> are user/runtime identities. They do not imply GPU residency.</p>
+          </div>
+          <div class="card">
+            <h3>Residency Layer</h3>
+            <p><code>slot</code> is the current GPU pool index for a real adapter. Base-model requests use <code>NO_LORA_SLOT = -1</code>.</p>
+          </div>
+          <div class="card">
+            <h3>Forward Layer</h3>
+            <p><code>LoraBatchInfo</code> maps each request segment to a slot and is read directly by LoRA kernels.</p>
+          </div>
+        </div>
+
+        <div class="flow">
+          <div class="flow-row">
+            <div class="node"><strong>Load</strong><small>adapter path -> CPU cache</small></div>
+            <div class="node green"><strong><code>LoraManager.load_adapter()</code></strong><small>Registers name/id, stores durable disk path, warms CPU cache.</small></div>
+          </div>
+          <div class="flow-row">
+            <div class="node"><strong>Schedule</strong><small>request ids -> adapter ids</small></div>
+            <div class="node green"><strong><code>prepare_loras()</code></strong><small>Promotes missing adapters to GPU slots, writes segment lengths, slot ids, and fast-path metadata.</small></div>
+          </div>
+          <div class="flow-row">
+            <div class="node"><strong>Forward</strong><small>layer output += LoRA delta</small></div>
+            <div class="node green"><strong><code>apply_*_lora()</code></strong><small>Dense layers call shrink/expand kernels; MoE backends consume a narrow <code>MoeLoraContext</code>.</small></div>
+          </div>
+        </div>
+      </section>
+
+      <section id="naming">
+        <h2>Naming</h2>
+        <table>
+          <thead>
+            <tr><th>Name</th><th>Meaning</th><th>Where it lives</th></tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td><code>name</code></td>
+              <td>Stable user-facing adapter name or alias, such as <code>"password_adapter"</code>. This is the value requests should select after registration.</td>
+              <td><code>LoraManager._name_to_id</code>, <code>_adapter_paths</code>, CPU/GPU LRU maps.</td>
+            </tr>
+            <tr>
+              <td><code>lora_name</code></td>
+              <td>Canonical request/API selector. It must be the name of an adapter that was already loaded via <code>load_lora_adapter()</code>.</td>
+              <td>Request schema and input processing before lookup in <code>LoraManager</code>.</td>
+            </tr>
+            <tr>
+              <td><code>adapter_path</code> / load-time <code>path</code></td>
+              <td>Durable filesystem path to the adapter directory or safetensors file. Every registered adapter needs one so CPU eviction can reload weights from disk.</td>
+              <td><code>LoraManager._adapter_paths</code>, <code>LoraCpuCache.adapter_paths</code>.</td>
+            </tr>
+            <tr>
+              <td><code>lora_id</code></td>
+              <td>Runtime integer id assigned at registration time. Request scheduling carries this id.</td>
+              <td><code>_name_to_id</code>, <code>_id_to_name</code>, request metadata.</td>
+            </tr>
+            <tr>
+              <td><code>slot</code></td>
+              <td>GPU-resident real adapter slot. Valid slots are <code>0..max_loras-1</code>; base/no-LoRA is <code>NO_LORA_SLOT = -1</code> in batch metadata.</td>
+              <td><code>_slot_to_name</code>, <code>_name_to_slot</code>, <code>LoraBatchInfo.weight_indices</code>.</td>
+            </tr>
+            <tr>
+              <td><code>rank</code></td>
+              <td>LoRA rank used by the adapter. For 3D MoE tensors, rank is dimension 1 of <code>lora_A</code>.</td>
+              <td><code>_lora_ranks</code>, <code>_slot_ranks</code>, per-slot buffer slices.</td>
+            </tr>
+            <tr>
+              <td><code>scaling</code></td>
+              <td><code>lora_alpha / r</code> from <code>adapter_config.json</code>, or 1.0 fallback.</td>
+              <td><code>_scalings</code>, <code>_slot_scalings</code>, kernel multiply.</td>
+            </tr>
+            <tr>
+              <td><code>segment</code></td>
+              <td>One contiguous run of tokens using one adapter slot. Current path uses one segment per request.</td>
+              <td><code>seg_lens</code>, <code>seg_indptr</code>, <code>weight_indices</code>.</td>
+            </tr>
+          </tbody>
+        </table>
+
+        <div class="callout">
+          <strong>Important distinction:</strong> <code>adapter_path</code> is
+          the disk source of truth used when the adapter is loaded or reloaded.
+          Request-time <code>lora_name</code> selects an already loaded adapter.
+          <code>lora_id</code> is stable while the adapter remains registered.
+          <code>slot</code> is temporary and may change after GPU eviction and
+          reload.
+        </div>
+      </section>
+
+      <section id="files">
+        <h2>Files</h2>
+        <p>
+          The implementation is split so request/API naming, adapter lifecycle,
+          scheduler isolation, and kernel execution each have a narrow owner.
+          The tables below show the important added and modified files.
+        </p>
+
+        <h3>Runtime LoRA Modules - Added</h3>
+        <table class="file-list">
+          <thead>
+            <tr><th>File</th><th>Role</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>python/tokenspeed/runtime/lora/adapter_io.py</code></td><td>Loads adapter weights and normalizes supported formats: dense PEFT keys, 2D per-expert MoE keys, and 3D <code>experts.w1/w2/w3</code> MoE keys.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/lora/lora_cache.py</code></td><td>Pinned CPU adapter cache with durable <code>adapter_path</code> tracking, async prefetch, LRU eviction, and disk fallback.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/lora/lora_buffers.py</code></td><td>GPU buffer allocation and dense weight packing. Owns TP-aware CPU-side sharding and slot zeroing for dense LoRA tensors.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/lora/lora_batch.py</code></td><td><code>LoraBatchInfo</code>, segment metadata, decode grouping, and CUDA-graph-stable tensors read by dense LoRA kernels.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/lora/moe_lora.py</code></td><td><code>MoeLoraBuffers</code> and <code>MoeLoraContext</code>. Preallocates fixed expert-scoped LoRA pools and exposes the narrow context used by MoE backends.</td></tr>
+          </tbody>
+        </table>
+
+        <h3>Runtime Integration - Modified</h3>
+        <table class="file-list">
+          <thead>
+            <tr><th>File</th><th>Role</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>python/tokenspeed/runtime/lora/lora_manager.py</code></td><td>Top-level adapter lifecycle manager: <code>lora_name</code> to <code>lora_id</code>, CPU/GPU residency, eviction, dense apply calls, and MoE context creation.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/lora/__init__.py</code></td><td>Exports the public LoRA runtime types used by execution and model layers.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/engine/io_struct.py</code></td><td>Adds request/control dataclasses: request-time <code>lora_name</code>, load-time <code>adapter_path</code>, and tokenized <code>lora_id</code>.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/engine/input_processor.py</code></td><td>Resolves request <code>lora_name</code> to internal <code>lora_id</code>; unknown names fail fast instead of falling back to base model.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/engine/async_llm.py</code></td><td>Holds the name-to-id registry used by request processing and scheduler control paths.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/engine/event_loop.py</code></td><td>Owns scheduler-side adapter load/unload, initializes <code>LoraManager</code>, and evicts KV namespaces on unload.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/engine/request_handler.py</code></td><td>Dispatches load/unload ZMQ control messages to the scheduler process.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/engine/scheduler_control_client.py</code></td><td>Sends <code>LoadLoraReqInput(lora_name, adapter_path)</code> and unload requests to scheduler workers.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/entrypoints/engine.py</code></td><td>Exposes the Python API: <code>generate(..., lora_name=...)</code> and <code>load_lora_adapter(lora_name, adapter_path)</code>.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/entrypoints/engine_base.py</code></td><td>Documents the abstract engine API and keeps request names separate from load-time disk paths.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/execution/context.py</code></td><td>Places <code>LoraManager</code>, <code>LoraBatchInfo</code>, and <code>MoeLoraContext</code> on <code>ForwardContext</code>.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/execution/model_runner.py</code></td><td>Calls <code>prepare_loras()</code> from scheduled request <code>lora_id</code> values before model forward.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/execution/cuda_graph_wrapper.py</code></td><td>Captures and replays separate graph variants for no-LoRA and with-LoRA decode batches.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/layers/moe/layer.py</code></td><td>Threads <code>MoeLoraContext</code> from runtime context into MoE backend calls.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/layers/moe/backends/base.py</code></td><td>Extends the backend interface with an optional MoE LoRA context.</td></tr>
+            <tr><td><code>python/tokenspeed/runtime/layers/moe/backends/*/triton.py</code></td><td>Supported Triton MoE backends consume the narrow context and apply expert LoRA deltas around fused MoE compute.</td></tr>
+          </tbody>
+        </table>
+
+        <h3>Scheduler - Modified</h3>
+        <table class="file-list">
+          <thead>
+            <tr><th>File</th><th>Role</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>tokenspeed-scheduler/csrc/scheduler/request_spec.h</code></td><td>Adds <code>RequestSpec.lora_id</code>. <code>0</code> is base model; positive ids identify registered adapters.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/csrc/scheduler/request.h</code> / <code>request.cpp</code></td><td>Stores the request's <code>lora_id</code> and exposes it to scheduling and forward events.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/csrc/fsm/forward_events.h</code> / <code>forward_events.cpp</code></td><td>Carries <code>lora_id</code> through prefill/decode FSM events so prefix-cache match/insert uses the right adapter namespace.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.h</code> / <code>.cpp</code></td><td>Creates per-adapter virtual roots keyed by <code>lora_id</code>, isolates KV reuse across adapters, and supports namespace eviction.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.h</code> / <code>.cpp</code></td><td>Forwards <code>lora_id</code> into the KV prefix cache for hybrid cache users.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/csrc/scheduler/scheduler.h</code> / <code>.cpp</code></td><td>Adds <code>EvictLoraNamespace(lora_id)</code>, used when an adapter is unloaded.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/bindings/python_module.cpp</code></td><td>Exposes <code>RequestSpec.lora_id</code> and scheduler namespace eviction to Python.</td></tr>
+            <tr><td><code>tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp</code></td><td>Covers adapter-specific prefix-cache isolation, base-model isolation, and explicit namespace eviction.</td></tr>
+          </tbody>
+        </table>
+
+        <h3>Kernel Package - Added Or Modified</h3>
+        <table class="file-list">
+          <thead>
+            <tr><th>File</th><th>Role</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/</code></td><td>Triton LoRA operator family: shrink, expand, prefill variants, decode grouping, QKV expand, gate/up expand, tuning helpers, and H100 tuned configs.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/cutedsl.py</code></td><td>Public wrappers for CuTeDSL fast paths used by selected single-slot and batched-slot dense LoRA shapes.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/_provider.py</code></td><td>Provider boundary for optional CuTeDSL availability and import isolation.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/gemm_add.py</code></td><td>CuTeDSL GEMM-add helper used by dense LoRA expand paths.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/lora_gemm.py</code></td><td>CuTeDSL LoRA GEMM kernels for shrink/expand-style dense paths.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/lora_expand_direct.py</code></td><td>Direct expand helper for selected LoRA-B add paths.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/_vendor/</code></td><td>Vendored CuTeDSL support code kept inside <code>tokenspeed-kernel</code>, not imported directly by runtime code.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/__init__.py</code></td><td>Exports the kernel package LoRA ops through the existing kernel boundary.</td></tr>
+            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/_triton.py</code></td><td>Centralizes direct Triton imports so LoRA ops follow the repository kernel dependency rule.</td></tr>
+          </tbody>
+        </table>
+
+        <h3>Tests, Benchmarks, And Docs</h3>
+        <table class="file-list">
+          <thead>
+            <tr><th>File</th><th>Role</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>test/runtime/lora/test_adapter_io.py</code></td><td>Parser tests for dense, MoE per-expert, and 3D MoE adapter formats.</td></tr>
+            <tr><td><code>test/runtime/lora/test_lora_manager.py</code></td><td>Lifecycle, packing, eviction, CPU cache, GPU slot, and metadata behavior.</td></tr>
+            <tr><td><code>test/runtime/lora/test_lora_request_naming.py</code></td><td>Request naming contract: <code>lora_name</code> only, unknown names fail, scalar names propagate across batches.</td></tr>
+            <tr><td><code>test/runtime/lora/test_moe_lora.py</code></td><td>MoE LoRA buffer/context behavior and routed expert-delta application.</td></tr>
+            <tr><td><code>tokenspeed-kernel/test/ops/test_lora_triton.py</code></td><td>Numerical coverage for Triton LoRA kernels.</td></tr>
+            <tr><td><code>tokenspeed-kernel/test/ops/test_lora_cutedsl.py</code></td><td>Numerical coverage for CuTeDSL LoRA fast paths.</td></tr>
+            <tr><td><code>benchmark/test_lora_*.py</code></td><td>Dynamic load/unload, mixed adapter batches, eviction latency, and E2E password-adapter checks.</td></tr>
+            <tr><td><code>docs/serving/lora.md</code></td><td>User-facing serving guide for adapter loading, request selection, and supported MoE adapter formats.</td></tr>
+            <tr><td><code>docs/lora_current_design.html</code></td><td>This current implementation design document.</td></tr>
+          </tbody>
+        </table>
+      </section>
+
+      <section id="data-model">
+        <h2>Data Model</h2>
+        <h3>AdapterWeights</h3>
+        <p>
+          Parsed adapter weights use this logical shape:
+        </p>
+        <pre><code>AdapterWeights = {
+  layer_id: {
+    module_name: (lora_A, lora_B),
+  }
+}</code></pre>
+
+        <p>Dense modules use names like <code>q_proj</code>, <code>o_proj</code>, <code>gate_proj</code>, <code>up_proj</code>, and <code>down_proj</code>.</p>
+        <p>2D MoE per-expert modules use names like <code>experts.7.gate_proj</code>. 3D MoE modules use <code>experts.w1</code>, <code>experts.w2</code>, and <code>experts.w3</code>.</p>
+
+        <h3>Registration State</h3>
+        <pre><code>_name_to_id:    dict[str, int]        # user name -> stable lora_id
+_id_to_name:    dict[int, str]        # stable lora_id -> user name
+_adapter_paths: dict[str, str]        # user name -> durable adapter directory</code></pre>
+
+        <h3>Residency State</h3>
+        <pre><code>_cpu_cache:     dict[str, AdapterWeights]  # parsed host weights
+_cpu_lru:       OrderedDict[str, None]     # CPU eviction order
+_name_to_slot:  dict[str, int]             # GPU-resident name -> slot
+_slot_to_name:  list[str | None]           # slot -> GPU-resident name
+_gpu_lru:       OrderedDict[str, None]     # GPU eviction order</code></pre>
+      </section>
+
+      <section id="lifecycle">
+        <h2>Adapter Lifecycle</h2>
+        <ol>
+          <li><code>load_adapter(name, path)</code> verifies the adapter weight file or directory.</li>
+          <li>A new integer <code>lora_id</code> is assigned and stored in <code>_name_to_id</code> and <code>_id_to_name</code>.</li>
+          <li>The durable path is recorded in the CPU cache object so disk reload remains possible after CPU eviction.</li>
+          <li><code>LoraCpuCache.ensure()</code> synchronously loads, parses, and pins weights into the CPU pool when pinned memory is available.</li>
+          <li>On each forward step, <code>prepare_loras(lora_ids, token_counts)</code> resolves ids to names and then to GPU slots.</li>
+          <li>If an adapter is CPU-resident but not GPU-resident, <code>_ensure_in_gpu()</code> allocates or evicts a slot and calls <code>_load_to_slot()</code>.</li>
+          <li><code>_load_to_slot()</code> resets the target slot, writes rank/scaling metadata, shards on CPU, packs dense buffers, and loads MoE buffers.</li>
+          <li><code>unload_adapter(name)</code> clears GPU slot state, removes CPU cache state, and deletes id mappings.</li>
+        </ol>
+
+        <pre><code>request lora_id
+  -> _id_to_name[lora_id]
+  -> _ensure_in_gpu(name)
+  -> slot
+  -> LoraBatchInfo.weight_indices[segment] = slot</code></pre>
+      </section>
+
+      <section id="eviction">
+        <h2>Eviction</h2>
+        <h3>GPU Pool</h3>
+        <p>
+          The GPU pool has <code>max_loras</code> slots, all of them available
+          for real adapters. Base-model requests do not consume a GPU slot;
+          they write <code>NO_LORA_SLOT = -1</code> into per-step metadata.
+        </p>
+        <ul>
+          <li><code>_find_free_slot()</code> returns the first empty adapter slot.</li>
+          <li>If the pool is full, it scans <code>_gpu_lru</code> from least to most recently used.</li>
+          <li>The selected adapter is removed from <code>_name_to_slot</code>, <code>_slot_to_name</code>, and <code>_gpu_lru</code>.</li>
+          <li>The returned slot is reset before <code>_load_to_slot()</code> copies new weights, so partial adapters cannot inherit stale modules from the previous occupant.</li>
+          <li>Explicit unload also resets dense weights, clears MoE weights, and resets rank/scaling.</li>
+        </ul>
+
+        <h3>CPU Pool</h3>
+        <p>
+          The CPU pool is a second tier bounded by <code>max_loras_cpu</code>.
+          It keeps parsed, pinned weights to avoid repeated safetensors reads
+          and to allow non-blocking H2D copies when the platform supports
+          pinned memory. The default capacity is four times the GPU pool.
+        </p>
+        <ul>
+          <li><code>prefetch(name)</code> starts a best-effort background disk read if the adapter is known and not already loading.</li>
+          <li><code>ensure(name)</code> blocks until a pending load finishes or loads synchronously from disk.</li>
+          <li>CPU eviction prefers adapters that are not currently GPU-resident.</li>
+          <li>If the pool cannot find an evictable entry, loading raises a runtime error with the current LRU state.</li>
+        </ul>
+
+        <div class="callout">
+          GPU eviction does not unregister the adapter. It only removes the
+          temporary slot mapping. The adapter can be promoted again later from
+          CPU cache or disk using its stable <code>name</code> and
+          <code>lora_id</code>.
+        </div>
+      </section>
+
+      <section id="buffers">
+        <h2>GPU Buffers</h2>
+        <p>
+          Dense LoRA weights are packed into fixed-size per-layer buffers. The
+          first dimension is always <code>n_slots</code>, so kernels can select
+          the active adapter by slot without changing pointer addresses.
+          <code>--lora-buffer-groups</code> controls which coarse families are
+          allocated: <code>attn</code>, <code>mlp</code>, and <code>moe</code>.
+        </p>
+        <p>
+          The default is <code>attn,mlp,moe</code>. If a server starts with a
+          group disabled, loading an adapter that targets that group raises a
+          configuration error instead of silently dropping LoRA deltas.
+        </p>
+
+        <table>
+          <thead>
+            <tr><th>Buffer</th><th>Shape</th><th>Notes</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>qkv_A_buffers[layer]</code></td><td><code>(n_slots, 3 * max_rank, hidden)</code></td><td>Q, K, V A matrices stacked by rank block.</td></tr>
+            <tr><td><code>qkv_B_buffers[layer]</code></td><td><code>(n_slots, q_per_tp + 2 * kv_per_tp, max_rank)</code></td><td>Column-parallel output side, sharded per TP rank.</td></tr>
+            <tr><td><code>o_A_buffers[layer]</code></td><td><code>(n_slots, max_rank, o_in_per_tp)</code></td><td>Row-parallel input side, sharded along input dimension.</td></tr>
+            <tr><td><code>o_B_buffers[layer]</code></td><td><code>(n_slots, hidden, max_rank)</code></td><td>Replicated output side.</td></tr>
+            <tr><td><code>gate_up_A_buffers[layer]</code></td><td><code>(n_slots, 2 * max_rank, hidden)</code></td><td>Gate and up A matrices stacked.</td></tr>
+            <tr><td><code>gate_up_B_buffers[layer]</code></td><td><code>(n_slots, 2 * intermediate_per_tp, max_rank)</code></td><td>Column-parallel gate/up output side.</td></tr>
+            <tr><td><code>down_A_buffers[layer]</code></td><td><code>(n_slots, max_rank, intermediate_per_tp)</code></td><td>Row-parallel down input side.</td></tr>
+            <tr><td><code>down_B_buffers[layer]</code></td><td><code>(n_slots, hidden, max_rank)</code></td><td>Replicated down output side.</td></tr>
+          </tbody>
+        </table>
+
+        <h3>TP Sharding Rule</h3>
+        <ul>
+          <li>Column-parallel projections (<code>q/k/v</code>, <code>gate</code>, <code>up</code>) shard <code>lora_B</code> along output dimension.</li>
+          <li>Row-parallel projections (<code>o</code>, <code>down</code>) shard <code>lora_A</code> along input dimension.</li>
+          <li>Sharding happens on CPU before the H2D copy, so each TP rank copies only its local shard into GPU buffers.</li>
+          <li>Downstream all-reduce sums base partials and LoRA partials together for row-parallel outputs.</li>
+        </ul>
+      </section>
+
+      <section id="batch-info">
+        <h2>Batch Metadata</h2>
+        <p>
+          <code>LoraBatchInfo</code> is the contract between Python scheduling
+          and the CUDA/Triton kernels. Its tensors are allocated once at manager
+          construction and updated in place before each forward.
+        </p>
+
+        <table>
+          <thead>
+            <tr><th>Field</th><th>Meaning</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>bs</code></td><td>Number of active request segments.</td></tr>
+            <tr><td><code>num_segments</code></td><td>Currently equal to <code>bs</code>; one segment per request.</td></tr>
+            <tr><td><code>max_len</code></td><td>Maximum segment length in the step; drives decode vs prefill kernel choice.</td></tr>
+            <tr><td><code>seg_lens</code></td><td>Tokens per segment.</td></tr>
+            <tr><td><code>seg_indptr</code></td><td>Prefix sum over segment lengths.</td></tr>
+            <tr><td><code>weight_indices</code></td><td>GPU slot per segment.</td></tr>
+            <tr><td><code>lora_ranks</code></td><td>Per-slot rank tensor read by kernels.</td></tr>
+            <tr><td><code>scalings</code></td><td>Per-slot scaling tensor read by kernels.</td></tr>
+            <tr><td><code>single_lora_slot</code></td><td>Host fast path when every segment uses the same real adapter slot; otherwise <code>NO_LORA_SLOT</code>.</td></tr>
+            <tr><td><code>multi_lora_*</code></td><td>Host metadata for a batched CuTeDSL path when slots are consecutive and same-rank/same-scaling.</td></tr>
+            <tr><td><code>sort_order/group_*</code></td><td>Decode grouping metadata for grouped expand kernels.</td></tr>
+          </tbody>
+        </table>
+
+<pre><code>prepare_loras([adapter_a, adapter_b, 0], [20, 15, 8])
+  -> per_request_slots = [slot_a, slot_b, NO_LORA_SLOT]
+  -> seg_lens          = [20, 15, 8]
+  -> seg_indptr        = [0, 20, 35, 43]
+  -> weight_indices    = [slot_a, slot_b, NO_LORA_SLOT]
+  -> has_active_lora   = true</code></pre>
+      </section>
+
+      <section id="kernels">
+        <h2>Kernel Routing</h2>
+        <p>
+          Dense LoRA applies in two logical phases:
+        </p>
+        <ol>
+          <li><strong>Shrink:</strong> compute <code>lora_a = A @ x</code> using the active slot's A buffer.</li>
+          <li><strong>Expand:</strong> compute and add <code>B @ lora_a * scaling</code> into the base layer output.</li>
+        </ol>
+
+        <table>
+          <thead>
+            <tr><th>Condition</th><th>Path</th></tr>
+          </thead>
+          <tbody>
+            <tr><td><code>max_len &gt; 32</code></td><td>Prefill-style shrink/expand kernels.</td></tr>
+            <tr><td>Decode with grouped slots</td><td>Grouped expand path batches tokens by adapter slot.</td></tr>
+            <tr><td>Single adapter and favorable shape</td><td>CuTeDSL dense GEMM-add fast path.</td></tr>
+            <tr><td>Multiple consecutive slots with same rank/scaling</td><td>Batched CuTeDSL fast path.</td></tr>
+            <tr><td>Fallback</td><td>General Triton shrink/expand kernels.</td></tr>
+          </tbody>
+        </table>
+      </section>
+
+      <section id="moe">
+        <h2>MoE LoRA</h2>
+        <p>
+          MoE LoRA is deliberately separated from dense buffers. The manager
+          owns <code>MoeLoraBuffers</code>, and MoE backends receive a narrow
+          <code>MoeLoraContext</code> instead of depending on the full
+          <code>LoraManager</code>.
+        </p>
+
+        <h3>Supported Formats</h3>
+        <table>
+          <thead>
+            <tr><th>Format</th><th>Parsed module names</th><th>Storage behavior</th></tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>2D per-expert PEFT</td>
+              <td><code>experts.&lt;id&gt;.gate_proj</code>, <code>up_proj</code>, <code>down_proj</code></td>
+              <td>Expert id comes from the key. Each expert has independent A/B tensors.</td>
+            </tr>
+            <tr>
+              <td>3D per-expert</td>
+              <td><code>experts.w1</code>, <code>experts.w2</code>, <code>experts.w3</code></td>
+              <td>Tensor dim0 is <code>num_experts</code>; one slice per expert.</td>
+            </tr>
+            <tr>
+              <td>3D shared-outer</td>
+              <td><code>experts.w1</code>, <code>experts.w2</code>, <code>experts.w3</code></td>
+              <td>Tensor dim0 may be <code>1</code> for the shared side and <code>num_experts</code> for the expert-specific side.</td>
+            </tr>
+          </tbody>
+        </table>
+
+        <h3>Projection Mapping</h3>
+        <pre><code>w1 -> gate_proj
+w3 -> up_proj
+w2 -> down_proj</code></pre>
+
+        <h3>Internal MoE Buffers</h3>
+        <p>
+          MoE LoRA now mirrors the dense/vLLM-style slot model: buffers are
+          preallocated per layer with leading dimensions
+          <code>(n_slots, num_experts, ...)</code>. Loading an adapter writes
+          into the selected slot; <code>weights_by_layer[layer][slot]</code>
+          stores views into those fixed buffers for backend consumption.
+        </p>
+        <pre><code>w13_A_buffers[layer]:  (n_slots, num_experts, 2 * max_rank, hidden)
+w13_B_buffers[layer]:  (n_slots, num_experts, 2 * moe_intermediate_per_tp, 2 * max_rank)
+down_A_buffers[layer]: (n_slots, num_experts, max_rank, moe_intermediate_per_tp)
+down_B_buffers[layer]: (n_slots, num_experts, hidden, max_rank)
+
+weights_by_layer[layer_id][slot] = {
+  "w13_A":  w13_A_buffers[layer_id][slot],
+  "w13_B":  w13_B_buffers[layer_id][slot],
+  "down_A": down_A_buffers[layer_id][slot],
+  "down_B": down_B_buffers[layer_id][slot],
+}</code></pre>
+        <p>
+          Slot reset zeros both dense and MoE fixed pools before reuse, so
+          partial MoE adapters cannot inherit expert weights from a previous
+          adapter in the same slot.
+        </p>
+        <p>
+          With <code>--lora-moe-compressed-shared-outer</code>, MoE allocation
+          switches to the 3D shared-outer layout instead of full expansion:
+        </p>
+        <pre><code>w13_A_buffers[layer]:  (n_slots, 1, 2 * max_rank, hidden)
+w13_B_buffers[layer]:  (n_slots, num_experts, 2 * moe_intermediate_per_tp, 2 * max_rank)
+down_A_buffers[layer]: (n_slots, num_experts, max_rank, moe_intermediate_per_tp)
+down_B_buffers[layer]: (n_slots, 1, hidden, max_rank)</code></pre>
+        <p>
+          This compressed mode supports shared-outer 3D adapters
+          (<code>w1/w3</code> A shared, <code>w1/w3</code> B per-expert,
+          <code>w2</code> A per-expert, <code>w2</code> B shared). It rejects
+          per-expert and 2D MoE adapters because those require full expert
+          storage for every side.
+        </p>
+
+        <h3>Shared-Outer MoE Contract</h3>
+        <p>
+          The 3D shared-outer layout follows the hybrid MoE-LoRA design from
+          Together's research notes. The low-rank side that builds a compact
+          representation can be shared when the representation is common across
+          experts, while the side that interprets an expert-specific activation
+          remains per expert.
+        </p>
+        <table>
+          <thead>
+            <tr><th>Projection</th><th>Shared side</th><th>Per-expert side</th><th>TokenSpeed buffer</th></tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>Gate <code>w1</code></td>
+              <td><code>lora_A</code>, dim0 = <code>1</code></td>
+              <td><code>lora_B</code>, dim0 = <code>num_experts</code></td>
+              <td>First rank slice of <code>w13_A</code> and first intermediate slice of <code>w13_B</code></td>
+            </tr>
+            <tr>
+              <td>Up <code>w3</code></td>
+              <td><code>lora_A</code>, dim0 = <code>1</code></td>
+              <td><code>lora_B</code>, dim0 = <code>num_experts</code></td>
+              <td>Second rank slice of <code>w13_A</code> and second intermediate slice of <code>w13_B</code></td>
+            </tr>
+            <tr>
+              <td>Down <code>w2</code></td>
+              <td><code>lora_B</code>, dim0 = <code>1</code></td>
+              <td><code>lora_A</code>, dim0 = <code>num_experts</code></td>
+              <td><code>down_A</code> per expert and <code>down_B</code> shared</td>
+            </tr>
+          </tbody>
+        </table>
+
+        <pre><code>expected 3D shared-outer dim0:
+  experts.w1: A = 1,           B = num_experts
+  experts.w3: A = 1,           B = num_experts
+  experts.w2: A = num_experts, B = 1</code></pre>
+
+        <p>
+          In full mode, TokenSpeed expands any dim0=<code>1</code> shared tensor
+          into every expert slot during load. In compressed mode, the shared
+          side stays physically shared in the GPU pool and
+          <code>MoeLoraContext._select_expert_weights()</code> broadcasts it at
+          apply time. This saves <code>(num_experts - 1) * rank * (3 * hidden)</code>
+          elements per adapter slot per MoE layer, because only
+          <code>w13_A</code> and <code>down_B</code> stop carrying duplicate
+          expert copies.
+        </p>
+
+        <h3>Route-Level Math</h3>
+        <p>
+          For each routed pair <code>(token t, expert e)</code> and adapter
+          slot <code>s</code>, MoE LoRA adds deltas at the same points as the
+          base MoE projections. Gate/up LoRA is added before the activation;
+          down LoRA is multiplied by the router weight before it is accumulated
+          into the final routed output.
+        </p>
+        <pre><code>gate_up_delta[t, e] =
+  ((hidden[t] @ w13_A[s, e].T) @ w13_B[s, e].T) * scaling[s]
+
+gate_up_output[t, e] += gate_up_delta[t, e]
+
+down_delta[t, e] =
+  ((intermediate[t, e] @ down_A[s, e].T) @ down_B[s, e].T)
+  * topk_weights[t, e] * scaling[s]
+
+down_output[t, e] += down_delta[t, e]</code></pre>
+
+        <p>
+          When a side is shared, the effective expert index is <code>0</code>
+          for that side. The apply path therefore uses the same equations for
+          full per-expert and shared-outer adapters; only the tensor selection
+          changes.
+        </p>
+
+        <h3>Optimization Notes</h3>
+        <table>
+          <thead>
+            <tr><th>Idea from the research note</th><th>TokenSpeed status</th></tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>Compute shared gate/up A once per token, then reuse it for every routed expert.</td>
+              <td>Storage supports this shape, but the current apply path still evaluates per routed pair. A future fused kernel can exploit the shared side directly.</td>
+            </tr>
+            <tr>
+              <td>For shared down B, combine weighted low-rank intermediates first, then apply one shared B projection.</td>
+              <td>The current implementation applies the down delta per route and weights it before accumulation. This is correct and leaves the fused shared-B reduction as a kernel optimization.</td>
+            </tr>
+            <tr>
+              <td>Group work by <code>(adapter slot, expert id)</code> for better locality.</td>
+              <td>Dense LoRA already groups by adapter for some paths. MoE LoRA currently keeps a narrow context API so backends can add this grouping without changing manager ownership.</td>
+            </tr>
+          </tbody>
+        </table>
+
+        <h3>Runtime Apply</h3>
+        <ul>
+          <li><code>MoELayer.forward()</code> obtains the current manager through explicit argument or <code>get_current_lora_manager()</code>.</li>
+          <li>If the backend advertises <code>supports_moe_lora</code>, it receives <code>moe_lora_context</code>.</li>
+          <li>The Triton MoE path applies gate/up LoRA after the first expert GEMM and before activation.</li>
+          <li>It applies down LoRA after the down expert GEMM and before final route combine.</li>
+          <li>For mixed-adapter batches, <code>MoeLoraContext</code> expands segment slots to token slots and masks base-model tokens.</li>
+          <li>If token ownership changes under expert parallel dispatch, mixed LoRA is disabled rather than applying an incorrect slot map.</li>
+        </ul>
+
+        <div class="callout">
+          Current MoE LoRA support is local or tensor-parallel MoE only.
+          Expert-parallel MoE needs the LoRA slot map dispatched with tokens.
+        </div>
+      </section>
+
+      <section id="cuda-graph">
+        <h2>CUDA Graph</h2>
+        <p>
+          The CUDA graph design relies on stable pointers. Adapter contents,
+          segment lengths, slot ids, ranks, and scalings can change between
+          replays, but the tensors holding those values do not move.
+        </p>
+
+        <h3>Capture</h3>
+        <ul>
+          <li>When LoRA is enabled, <code>CudaGraphWrapper.capture()</code> captures two graphs per batch size.</li>
+          <li>The with-LoRA graph sets <code>ctx.lora_manager</code> and calls <code>prepare_loras([0] * bs)</code> before capture so metadata tensors contain <code>NO_LORA_SLOT</code> while kernels capture stable pointers.</li>
+          <li>The no-LoRA graph leaves <code>ctx.lora_manager</code> unset, so model-layer branches skip LoRA calls entirely.</li>
+          <li>No-LoRA capture is safe because base-model dummy ids resolve to <code>NO_LORA_SLOT</code>; runtime LoRA paths skip work when no real adapter is active.</li>
+        </ul>
+
+        <h3>Replay</h3>
+        <ol>
+          <li><code>ModelExecutor</code> builds the real <code>lora_ids</code> list for the scheduled requests.</li>
+          <li><code>prepare_loras()</code> updates the persistent <code>LoraBatchInfo</code> tensors in place.</li>
+          <li>If any id is nonzero, <code>ctx.lora_manager</code> is set and LoRA-capable layers call apply methods.</li>
+          <li><code>CudaGraphWrapper</code> chooses the no-LoRA graph if <code>has_active_lora</code> is false, otherwise it replays the with-LoRA graph.</li>
+          <li>The captured kernels read the updated metadata and use the current slot-to-weight buffers.</li>
+        </ol>
+
+        <pre><code>capture time:
+  batch_info tensors allocated once
+  graph records pointers to batch_info, ranks, scalings, and weight buffers
+
+replay time:
+  prepare_loras() mutates tensor contents
+  graph.replay() reads new contents through old pointers</code></pre>
+
+        <h3>Why Two Graphs?</h3>
+        <p>
+          The with-LoRA graph includes LoRA kernel launches. That is necessary
+          when any request uses an adapter. For all-base batches, the no-LoRA
+          graph avoids those launches entirely and preserves base-model decode
+          performance.
+        </p>
+      </section>
+
+      <section id="limitations">
+        <h2>Limitations and Open Edges</h2>
+        <ul>
+          <li><span class="warn">MoE EP:</span> Expert-parallel MoE is rejected for MoE LoRA until the slot map is dispatched alongside routed tokens.</li>
+          <li><span class="warn">2D hybrid shared:</span> The <code>experts.shared.*</code> 2D hybrid-shared format is not currently supported.</li>
+          <li><span class="warn">Model hooks:</span> Dense LoRA requires model layers to call the manager apply methods at projection boundaries.</li>
+          <li><span class="warn">Slot identity:</span> External code should not persist GPU slots. Only <code>lora_id</code> and adapter names are stable.</li>
+        </ul>
+      </section>
+    </main>
+  </div>
+</body>
+</html>
diff --git a/docs/serving/lora.md b/docs/serving/lora.md
new file mode 100644
index 000000000..403cec12b
--- /dev/null
+++ b/docs/serving/lora.md
@@ -0,0 +1,62 @@
+# LoRA Serving
+
+TokenSpeed supports PEFT-style LoRA adapters for dense attention and MLP
+modules. Dense adapters target:
+
+- `q_proj`, `k_proj`, `v_proj`, `o_proj`
+- `gate_proj`, `up_proj`, `down_proj`
+
+Generation requests select adapters by registered `lora_name`. They do not
+load adapters from disk. Register the adapter first with `load_lora_adapter`
+using a durable adapter path, then pass that name on requests:
+
+```python
+engine.load_lora_adapter("password_adapter", "/path/to/adapter_0")
+engine.generate("...", lora_name="password_adapter")
+```
+
+Requests cannot load adapters from disk and do not accept a request-time
+filesystem path. Unknown `lora_name` values fail fast; use the base model by
+omitting `lora_name`.
+
+MoE LoRA support is available for expert-scoped weights on Triton MoE
+backends. The PEFT per-expert format uses 2D tensors and includes the expert id
+in each key:
+
+```text
+base_model.model.model.layers.<layer>.mlp.experts.<expert>.gate_proj.lora_A.weight
+base_model.model.model.layers.<layer>.mlp.experts.<expert>.gate_proj.lora_B.weight
+base_model.model.model.layers.<layer>.mlp.experts.<expert>.up_proj.lora_A.weight
+base_model.model.model.layers.<layer>.mlp.experts.<expert>.up_proj.lora_B.weight
+base_model.model.model.layers.<layer>.mlp.experts.<expert>.down_proj.lora_A.weight
+base_model.model.model.layers.<layer>.mlp.experts.<expert>.down_proj.lora_B.weight
+```
+
+TokenSpeed also accepts 3D MoE LoRA tensors under the SGLang-style
+`experts.w1`, `experts.w2`, and `experts.w3` names:
+
+```text
+base_model.model.model.layers.<layer>.mlp.experts.w1.lora_A.weight
+base_model.model.model.layers.<layer>.mlp.experts.w1.lora_B.weight
+base_model.model.model.layers.<layer>.mlp.experts.w2.lora_A.weight
+base_model.model.model.layers.<layer>.mlp.experts.w2.lora_B.weight
+base_model.model.model.layers.<layer>.mlp.experts.w3.lora_A.weight
+base_model.model.model.layers.<layer>.mlp.experts.w3.lora_B.weight
+```
+
+`w1` maps to `gate_proj`, `w3` maps to `up_proj`, and `w2` maps to
+`down_proj`. For these tensors, dimension 0 may be either `num_experts` for a
+fully per-expert side or `1` for a shared side. This covers both 3D per-expert
+and 3D shared-outer adapter layouts.
+
+The 2D hybrid-shared `experts.shared.*` format is not currently supported.
+
+The current MoE path is guarded to local or tensor-parallel MoE execution.
+Expert-parallel dispatch is rejected for MoE LoRA because token ownership and
+the LoRA slot map must be dispatched together before expert compute.
+
+Implementation note: dense adapter lifecycle and cache residency are still
+owned by `LoraManager`, while expert-scoped MoE tensors are held behind a
+`MoeLoraContext` consumed by MoE backends. New MoE LoRA kernels should live
+behind the `tokenspeed-kernel` boundary and use that context rather than
+depending on the full manager object.
diff --git a/docs/tokenspeed_structure.html b/docs/tokenspeed_structure.html
new file mode 100644
index 000000000..e79cb2f78
--- /dev/null
+++ b/docs/tokenspeed_structure.html
@@ -0,0 +1,653 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>TokenSpeed — Codebase Structure</title>
+<style>
+:root {
+  --bg:#0d1117; --surface:#161b22; --surface2:#1c2128; --border:#30363d;
+  --text:#e6edf3; --muted:#8b949e; --accent:#58a6ff; --accent2:#79c0ff;
+  --green:#3fb950; --red:#f85149; --yellow:#d29922; --orange:#e3b341;
+  --purple:#bc8cff; --cyan:#39d5ff; --code-bg:#1c2128;
+}
+*{box-sizing:border-box;margin:0;padding:0;}
+body{background:var(--bg);color:var(--text);font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;font-size:13.5px;line-height:1.6;}
+a{color:var(--accent);text-decoration:none;}
+a:hover{text-decoration:underline;}
+
+/* Layout */
+.layout{display:grid;grid-template-columns:280px 1fr;min-height:100vh;}
+.sidebar{background:var(--surface);border-right:1px solid var(--border);padding:20px 0;position:sticky;top:0;height:100vh;overflow-y:auto;}
+.sidebar-logo{padding:0 18px 16px;border-bottom:1px solid var(--border);margin-bottom:12px;}
+.sidebar-logo h1{font-size:15px;font-weight:700;color:var(--text);}
+.sidebar-logo p{font-size:11px;color:var(--muted);margin-top:2px;}
+nav h3{font-size:10px;font-weight:600;text-transform:uppercase;letter-spacing:.08em;color:var(--muted);padding:8px 18px 4px;}
+nav ul{list-style:none;}
+nav li a{display:flex;align-items:center;gap:6px;padding:4px 18px;color:var(--muted);font-size:12.5px;transition:color .15s,background .15s;}
+nav li a:hover{color:var(--text);background:rgba(255,255,255,.04);text-decoration:none;}
+nav li a .dot{width:6px;height:6px;border-radius:50%;flex-shrink:0;}
+.main{padding:36px 52px;max-width:1100px;}
+h1.page-title{font-size:26px;font-weight:700;margin-bottom:6px;}
+h1.page-title .sub{font-size:13px;font-weight:400;color:var(--muted);display:block;margin-top:3px;}
+h2{font-size:18px;font-weight:600;margin:40px 0 14px;padding-bottom:8px;border-bottom:1px solid var(--border);}
+h3{font-size:14px;font-weight:600;margin:20px 0 8px;color:var(--orange);}
+p{margin-bottom:10px;color:var(--text);}
+code{font-family:'SF Mono',Consolas,monospace;font-size:12px;background:var(--code-bg);border:1px solid var(--border);border-radius:3px;padding:1px 5px;color:var(--accent2);}
+pre{background:var(--code-bg);border:1px solid var(--border);border-radius:8px;padding:14px 16px;overflow-x:auto;margin-bottom:14px;font-family:'SF Mono',Consolas,monospace;font-size:12px;line-height:1.5;}
+
+/* Package cards */
+.pkg-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(220px,1fr));gap:12px;margin:16px 0;}
+.pkg-card{background:var(--surface);border:1px solid var(--border);border-radius:8px;padding:14px;}
+.pkg-card:hover{border-color:var(--accent);transition:border-color .2s;}
+.pkg-card .pkg-name{font-weight:700;font-size:13px;margin-bottom:4px;}
+.pkg-card .pkg-lang{font-size:10px;font-weight:600;padding:1px 6px;border-radius:8px;display:inline-block;margin-bottom:6px;}
+.lang-py{background:#1a2a3a;color:var(--accent);}
+.lang-cpp{background:#2a1a3a;color:var(--purple);}
+.lang-cuda{background:#1a3a28;color:var(--green);}
+.pkg-card p{font-size:12px;color:var(--muted);margin:0;}
+
+/* File tree */
+.tree{background:var(--code-bg);border:1px solid var(--border);border-radius:8px;padding:16px;font-family:'SF Mono',Consolas,monospace;font-size:12px;line-height:1.7;overflow-x:auto;}
+.tree .dir{color:var(--accent2);}
+.tree .file{color:var(--text);}
+.tree .comment{color:var(--muted);}
+.tree .hl{color:var(--orange);}
+
+/* Component table */
+table{width:100%;border-collapse:collapse;margin-bottom:16px;font-size:12.5px;}
+th{background:var(--surface);color:var(--muted);font-weight:600;text-align:left;padding:7px 12px;border:1px solid var(--border);font-size:11px;text-transform:uppercase;letter-spacing:.05em;}
+td{padding:7px 12px;border:1px solid var(--border);vertical-align:top;}
+tr:nth-child(even) td{background:rgba(255,255,255,.015);}
+td code{font-size:11.5px;}
+td:first-child code{color:var(--green);}
+
+/* Architecture diagram */
+.arch{background:var(--surface2);border:1px solid var(--border);border-radius:10px;padding:20px;margin:16px 0;}
+.arch-row{display:flex;align-items:stretch;gap:8px;margin-bottom:8px;flex-wrap:wrap;}
+.arch-box{background:var(--surface);border:1px solid var(--border);border-radius:6px;padding:8px 12px;font-size:12px;flex:1;min-width:120px;}
+.arch-box.accent{border-color:var(--accent);background:#1a2a3a;}
+.arch-box.green{border-color:var(--green);background:#1a3020;}
+.arch-box.purple{border-color:var(--purple);background:#2a1a3a;}
+.arch-box.orange{border-color:var(--orange);background:#3a2a1a;}
+.arch-box.cyan{border-color:var(--cyan);background:#1a3035;}
+.arch-box .label{font-weight:700;margin-bottom:2px;font-size:12px;}
+.arch-box .sub{color:var(--muted);font-size:11px;}
+.arch-arrow{display:flex;align-items:center;color:var(--muted);font-size:18px;padding:0 2px;}
+.arch-sep{height:1px;background:var(--border);margin:8px 0;}
+
+/* Flow */
+.flow{display:flex;align-items:center;gap:0;flex-wrap:wrap;margin:10px 0;}
+.fbox{background:var(--surface);border:1px solid var(--border);border-radius:5px;padding:5px 10px;font-size:11.5px;white-space:nowrap;}
+.fbox.a{border-color:var(--accent);color:var(--accent);}
+.fbox.g{border-color:var(--green);color:var(--green);}
+.fbox.o{border-color:var(--orange);color:var(--orange);}
+.farrow{color:var(--muted);padding:0 5px;font-size:14px;}
+
+/* Callout */
+.note{background:#1a2a3a;border-left:3px solid var(--accent);border-radius:0 6px 6px 0;padding:10px 14px;margin-bottom:12px;font-size:12.5px;}
+.note.g{background:#1a3020;border-color:var(--green);}
+.note.o{background:#3a2a1a;border-color:var(--orange);}
+.note strong{display:block;margin-bottom:3px;font-size:12.5px;}
+
+/* Stats bar */
+.stats{display:flex;gap:20px;margin:14px 0;flex-wrap:wrap;}
+.stat{background:var(--surface);border:1px solid var(--border);border-radius:8px;padding:12px 18px;text-align:center;}
+.stat .n{font-size:22px;font-weight:700;line-height:1;}
+.stat .l{font-size:11px;color:var(--muted);margin-top:3px;}
+.stat.accent .n{color:var(--accent);}
+.stat.green  .n{color:var(--green);}
+.stat.purple .n{color:var(--purple);}
+.stat.orange .n{color:var(--orange);}
+.stat.cyan   .n{color:var(--cyan);}
+
+/* Color dots */
+.dot-py{background:var(--accent);}
+.dot-cpp{background:var(--purple);}
+.dot-cuda{background:var(--green);}
+.dot-cfg{background:var(--orange);}
+
+/* Collapsible */
+details summary{cursor:pointer;list-style:none;padding:6px 0;color:var(--accent);font-size:12.5px;user-select:none;}
+details summary::-webkit-details-marker{display:none;}
+details summary::before{content:"▶ ";font-size:10px;}
+details[open] summary::before{content:"▼ ";}
+details .inner{padding:8px 0 4px 14px;}
+</style>
+</head>
+<body>
+<div class="layout">
+
+<!-- SIDEBAR -->
+<nav class="sidebar">
+  <div class="sidebar-logo">
+    <h1>TokenSpeed</h1>
+    <p>Codebase Structure Reference</p>
+  </div>
+  <h3>Overview</h3>
+  <ul>
+    <li><a href="#top"><span class="dot dot-py"></span>Summary</a></li>
+    <li><a href="#arch"><span class="dot dot-py"></span>Architecture</a></li>
+    <li><a href="#flow"><span class="dot dot-py"></span>Request Flow</a></li>
+  </ul>
+  <h3>Packages</h3>
+  <ul>
+    <li><a href="#runtime"><span class="dot dot-py"></span>python/runtime</a></li>
+    <li><a href="#engine"><span class="dot dot-py"></span>↳ engine/</a></li>
+    <li><a href="#execution"><span class="dot dot-py"></span>↳ execution/</a></li>
+    <li><a href="#models"><span class="dot dot-py"></span>↳ models/</a></li>
+    <li><a href="#layers"><span class="dot dot-py"></span>↳ layers/</a></li>
+    <li><a href="#cache"><span class="dot dot-py"></span>↳ cache/</a></li>
+    <li><a href="#entrypoints"><span class="dot dot-py"></span>↳ entrypoints/</a></li>
+    <li><a href="#kernel"><span class="dot dot-cuda"></span>tokenspeed-kernel</a></li>
+    <li><a href="#mla"><span class="dot dot-cuda"></span>tokenspeed-mla</a></li>
+    <li><a href="#scheduler"><span class="dot dot-cpp"></span>tokenspeed-scheduler</a></li>
+    <li><a href="#fsm"><span class="dot dot-cpp"></span>↳ fsm/</a></li>
+    <li><a href="#resources"><span class="dot dot-cpp"></span>↳ resource/</a></li>
+  </ul>
+  <h3>More</h3>
+  <ul>
+    <li><a href="#kernel-selection"><span class="dot dot-cuda"></span>Kernel Selection</a></li>
+    <li><a href="#lora"><span class="dot dot-py"></span>LoRA Integration</a></li>
+    <li><a href="#tests"><span class="dot dot-cfg"></span>Tests</a></li>
+    <li><a href="#filetree"><span class="dot dot-cfg"></span>Full File Tree</a></li>
+  </ul>
+</nav>
+
+<!-- MAIN -->
+<main class="main">
+
+<h1 class="page-title" id="top">TokenSpeed Codebase Structure
+  <span class="sub">Multi-package inference engine &nbsp;·&nbsp; ~90K lines &nbsp;·&nbsp; Python + C++ + CUDA</span>
+</h1>
+
+<div class="stats">
+  <div class="stat accent"><div class="n">4</div><div class="l">Packages</div></div>
+  <div class="stat green"><div class="n">55K</div><div class="l">Python lines</div></div>
+  <div class="stat purple"><div class="n">10K</div><div class="l">C++ lines</div></div>
+  <div class="stat orange"><div class="n">20K+</div><div class="l">Kernel lines</div></div>
+  <div class="stat cyan"><div class="n">100+</div><div class="l">Test files</div></div>
+</div>
+
+<div class="pkg-grid">
+  <div class="pkg-card">
+    <div class="pkg-name">python/</div>
+    <span class="pkg-lang lang-py">Python</span>
+    <p>Core inference runtime: engine, models, layers, cache, distributed serving, OpenAI HTTP API.</p>
+  </div>
+  <div class="pkg-card">
+    <div class="pkg-name">tokenspeed-kernel/</div>
+    <span class="pkg-lang lang-cuda">CUDA / Triton</span>
+    <p>Pluggable kernel library with multi-backend auto-selection. Attention, GEMM, MoE, quantization.</p>
+  </div>
+  <div class="pkg-card">
+    <div class="pkg-name">tokenspeed-mla/</div>
+    <span class="pkg-lang lang-cuda">CuTe DSL</span>
+    <p>Blackwell-optimised Multi-head Latent Attention (MLA) kernels: prefill, decode FP16/FP8, KV packing.</p>
+  </div>
+  <div class="pkg-card">
+    <div class="pkg-name">tokenspeed-scheduler/</div>
+    <span class="pkg-lang lang-cpp">C++20</span>
+    <p>High-performance scheduler: FSM-driven request lifecycle, radix-tree KV prefix cache, resource allocation.</p>
+  </div>
+</div>
+
+<!-- ARCHITECTURE -->
+<h2 id="arch">Architecture Overview</h2>
+<div class="arch">
+  <!-- HTTP layer -->
+  <div class="arch-row">
+    <div class="arch-box accent" style="flex:1">
+      <div class="label">HTTP API (entrypoints/)</div>
+      <div class="sub">/v1/chat/completions &nbsp;·&nbsp; /v1/completions &nbsp;·&nbsp; /v1/embeddings</div>
+    </div>
+  </div>
+  <div class="arch-row">
+    <div class="arch-box accent" style="flex:1">
+      <div class="label">AsyncLLM / Engine (engine/)</div>
+      <div class="sub">RequestHandler &nbsp;·&nbsp; InputProcessor &nbsp;·&nbsp; OutputProcessor &nbsp;·&nbsp; SchedulerControlClient</div>
+    </div>
+  </div>
+  <div class="arch-sep"></div>
+  <!-- Scheduler layer -->
+  <div class="arch-row">
+    <div class="arch-box purple" style="flex:1">
+      <div class="label">C++ Scheduler (tokenspeed-scheduler)</div>
+      <div class="sub">FSM state machine &nbsp;·&nbsp; KV prefix cache &nbsp;·&nbsp; Page allocators &nbsp;·&nbsp; ExecutionPlan generation</div>
+    </div>
+  </div>
+  <div class="arch-sep"></div>
+  <!-- Execution layer -->
+  <div class="arch-row">
+    <div class="arch-box green" style="flex:3">
+      <div class="label">ModelRunner / ModelExecutor (execution/)</div>
+      <div class="sub">CUDA graph capture &amp; replay &nbsp;·&nbsp; Batch forward &nbsp;·&nbsp; Weight loading</div>
+    </div>
+    <div class="arch-box orange" style="flex:1">
+      <div class="label">KV Cache (cache/)</div>
+      <div class="sub">Prefix cache &nbsp;·&nbsp; Host/disk backends &nbsp;·&nbsp; LoRA namespacing</div>
+    </div>
+    <div class="arch-box" style="flex:1">
+      <div class="label">Sampling (sampling/)</div>
+      <div class="sub">Logit processors &nbsp;·&nbsp; Top-k/p &nbsp;·&nbsp; Grammar</div>
+    </div>
+  </div>
+  <div class="arch-sep"></div>
+  <!-- Model layer -->
+  <div class="arch-row">
+    <div class="arch-box" style="flex:1">
+      <div class="label">Models (models/)</div>
+      <div class="sub">Qwen3 &nbsp;·&nbsp; DeepSeek V3/V4 &nbsp;·&nbsp; Llama &nbsp;·&nbsp; MiniMax &nbsp;·&nbsp; 10+ architectures</div>
+    </div>
+    <div class="arch-box" style="flex:1">
+      <div class="label">Layers (layers/)</div>
+      <div class="sub">Linear &nbsp;·&nbsp; Attention &nbsp;·&nbsp; MoE &nbsp;·&nbsp; LayerNorm &nbsp;·&nbsp; RoPE &nbsp;·&nbsp; Quantization</div>
+    </div>
+  </div>
+  <div class="arch-sep"></div>
+  <!-- Kernel layer -->
+  <div class="arch-row">
+    <div class="arch-box cyan" style="flex:1">
+      <div class="label">tokenspeed-kernel</div>
+      <div class="sub">Multi-backend auto-select &nbsp;·&nbsp; Attention/GEMM/MoE/Quant &nbsp;·&nbsp; Triton / CUDA / TRT-LLM / FlashInfer</div>
+    </div>
+    <div class="arch-box cyan" style="flex:0 0 180px">
+      <div class="label">tokenspeed-mla</div>
+      <div class="sub">MLA prefill/decode &nbsp;·&nbsp; FP8 &nbsp;·&nbsp; Blackwell</div>
+    </div>
+  </div>
+</div>
+
+<!-- REQUEST FLOW -->
+<h2 id="flow">Request Flow</h2>
+<div class="flow">
+  <div class="fbox a">POST /v1/chat/completions</div><span class="farrow">→</span>
+  <div class="fbox">serving_chat.py</div><span class="farrow">→</span>
+  <div class="fbox">InputProcessor<br><small>tokenize</small></div><span class="farrow">→</span>
+  <div class="fbox">AsyncLLM<br><small>enqueue</small></div>
+</div>
+<div class="flow" style="margin-top:4px">
+  <div class="fbox">C++ Scheduler<br><small>prefix match, plan</small></div><span class="farrow">→</span>
+  <div class="fbox">ModelExecutor<br><small>forward pass</small></div><span class="farrow">→</span>
+  <div class="fbox">Model layers<br><small>via kernels</small></div><span class="farrow">→</span>
+  <div class="fbox g">Sample + stream<br><small>OutputProcessor</small></div>
+</div>
+
+<!-- RUNTIME -->
+<h2 id="runtime">Python Runtime — <code>python/tokenspeed/runtime/</code></h2>
+
+<h3 id="engine">engine/</h3>
+<p>Async request lifecycle management — from HTTP intake to token streaming.</p>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>async_llm.py</code></td><td>Main async event loop; <code>AsyncLLM</code> class; routes requests, drives scheduler, streams results</td></tr>
+  <tr><td><code>event_loop.py</code></td><td>Subprocess event loop; owns C++ scheduler + model executor; drives the scheduling cycle</td></tr>
+  <tr><td><code>llm.py</code></td><td>Sync wrapper around <code>AsyncLLM</code> for blocking callers</td></tr>
+  <tr><td><code>request_handler.py</code></td><td>Dispatches incoming ZMQ messages (generate, abort, flush, LoRA load/unload…)</td></tr>
+  <tr><td><code>input_processor.py</code></td><td>Tokenises prompts; resolves request <code>lora_name</code> → <code>lora_id</code></td></tr>
+  <tr><td><code>output_processor.py</code></td><td>Detokenises generated tokens and streams to client</td></tr>
+  <tr><td><code>io_struct.py</code></td><td>All request/response dataclasses (<code>GenerateReqInput</code>, <code>LoadLoraReqInput</code>, …)</td></tr>
+  <tr><td><code>schedule_batch.py</code></td><td>Assembles per-forward-op batch metadata from the C++ scheduler plan</td></tr>
+  <tr><td><code>scheduler_utils.py</code></td><td><code>make_spec()</code>, <code>make_config()</code>; helpers bridging Python↔C++ scheduler</td></tr>
+  <tr><td><code>scheduler_control_client.py</code></td><td>ZMQ communicators for weight updates, flush, profile, LoRA operations</td></tr>
+  <tr><td><code>core_client.py</code></td><td>ZMQ client to the model-executor subprocess</td></tr>
+  <tr><td><code>generation_output_processor.py</code></td><td>Aggregates token outputs, handles streaming + stop conditions</td></tr>
+</table>
+
+<h3 id="execution">execution/</h3>
+<p>GPU forward-pass orchestration: CUDA graph capture, weight loading, batch preparation.</p>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>model_runner.py</code></td><td>Calls model <code>forward()</code> with the right context; handles prefill vs decode</td></tr>
+  <tr><td><code>model_executor.py</code></td><td>Wraps <code>model_runner</code>; builds <code>ForwardContext</code>; injects LoRA weight indices; manages stats</td></tr>
+  <tr><td><code>cuda_graph_wrapper.py</code></td><td>Captures and replays CUDA graphs; manages decode graph pool</td></tr>
+  <tr><td><code>context.py</code></td><td><code>ForwardContext</code> dataclass: attn backend, KV pool, LoRA info, batch metadata</td></tr>
+  <tr><td><code>forward_batch_info.py</code></td><td><code>ForwardMode</code> enum (EXTEND / DECODE / IDLE); batch shape metadata</td></tr>
+  <tr><td><code>input_buffer.py</code></td><td>Pre-allocated GPU tensors for batched inputs (token IDs, positions, lengths…)</td></tr>
+  <tr><td><code>weight_loader.py</code></td><td>Loads safetensors/pickle checkpoints; prefetches shards in background threads</td></tr>
+  <tr><td><code>cache_loc_kernel.py</code></td><td>Triton kernel that fills the block-table tensor from scheduler page IDs</td></tr>
+  <tr><td><code>factory.py</code></td><td><code>create_model_executor()</code>, <code>create_model_runner()</code>, <code>create_attn_components()</code></td></tr>
+  <tr><td><code>distributed_initializer.py</code></td><td>NCCL process-group init; TP/DP rank assignment</td></tr>
+  <tr><td><code>drafter/eagle.py</code></td><td>Eagle-3 speculative decoding draft model wrapper</td></tr>
+</table>
+
+<h3 id="models">models/</h3>
+<p>Architecture implementations — each model defines attention, MLP, and embedding layers with weight loading.</p>
+<table>
+  <tr><th>File</th><th>Architecture</th><th>Notes</th></tr>
+  <tr><td><code>qwen3.py</code></td><td>Qwen3-8B/72B</td><td>GQA + qk-norm; LoRA injection added</td></tr>
+  <tr><td><code>qwen3_5.py</code></td><td>Qwen3.5 MoE</td><td>Sparse MoE variant</td></tr>
+  <tr><td><code>deepseek_v3.py</code></td><td>DeepSeek V3</td><td>MLA + MoE; 2K lines</td></tr>
+  <tr><td><code>deepseek_v4.py</code></td><td>DeepSeek V4</td><td>MLA + LoRA rank projections (q, kv); 1700 lines</td></tr>
+  <tr><td><code>llama.py</code></td><td>Llama 2/3</td><td>Standard GQA + RoPE</td></tr>
+  <tr><td><code>llama_eagle3.py</code></td><td>Llama + Eagle3</td><td>Speculative decoding variant</td></tr>
+  <tr><td><code>minimax_m2.py</code></td><td>MiniMax M2</td><td>MLA architecture</td></tr>
+  <tr><td><code>longcat_flash.py</code></td><td>LongCat-Flash</td><td>Long-context variant</td></tr>
+  <tr><td><code>deepseek_nextn.py</code></td><td>DeepSeek NextN</td><td>Next-token prediction variant</td></tr>
+  <tr><td><code>registry.py</code></td><td>—</td><td>Maps HF config <code>model_type</code> to implementation class</td></tr>
+  <tr><td><code>base/causal_lm.py</code></td><td>—</td><td>Base class: logit processor, embedding tie, hidden state capture</td></tr>
+</table>
+
+<h3 id="layers">layers/</h3>
+<p>Reusable neural network building blocks, each routing through <code>tokenspeed-kernel</code> for the best available backend.</p>
+<table>
+  <tr><th>Path</th><th>Purpose</th></tr>
+  <tr><td><code>linear.py</code></td><td>Column/Row parallel linear with quantization (int8, fp8, gptq, awq…). Largest file.</td></tr>
+  <tr><td><code>attention/registry.py</code></td><td>Instantiates attention backend; allocates KV pool; exposes <code>create_attn_components()</code></td></tr>
+  <tr><td><code>attention/backends/</code></td><td>Backend adapters: FlashAttention, FlashInfer, FlashMLA, tokenspeed-MLA, TRT-LLM MLA</td></tr>
+  <tr><td><code>attention/kv_cache/</code></td><td>MHA / MLA KV pool implementations; paged memory management</td></tr>
+  <tr><td><code>attention/configs/</code></td><td>MLA config (kv_lora_rank, qk_rope_head_dim, nope_head_dim, v_head_dim)</td></tr>
+  <tr><td><code>layernorm.py</code></td><td>RMSNorm with optional fused allreduce; GemmaRMSNorm; PDL-gated kernels</td></tr>
+  <tr><td><code>rotary_embedding.py</code></td><td>RoPE variants (YaRN, LongRoPE, linear scaling, multi-LoRA batching)</td></tr>
+  <tr><td><code>paged_attention.py</code></td><td>Thin wrapper calling the selected attention backend per forward pass</td></tr>
+  <tr><td><code>moe/</code></td><td>Expert routing (top-k, noaux_tc), dispatch, AllGather, DeepEP integration</td></tr>
+  <tr><td><code>quantization/</code></td><td>Per-tensor, per-token-head, gptq, awq, fp8 schemes; dequant kernels</td></tr>
+  <tr><td><code>vocab_parallel_embedding.py</code></td><td>Sharded embedding tables; LoRA embedding placement</td></tr>
+  <tr><td><code>logits_processor.py</code></td><td>Top-k, top-p, repetition penalty, grammar masking applied to logits</td></tr>
+</table>
+
+<h3 id="cache">cache/</h3>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>prefix_cache.py</code></td><td>Python-side radix-tree prefix cache; <code>evictable_leaves</code> set; O(1) leaf delete</td></tr>
+  <tr><td><code>allocator.py</code></td><td>Page-granularity KV allocator; tracks <code>req_to_page</code>, free/used pages</td></tr>
+  <tr><td><code>kv_cache_host.py</code></td><td>CPU-pinned host KV staging (L2 cache); host↔device transfer helpers</td></tr>
+  <tr><td><code>evict_policy.py</code></td><td>LRU, LFU, FIFO, MRU, FILO, Priority eviction strategies</td></tr>
+  <tr><td><code>kvstore_controller.py</code></td><td>Coordinates device↔host↔storage eviction and prefetch</td></tr>
+  <tr><td><code>executor/memory_executor.py</code></td><td>Top-level cache executor: wires device + host + storage tiers</td></tr>
+  <tr><td><code>executor/host_executor.py</code></td><td>Async host↔device transfer with priority streams</td></tr>
+  <tr><td><code>storage/</code></td><td>Pluggable L3 storage (Mooncake, disk); <code>BackendFactory</code></td></tr>
+</table>
+
+<h3 id="entrypoints">entrypoints/</h3>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>engine.py</code></td><td><code>Engine</code> class: in-process facade; <code>generate()</code>, <code>load_lora_adapter()</code>, weight updates</td></tr>
+  <tr><td><code>engine_base.py</code></td><td>Abstract base: <code>generate()</code>, <code>flush_cache()</code>, <code>load_lora_adapter()</code>…</td></tr>
+  <tr><td><code>http_server.py</code></td><td>FastAPI app; mounts OpenAI routes; middleware (auth, metrics)</td></tr>
+  <tr><td><code>openai/protocol.py</code></td><td>Pydantic models for <code>CompletionRequest</code> and <code>ChatCompletionRequest</code></td></tr>
+  <tr><td><code>openai/serving_chat.py</code></td><td>Chat completion handler: applies chat template, calls <code>GenerateReqInput</code></td></tr>
+  <tr><td><code>openai/serving_completions.py</code></td><td>Completion handler: prompt encoding, logprob extraction</td></tr>
+  <tr><td><code>engine/run_event_loop.py</code></td><td>Subprocess entry point for the scheduler worker process</td></tr>
+</table>
+
+<!-- KERNEL -->
+<h2 id="kernel">tokenspeed-kernel — <code>tokenspeed-kernel/python/tokenspeed_kernel/</code></h2>
+<p>Pip-installable kernel library. Operators are registered with capability metadata; <code>select_kernel()</code> picks the best available backend at runtime.</p>
+
+<h3>Core Infrastructure</h3>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>__init__.py</code></td><td>Public API: <code>mha_prefill</code>, <code>mha_decode</code>, <code>mm</code>, <code>moe_fused</code>, <code>rmsnorm</code>, …</td></tr>
+  <tr><td><code>registry.py</code></td><td><code>@register_kernel</code> decorator; stores backends in a capability-indexed registry</td></tr>
+  <tr><td><code>selection.py</code></td><td><code>select_kernel(family, …)</code>: filter by capability/dtype/shape, rank by priority band</td></tr>
+  <tr><td><code>platform.py</code></td><td>Detects GPU arch (SM80/SM90/…), CUDA version, vendor</td></tr>
+  <tr><td><code>_triton.py</code></td><td>Single import for all Triton/Triton-fork usage (avoids duplicate loads)</td></tr>
+</table>
+
+<h3 id="kernel-selection">Kernel Selection Priority</h3>
+<div class="flow">
+  <div class="fbox a">select_kernel(family, dtype, shapes)</div><span class="farrow">→</span>
+  <div class="fbox">Filter by GPU capability + dtype support</div><span class="farrow">→</span>
+  <div class="fbox o">Rank by priority band</div>
+</div>
+<pre style="margin-top:8px">Priority bands (highest → lowest):
+  1.  Platform-matched  (flash_mla for Blackwell MLA decode)
+  2.  JIT-compiled      (CuTe DSL, Gluon)
+  3.  Triton            (portable, auto-tuned)
+  4.  Vendor libraries  (FlashAttention, FlashInfer, TRT-LLM)
+  5.  Reference         (PyTorch — correctness baseline)</pre>
+
+<h3>Operation Families (<code>ops/</code>)</h3>
+<table>
+  <tr><th>Family</th><th>Backends</th><th>Usage</th></tr>
+  <tr><td><code>attention/</code></td><td>triton, flash_attn, flashinfer, flash_mla, tokenspeed_mla</td><td>MHA + MLA prefill/decode</td></tr>
+  <tr><td><code>gemm/</code></td><td>triton, trtllm, flashinfer, deep_gemm</td><td>Weight matmuls, quantized GEMM</td></tr>
+  <tr><td><code>moe/</code></td><td>triton, cuda, deepep, flashinfer, trtllm</td><td>Expert dispatch, fused gate+up+down</td></tr>
+  <tr><td><code>layernorm/</code></td><td>triton, cuda, flashinfer</td><td>RMSNorm, fused add+norm</td></tr>
+  <tr><td><code>quantization/</code></td><td>triton, cuda, flashinfer, trtllm</td><td>Per-tensor/per-token quant/dequant</td></tr>
+  <tr><td><code>communication/</code></td><td>nccl, iris, triton, trtllm, flashinfer</td><td>AllReduce, ReduceScatter, AllGather</td></tr>
+  <tr><td><code>sampling/</code></td><td>cuda, flashinfer</td><td>Top-k, top-p sampling</td></tr>
+  <tr><td><code>activation/</code></td><td>cuda, flashinfer</td><td>SiGLU, GELU, SwiGLU</td></tr>
+  <tr><td><code>embedding/</code></td><td>triton, cuda, flashinfer</td><td>Token embedding lookup</td></tr>
+</table>
+
+<!-- MLA -->
+<h2 id="mla">tokenspeed-mla — <code>tokenspeed-mla/python/tokenspeed_mla/</code></h2>
+<p>Blackwell-optimised MLA kernels using NVIDIA CuTe DSL with JIT compilation and optional AOT binary backend.</p>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>mla_prefill.py</code></td><td>Varlen ragged prefill; CuTe DSL JIT with compile-cache; causal mask; PDL support</td></tr>
+  <tr><td><code>mla_decode_fp16.py</code></td><td>Split-KV decode with FP16 accumulation; auto-sized workspace</td></tr>
+  <tr><td><code>mla_decode_fp8.py</code></td><td>FP8-quantized decode → BF16 output for numerical stability</td></tr>
+  <tr><td><code>mla_kv_pack_quantize_fp8.py</code></td><td>Fused KV packing + FP8 quantisation kernel</td></tr>
+  <tr><td><code>fmha.py</code></td><td>FMHA wrapper; dispatches to AOT binary or CuTe JIT path</td></tr>
+  <tr><td><code>mla_helpers.py</code></td><td>MLA math helpers: head-dim splitting, nope/rope decomposition</td></tr>
+</table>
+
+<!-- SCHEDULER -->
+<h2 id="scheduler">tokenspeed-scheduler — <code>tokenspeed-scheduler/csrc/</code></h2>
+<p>C++20 scheduler. The Python runtime calls it via nanobind bindings. All request state transitions happen here.</p>
+
+<h3>scheduler/</h3>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>scheduler.h/.cpp</code></td><td>Main <code>Scheduler</code> class: <code>SubmitRequests()</code>, <code>NextExecutionPlan()</code>, <code>Advance(event)</code></td></tr>
+  <tr><td><code>request.h/.cpp</code></td><td><code>Request</code>: holds token container, FSM state, KV refs, LoRA ID</td></tr>
+  <tr><td><code>request_spec.h</code></td><td>Input spec: <code>request_id</code>, <code>tokens</code>, <code>rolling_hashes</code>, <code>lora_id</code></td></tr>
+  <tr><td><code>execution_plan.h</code></td><td><code>FlatForwardOperation</code>: request IDs, input lengths, prefix lens, page IDs</td></tr>
+  <tr><td><code>operations/forward.cpp</code></td><td><code>schedulePrefillFirstChunk()</code>, <code>scheduleDecode()</code>; passes <code>lora_id</code> to all Match/Insert calls</td></tr>
+  <tr><td><code>operations/cache.cpp</code></td><td>KV write-back, load-back, prefetch operations</td></tr>
+  <tr><td><code>outside_event_handler.cpp</code></td><td>Handles <code>FinishEvent</code>, PD events from outside the main scheduling loop</td></tr>
+</table>
+
+<h3 id="fsm">fsm/ — Finite State Machine</h3>
+<p>Each request transitions through states; events drive transitions and trigger cache/allocation side-effects.</p>
+<pre>Submitted → Prefilling → PrefillDone → Decoding → Draining → Finished
+                                     ↘ Retracting → Retracted
+                         (optional)   Prefetching → PrefetchDone
+                                      WritingBack
+                                      Aborting</pre>
+<table>
+  <tr><th>File</th><th>Purpose</th></tr>
+  <tr><td><code>forward_states.h</code></td><td>State data structs: prefill window, KV allocator, decode token count</td></tr>
+  <tr><td><code>forward_events.h/.cpp</code></td><td><code>SchedulePrefillFirstChunkEvent</code>, <code>FinishEvent</code>, <code>ScheduleDecodeEvent</code>; inject <code>lora_id</code></td></tr>
+  <tr><td><code>cache_states.h</code></td><td>Prefetch / write-back states</td></tr>
+  <tr><td><code>cache_events.h/.cpp</code></td><td>L2 write-back, load-back, L3 backup events</td></tr>
+  <tr><td><code>pd_states.h / pd_events.h/.cpp</code></td><td>Prefill-decode disaggregation states and transfer events</td></tr>
+</table>
+
+<h3 id="resources">resource/ — KV Cache &amp; Memory</h3>
+<table>
+  <tr><th>Path</th><th>Purpose</th></tr>
+  <tr><td><code>kv_prefix_cache/kv_prefix_cache.h/.cpp</code></td><td>Radix-tree prefix cache; <code>Match(tokens, lora_id)</code>; <code>Insert(tokens, lora_id)</code>; LoRA virtual roots</td></tr>
+  <tr><td><code>kv_prefix_cache/eviction.h</code></td><td><code>ResourceManager&lt;RType&gt;::Evict()</code>; persistent <code>lru_leaves_</code> set; O(k log N)</td></tr>
+  <tr><td><code>radix_tree/radix_tree.h/.cpp</code></td><td>Compressed trie; <code>WalkDownUtilMismatch()</code>; <code>splitChild()</code>; <code>PruneEmptyByNode()</code></td></tr>
+  <tr><td><code>radix_tree/tree_resource.h</code></td><td><code>NodeResource&lt;RType&gt;</code>: pages, ref_count, on_evictable callback (exact LRU)</td></tr>
+  <tr><td><code>radix_tree/tree_node.h/.cpp</code></td><td>Tree node: tokens, depth, children map, device/host resource pointers, Touch()</td></tr>
+  <tr><td><code>hybrid_prefix_cache/hybrid_prefix_cache.h/.cpp</code></td><td>Wraps KV cache + Mamba state cache; <code>Match(tokens, lora_id)</code></td></tr>
+  <tr><td><code>allocator/page_allocator.h/.cpp</code></td><td>Fixed-pool page allocator; free-list; <code>Allocate(n)</code> / <code>Free(pages)</code></td></tr>
+  <tr><td><code>allocator/kv_allocator.h/.cpp</code></td><td>Paged KV allocator; tracks req→page mapping</td></tr>
+  <tr><td><code>allocator/mamba_chunk_allocator.h/.cpp</code></td><td>Fixed-slot Mamba state allocator</td></tr>
+</table>
+
+<!-- LORA INTEGRATION -->
+<h2 id="lora">LoRA Integration</h2>
+<p>Added in <code>feat/lora-adapter-serving</code>. Touches all four packages.</p>
+<table>
+  <tr><th>Package</th><th>What was added</th></tr>
+  <tr><td>python/lora/</td><td><code>LoraConfig</code>, <code>LoraRegistry</code>, <code>LoraManager</code> (GPU pool + LRU eviction + TP-aware matmul)</td></tr>
+  <tr><td>python/models/qwen3.py</td><td><code>apply_qkv_lora()</code> after qkv_proj; <code>apply_o_lora()</code> after o_proj; pure-PyTorch <code>_rms_norm</code> for eager mode</td></tr>
+  <tr><td>python/execution/context.py</td><td><code>lora_weight_indices</code>, <code>lora_scalings</code>, <code>lora_manager</code> fields on <code>ForwardContext</code></td></tr>
+  <tr><td>python/execution/model_executor.py</td><td>Per-token weight_indices expansion via <code>repeat_interleave(w_idx, input_lengths)</code></td></tr>
+  <tr><td>python/entrypoints/openai/protocol.py</td><td>Request schemas; LoRA selection uses loaded adapter names where exposed.</td></tr>
+  <tr><td>tokenspeed-scheduler/csrc/</td><td><code>RequestSpec.lora_id</code>; <code>KVPrefixCache::Match(tokens, lora_id)</code>; virtual root per adapter; <code>namespace_depth_offset</code></td></tr>
+</table>
+
+<!-- TESTS -->
+<h2 id="tests">Tests</h2>
+<div class="note g">
+  <strong>120 C++ scheduler tests &nbsp;·&nbsp; 48 Python scheduler tests &nbsp;·&nbsp; 40+ runtime integration tests</strong>
+</div>
+<table>
+  <tr><th>Location</th><th>Coverage</th></tr>
+  <tr><td><code>tokenspeed-scheduler/tests/cpp/</code></td><td>Scheduling FSM, page lifecycle, eviction, prefix cache, Mamba, PD disagg, LoRA isolation</td></tr>
+  <tr><td><code>tokenspeed-scheduler/python/tests/</code></td><td>Python scheduler API, FSM transitions, prefill/decode batching, occupied pages, PD events</td></tr>
+  <tr><td><code>test/runtime/cache/</code></td><td>MLA KV buffer, prefix cache invariants (evictable_leaves, cascade eviction)</td></tr>
+  <tr><td><code>test/runtime/lora/</code></td><td>LoraRegistry capacity, pinning, scaling; dynamic load/unload end-to-end</td></tr>
+  <tr><td><code>test/runtime/models/</code></td><td>DeepSeek V4, Kimi, multimodal model parity</td></tr>
+  <tr><td><code>tokenspeed-kernel/test/</code></td><td>Kernel numerics: attention, GEMM, quantization tolerance verification</td></tr>
+  <tr><td><code>benchmark/</code></td><td>C++ eviction timing, LoRA batch isolation proof, decode-path cache microbenchmark</td></tr>
+</table>
+
+<!-- FILE TREE -->
+<h2 id="filetree">Full Directory Tree</h2>
+<details>
+  <summary>Show complete tree</summary>
+  <div class="inner">
+<div class="tree"><pre>
+tokenspeed/
+├── <span class="dir">python/</span>
+│   ├── pyproject.toml
+│   └── <span class="dir">tokenspeed/</span>
+│       ├── cli.py                       <span class="comment"># tokenspeed serve / bench / env</span>
+│       ├── bench.py                     <span class="comment"># Online serving benchmark</span>
+│       └── <span class="dir">runtime/</span>
+│           ├── <span class="dir hl">engine/</span>              <span class="comment"># Async LLM, request lifecycle</span>
+│           │   ├── async_llm.py
+│           │   ├── event_loop.py
+│           │   ├── io_struct.py
+│           │   ├── request_handler.py
+│           │   ├── input_processor.py
+│           │   ├── output_processor.py
+│           │   ├── schedule_batch.py
+│           │   ├── scheduler_utils.py
+│           │   ├── scheduler_control_client.py
+│           │   └── core_client.py
+│           ├── <span class="dir hl">execution/</span>           <span class="comment"># GPU forward pass</span>
+│           │   ├── model_runner.py
+│           │   ├── model_executor.py
+│           │   ├── cuda_graph_wrapper.py
+│           │   ├── context.py
+│           │   ├── forward_batch_info.py
+│           │   ├── input_buffer.py
+│           │   ├── weight_loader.py
+│           │   ├── factory.py
+│           │   └── drafter/eagle.py
+│           ├── <span class="dir hl">models/</span>              <span class="comment"># Architecture implementations</span>
+│           │   ├── registry.py
+│           │   ├── qwen3.py
+│           │   ├── qwen3_5.py
+│           │   ├── deepseek_v3.py
+│           │   ├── deepseek_v4.py
+│           │   ├── llama.py
+│           │   ├── minimax_m2.py
+│           │   └── base/causal_lm.py
+│           ├── <span class="dir hl">layers/</span>              <span class="comment"># Reusable neural net layers</span>
+│           │   ├── linear.py
+│           │   ├── layernorm.py
+│           │   ├── rotary_embedding.py
+│           │   ├── paged_attention.py
+│           │   ├── logits_processor.py
+│           │   ├── vocab_parallel_embedding.py
+│           │   ├── <span class="dir">attention/</span>       <span class="comment"># Backends: FlashAttn, FlashInfer, MLA, TRT-LLM</span>
+│           │   ├── <span class="dir">moe/</span>             <span class="comment"># Expert routing, dispatch</span>
+│           │   └── <span class="dir">quantization/</span>    <span class="comment"># int8, fp8, gptq, awq</span>
+│           ├── <span class="dir hl">cache/</span>               <span class="comment"># KV cache management</span>
+│           │   ├── prefix_cache.py
+│           │   ├── allocator.py
+│           │   ├── kv_cache_host.py
+│           │   ├── evict_policy.py
+│           │   ├── <span class="dir">executor/</span>        <span class="comment"># memory, host, storage executors</span>
+│           │   └── <span class="dir">storage/</span>         <span class="comment"># mooncake_store, disk backend</span>
+│           ├── <span class="dir hl">lora/</span>                <span class="comment"># LoRA adapter serving (new)</span>
+│           │   ├── lora_config.py
+│           │   ├── lora_registry.py
+│           │   └── lora_manager.py
+│           ├── <span class="dir hl">entrypoints/</span>         <span class="comment"># HTTP server + Engine API</span>
+│           │   ├── engine.py
+│           │   ├── engine_base.py
+│           │   ├── http_server.py
+│           │   └── <span class="dir">openai/</span>          <span class="comment"># Protocol, serving_chat, serving_completions</span>
+│           ├── <span class="dir">configs/</span>             <span class="comment"># Model + device configs</span>
+│           ├── <span class="dir">distributed/</span>         <span class="comment"># TP/DP mapping, comm ops</span>
+│           ├── <span class="dir">sampling/</span>            <span class="comment"># Sampling backends</span>
+│           ├── <span class="dir">grammar/</span>             <span class="comment"># Structured generation</span>
+│           ├── <span class="dir">pd/</span>                  <span class="comment"># Prefill-decode disagg</span>
+│           ├── <span class="dir">model_loader/</span>        <span class="comment"># Weight loading</span>
+│           ├── <span class="dir">metrics/</span>             <span class="comment"># Observability</span>
+│           └── <span class="dir">utils/</span>               <span class="comment"># Logging, env, common helpers</span>
+│
+├── <span class="dir">tokenspeed-kernel/</span>
+│   └── <span class="dir">python/tokenspeed_kernel/</span>
+│       ├── __init__.py                  <span class="comment"># Public API</span>
+│       ├── registry.py                  <span class="comment"># @register_kernel</span>
+│       ├── selection.py                 <span class="comment"># select_kernel()</span>
+│       ├── platform.py
+│       ├── <span class="dir hl">ops/</span>                     <span class="comment"># Backend implementations</span>
+│       │   ├── <span class="dir">attention/</span>
+│       │   ├── <span class="dir">gemm/</span>
+│       │   ├── <span class="dir">moe/</span>
+│       │   ├── <span class="dir">layernorm/</span>
+│       │   ├── <span class="dir">quantization/</span>
+│       │   ├── <span class="dir">communication/</span>
+│       │   └── <span class="dir">sampling/</span>
+│       ├── <span class="dir">thirdparty/</span>              <span class="comment"># Vendored CUDA/Triton kernels</span>
+│       └── <span class="dir">numerics/</span>               <span class="comment"># Kernel correctness verification</span>
+│
+├── <span class="dir">tokenspeed-mla/</span>
+│   └── <span class="dir">python/tokenspeed_mla/</span>
+│       ├── mla_prefill.py               <span class="comment"># CuTe DSL JIT prefill</span>
+│       ├── mla_decode_fp16.py
+│       ├── mla_decode_fp8.py
+│       ├── mla_kv_pack_quantize_fp8.py
+│       └── fmha.py
+│
+├── <span class="dir">tokenspeed-scheduler/</span>
+│   ├── <span class="dir">csrc/</span>
+│   │   ├── <span class="dir hl">scheduler/</span>               <span class="comment"># Scheduler core + FSM</span>
+│   │   │   ├── scheduler.h/.cpp
+│   │   │   ├── request.h/.cpp
+│   │   │   ├── request_spec.h
+│   │   │   └── <span class="dir">operations/</span>
+│   │   ├── <span class="dir hl">fsm/</span>                     <span class="comment"># State machine events/states</span>
+│   │   │   ├── forward_states.h
+│   │   │   ├── forward_events.h/.cpp
+│   │   │   ├── cache_events.h/.cpp
+│   │   │   └── pd_events.h/.cpp
+│   │   ├── <span class="dir hl">resource/</span>               <span class="comment"># KV cache + allocators</span>
+│   │   │   ├── <span class="dir">kv_prefix_cache/</span>     <span class="comment"># Radix tree + LoRA namespacing</span>
+│   │   │   ├── <span class="dir">radix_tree/</span>          <span class="comment"># Compressed prefix tree</span>
+│   │   │   ├── <span class="dir">allocator/</span>           <span class="comment"># Page allocators</span>
+│   │   │   └── <span class="dir">hybrid_prefix_cache/</span> <span class="comment"># L1+L2+Mamba</span>
+│   │   └── <span class="dir">core/</span>                    <span class="comment"># TokenContainer</span>
+│   ├── <span class="dir">bindings/</span>
+│   │   └── python_module.cpp            <span class="comment"># nanobind Python bindings</span>
+│   └── <span class="dir">tests/cpp/</span>                   <span class="comment"># GTest unit tests</span>
+│
+├── <span class="dir">benchmark/</span>
+│   ├── bench_cpp_eviction.py
+│   ├── bench_eviction_ts.py
+│   ├── bench_decode_cache.py
+│   ├── test_lora_dynamic.py
+│   └── test_lora_batch.py
+│
+├── <span class="dir">test/</span>
+│   ├── runners.py
+│   ├── <span class="dir">runtime/</span>                     <span class="comment"># Integration tests</span>
+│   │   ├── <span class="dir">cache/</span>
+│   │   ├── <span class="dir">lora/</span>
+│   │   └── <span class="dir">models/</span>
+│   └── <span class="dir">ci_system/</span>
+│
+└── <span class="dir">docs/</span>
+    ├── lora_implementation.html
+    └── tokenspeed_structure.html        <span class="comment"># ← this file</span>
+</pre></div>
+  </div>
+</details>
+
+</main>
+</div>
+</body>
+</html>
diff --git a/profile_expand.py b/profile_expand.py
new file mode 100644
index 000000000..b506f3798
--- /dev/null
+++ b/profile_expand.py
@@ -0,0 +1,274 @@
+"""Profile the decode expand kernel: bandwidth, FLOP utilization, config sweep.
+
+Identifies the bottleneck (instruction-bound vs memory-bound) and sweeps
+BLOCK_K up to 64/128 — larger BLOCK_K eliminates the inner K-loop entirely
+for rank=64/128 adapters, removing loop overhead and k-mask instructions.
+
+Usage:
+    python profile_expand.py
+"""
+
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+import torch
+import triton
+import triton.language as tl
+
+sys.path.insert(0, str(Path(__file__).parent / "tokenspeed-kernel" / "python"))
+
+from tokenspeed_kernel._triton import triton as tok_triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+
+# ── minimal batch-info stub ────────────────────────────────────────────────────
+
+
+@dataclass
+class BI:
+    bs: int
+    max_len: int = 1
+    seg_lens: torch.Tensor = None
+    seg_indptr: torch.Tensor = None
+    weight_indices: torch.Tensor = None
+    lora_ranks: torch.Tensor = None
+    scalings: torch.Tensor = None
+    permutation: torch.Tensor = None
+
+    def __post_init__(self):
+        d = "cuda"
+        self.seg_lens = torch.ones(self.bs, dtype=torch.int32, device=d)
+        self.seg_indptr = torch.arange(self.bs + 1, dtype=torch.int32, device=d)
+        self.weight_indices = torch.ones(self.bs, dtype=torch.int32, device=d)
+        self.lora_ranks = torch.tensor([0, self.bs], dtype=torch.int32, device=d)
+        self.scalings = torch.tensor([0.0, 1.0], dtype=torch.float32, device=d)
+
+
+# ── inline expand kernel with configurable BLOCK_K ────────────────────────────
+
+
+@triton.jit
+def _expand_probe(
+    x,
+    weights,
+    output,
+    N,
+    K,
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    scalings,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    num_warps: tl.constexpr,
+):
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    seg_start = tl.load(seg_indptr + batch_id)
+    scaling = tl.load(scalings + w_index)
+    K_real = tl.minimum(K, rank)
+
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.max_contiguous(tl.arange(0, BLOCK_K), BLOCK_K)
+
+    x_ptrs = (
+        x
+        + (seg_start + s_offset)[:, None] * x_stride_0
+        + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    s_mask = s_offset[:, None] < seg_len
+    n_mask = n_offset[None, :] < N
+    partial = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+
+    for k in range(0, tl.cdiv(K_real, BLOCK_K)):
+        k_rem = K_real - k * BLOCK_K
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_mask & (k_offset[None, :] < k_rem),
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < k_rem) & n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial *= scaling
+    partial = partial.to(x.dtype.element_ty)
+    out_ptr = (
+        output
+        + (seg_start + s_offset)[:, None] * output_stride_0
+        + n_offset[None, :] * output_stride_1
+    )
+    out_mask = s_mask & n_mask
+    partial += tl.load(out_ptr, mask=out_mask, other=0.0)
+    tl.store(out_ptr, partial, mask=out_mask)
+
+
+def run_probe(x, weights, output, bi, BLOCK_S, BLOCK_N, BLOCK_K, num_warps, num_stages):
+    N, K = weights.shape[-2], weights.shape[-1]
+    max_len = bi.max_len
+    grid = (triton.cdiv(max_len, BLOCK_S) * triton.cdiv(N, BLOCK_N), bi.bs)
+    _expand_probe[grid](
+        x,
+        weights,
+        output,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        bi.seg_lens,
+        bi.seg_indptr,
+        bi.weight_indices,
+        bi.lora_ranks,
+        bi.scalings,
+        BLOCK_S=BLOCK_S,
+        BLOCK_N=BLOCK_N,
+        BLOCK_K=BLOCK_K,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+
+# ── metrics ────────────────────────────────────────────────────────────────────
+
+
+def theoretical_bandwidth_gb(n_segs, N, K):
+    """Min memory read in GB for one expand call."""
+    w_bytes = n_segs * N * K * 2  # weights: n_segs adapter tiles
+    x_bytes = n_segs * K * 2  # x: 1 row per segment
+    out_bytes = n_segs * N * 2 * 2  # output read+write
+    return (w_bytes + x_bytes + out_bytes) / 1e9
+
+
+def flops(n_segs, N, K):
+    return n_segs * 2 * N * K  # 2 × N × K per token
+
+
+def bench_cfg(fn, warmup=15, rep=200):
+    return triton.testing.do_bench(fn, warmup=warmup, rep=rep) * 1e-3  # → seconds
+
+
+# ── main sweep ─────────────────────────────────────────────────────────────────
+
+
+def sweep(n_segs: int, rank: int, N: int, label: str) -> None:
+    dev, dt = "cuda", torch.bfloat16
+    bi = BI(bs=n_segs)
+    bi.lora_ranks = torch.tensor([0, rank], dtype=torch.int32, device=dev)
+    x = torch.randn(n_segs, rank, device=dev, dtype=dt)
+    w = torch.randn(2, N, rank, device=dev, dtype=dt)
+    o = torch.zeros(n_segs, N, device=dev, dtype=dt)
+
+    h100_bw = 3.35e12  # bytes/s
+    h100_tflops = 2e15  # bf16 tensor core peak
+
+    bw_floor = theoretical_bandwidth_gb(n_segs, N, rank) / h100_bw * 1e6  # µs
+    flop_floor = flops(n_segs, N, rank) / h100_tflops * 1e6  # µs
+
+    print(f"\n{'='*72}")
+    print(f"  {label}  n_segs={n_segs}  rank={rank}  N={N}")
+    print(f"  Bandwidth floor: {bw_floor:.1f}µs  |  FLOP floor: {flop_floor:.2f}µs")
+    print(
+        f"  {'BLOCK_S':>7}  {'BLOCK_N':>7}  {'BLOCK_K':>7}  {'warps':>5}  {'stg':>3}  {'µs':>8}  {'BW%':>6}  {'K-iters':>8}"
+    )
+    print(f"  {'-'*66}")
+
+    configs = [
+        # (BLOCK_S, BLOCK_N, BLOCK_K, num_warps, num_stages)
+        # Current best from autotune:
+        (16, 64, 16, 8, 3),
+        (16, 64, 32, 8, 3),
+        # Larger BLOCK_K — KEY EXPERIMENT:
+        #   rank=64 → BLOCK_K=64: 1 K-iteration, no k-mask, no loop overhead
+        #   rank=128 → BLOCK_K=128: same
+        (16, 64, 64, 8, 1),
+        (16, 64, 64, 4, 1),
+        (16, 64, 64, 8, 2),
+        (16, 128, 64, 4, 1),
+        (16, 128, 64, 8, 1),
+        (16, 64, 128, 8, 1) if rank >= 128 else None,
+        (16, 128, 128, 4, 1) if rank >= 128 else None,
+        # Wider BLOCK_N to reduce CTA count:
+        (16, 128, 16, 4, 2),
+        (16, 128, 32, 4, 2),
+        (32, 64, 16, 4, 2),
+        (32, 64, 32, 4, 2),
+    ]
+
+    best_t = float("inf")
+    best_cfg = None
+
+    for cfg in configs:
+        if cfg is None:
+            continue
+        BS, BN, BK, nw, ns = cfg
+        if BK > rank:  # BLOCK_K larger than actual K makes no sense
+            continue
+        try:
+            t_s = bench_cfg(lambda: run_probe(x, w, o.clone(), bi, BS, BN, BK, nw, ns))
+            t_us = t_s * 1e6
+            bw_pct = bw_floor / t_us * 100
+            k_iters = (rank + BK - 1) // BK
+            marker = " ←" if t_us < best_t else ""
+            if t_us < best_t:
+                best_t = t_us
+                best_cfg = cfg
+            print(
+                f"  {BS:>7}  {BN:>7}  {BK:>7}  {nw:>5}  {ns:>3}  {t_us:>7.1f}µ  {bw_pct:>5.1f}%  {k_iters:>8}{marker}"
+            )
+        except Exception as e:
+            print(f"  {BS:>7}  {BN:>7}  {BK:>7}  {nw:>5}  {ns:>3}  FAILED: {e}")
+
+    print(
+        f"\n  Best: BLOCK_S={best_cfg[0]} BLOCK_N={best_cfg[1]} BLOCK_K={best_cfg[2]} warps={best_cfg[3]} stages={best_cfg[4]} → {best_t:.1f}µs"
+    )
+    print(
+        f"  Current autotune: {bench_cfg(lambda: run_probe(x, w, o.clone(), bi, 16, 64, 16, 8, 3))*1e6:.1f}µs"
+    )
+
+
+if __name__ == "__main__":
+    for n_segs in (16, 32, 64):
+        sweep(n_segs=n_segs, rank=64, N=4096, label="o_proj rank=64")
+    sweep(n_segs=32, rank=128, N=4096, label="o_proj rank=128")
+    sweep(n_segs=32, rank=16, N=4096, label="o_proj rank=16")
diff --git a/python/tokenspeed/bench.py b/python/tokenspeed/bench.py
index c9a61f3ec..08adba1be 100755
--- a/python/tokenspeed/bench.py
+++ b/python/tokenspeed/bench.py
@@ -776,7 +776,7 @@ def get_lora_request(
         self,
         index: int,
         max_loras: int | None = None,
-        lora_path: str | None = None,
+        lora_name: str | None = None,
         lora_assignment: str = "random",
     ) -> None:
         return None
@@ -821,7 +821,7 @@ def sample(
         output_len: int = DEFAULT_OUTPUT_LEN,
         batchsize: int = 1,
         max_loras: int | None = None,
-        lora_path: str | None = None,
+        lora_name: str | None = None,
         lora_assignment: str = "random",
         **kwargs,
     ) -> list[SampleRequest]:
@@ -879,7 +879,7 @@ def sample(
             lora_req = self.get_lora_request(
                 index=i,
                 max_loras=max_loras,
-                lora_path=lora_path,
+                lora_name=lora_name,
                 lora_assignment=lora_assignment,
             )
             requests.append(
diff --git a/python/tokenspeed/runtime/engine/async_llm.py b/python/tokenspeed/runtime/engine/async_llm.py
index eaa0173f2..def2892cf 100755
--- a/python/tokenspeed/runtime/engine/async_llm.py
+++ b/python/tokenspeed/runtime/engine/async_llm.py
@@ -143,6 +143,8 @@ def __init__(
         # Read model args
         self.model_path = server_args.model
         self.served_model_name = server_args.served_model_name
+        # LoRA adapter name → integer lora_id (populated by load_lora_adapter).
+        self._lora_name_to_id: dict[str, int] = {}
         self.model_config = ModelConfig(
             server_args.model,
             trust_remote_code=server_args.trust_remote_code,
diff --git a/python/tokenspeed/runtime/engine/event_loop.py b/python/tokenspeed/runtime/engine/event_loop.py
index ae1ceab44..3b857d079 100644
--- a/python/tokenspeed/runtime/engine/event_loop.py
+++ b/python/tokenspeed/runtime/engine/event_loop.py
@@ -19,6 +19,7 @@
 # SOFTWARE.
 
 import faulthandler
+import os
 import signal
 import time
 from collections import OrderedDict
@@ -50,7 +51,6 @@
     cache_sync_debug_enabled,
     make_config,
     pool_to_paged_cache_groups,
-    pool_to_prefix_cache_adjunct_spec,
     pop_common_cache_event_payloads,
 )
 from tokenspeed.runtime.execution.distributed_initializer import (
@@ -299,8 +299,11 @@ def __init__(
                 f"(ratio={server_args.mamba_full_memory_ratio})."
             )
 
-        # Adjunct enabled only when pool opts in AND prefix-caching switch is on.
+        enable_mixed_prefill_decode = (
+            server_args.enable_mixed_batch and server_args.speculative_algorithm is None
+        )
 
+        # Adjunct enabled only when pool opts in AND prefix-caching switch is on.
         paged_cache_groups = pool_to_paged_cache_groups(token_to_kv_pool)
         prefix_cache_adjunct = None
         required_groups = token_to_kv_pool.prefix_cache_required_group_ids
@@ -329,7 +332,7 @@ def __init__(
             enable_mamba_l2=server_args.enable_mamba_l2,
             mamba_l2_host_slots=mamba_l2_host_slots,
             paged_cache_groups=paged_cache_groups,
-            enable_mixed_prefill_decode=server_args.enable_mixed_batch,
+            enable_mixed_prefill_decode=enable_mixed_prefill_decode,
             prefix_cache_adjunct=prefix_cache_adjunct,
         )
         logger.info(
@@ -381,6 +384,8 @@ def __init__(
             send_func=self.send_to_tokenizer,
             get_load_fn=self._get_load,
             architectures=self.model_config.hf_config.architectures,
+            load_lora_fn=self.load_lora_adapter,
+            unload_lora_fn=self.unload_lora_adapter,
         )
 
         self.output_processor = OutputProcesser(
@@ -436,6 +441,60 @@ def __init__(
         else:
             self.pd_kv_transfer = None
 
+        # ── LoRA ─────────────────────────────────────────────────────────────
+        self._lora_manager = None  # LoraManager (lazy init)
+        self._lora_name_to_id: dict[str, int] = {}  # name → integer lora_id
+        self._request_lora_ids: dict[str, int] = {}  # rid → lora_id
+
+        if server_args.enable_lora:
+            self._init_lora_manager()
+
+    def _init_lora_manager(self) -> None:
+        """Bind to the LoraManager owned by the model executor.
+
+        The model executor creates the manager during its own ``__init__`` so
+        that the CUDA-graph capture sees a live manager (and bakes the LoRA
+        delta path into the captured graphs).  The event loop only borrows
+        the reference and shares its request-id → lora-id map.
+        """
+        self._lora_manager = self.model_executor.lora_manager
+        if self._lora_manager is None:
+            raise RuntimeError(
+                "Model executor was not configured with --enable-lora; "
+                "cannot initialize LoRA support."
+            )
+        self.model_executor.request_lora_ids = self._request_lora_ids
+        logger.info("LoraManager bound (max_loras=%d)", self.server_args.max_loras)
+
+    def load_lora_adapter(self, lora_name: str, adapter_path: str) -> int:
+        """Load a PEFT LoRA adapter and make it available for serving.
+
+        Returns the integer lora_id assigned to this adapter.
+        """
+        if not self.server_args.enable_lora:
+            raise ValueError(
+                "Server was not started with --enable-lora. "
+                "Restart with --enable-lora to use LoRA adapters."
+            )
+        if self._lora_manager is None:
+            self._init_lora_manager()
+        lora_id = self._lora_manager.load_adapter(lora_name, adapter_path)
+        self._lora_name_to_id[lora_name] = lora_id
+        logger.info("Loaded LoRA adapter '%s' → lora_id=%d", lora_name, lora_id)
+        return lora_id
+
+    def unload_lora_adapter(self, lora_name: str) -> None:
+        """Unload a LoRA adapter and free its GPU slot."""
+        if self._lora_manager is None:
+            raise KeyError(f"No LoRA adapters loaded; '{lora_name}' not found.")
+        lora_id = self._lora_name_to_id.get(lora_name)
+        self._lora_manager.unload_adapter(lora_name)
+        self._lora_name_to_id.pop(lora_name, None)
+        # Proactively evict the KV cache namespace for this adapter so pages
+        # are freed immediately rather than waiting for LRU eviction pressure.
+        if lora_id is not None:
+            self.scheduler.evict_lora_namespace(lora_id)
+
     def _setup_pd_layerwise_transfer(self, interval: int) -> None:
         if not isinstance(self.pd_kv_transfer, DisaggPrefillExecutor):
             return
@@ -838,8 +897,42 @@ def _process_new_requests(self):
                     spec.rolling_hashes = hashes
                     spec.storage_hit_pages = hit_pages
             admitted_specs.append(spec)
+            # Track lora_id per request for forward-pass injection
+            if spec.lora_id != 0:
+                self._request_lora_ids[spec.request_id] = spec.lora_id
+                # Async-prefetch the adapter into the CPU pool so the
+                # disk read is overlapped with the previous forward step
+                # rather than blocking ``prepare_loras`` of the step that
+                # actually consumes it.  No-op when already CPU-resident.
+                if (
+                    self._lora_manager is not None
+                    and os.environ.get("TOKENSPEED_LORA_PREFETCH", "1") == "1"
+                ):
+                    name = self._lora_manager._id_to_name.get(spec.lora_id)
+                    if name is not None:
+                        self._lora_manager.prefetch(name)
 
         if admitted_specs:
+            # Optional ``pack`` policy: cluster admissions by lora_id so
+            # adapter-shared requests batch together at the C++ scheduler.
+            # Reduces GPU/CPU eviction churn under heavy mixed-adapter
+            # traffic (multiple distinct adapters > max_loras).
+            #
+            # Sort is stable: requests for the same adapter keep their
+            # arrival order, base-model (lora_id == 0) requests stay
+            # together at the front (their slot is the no-op sentinel).
+            #
+            # The benchmark in benchmark/test_lora_eviction_latency.py
+            # shows that CPU↔GPU promotion is essentially free; the
+            # only meaningful eviction cost is CPU→disk re-read (~30 ms).
+            # ``pack`` therefore mainly helps when ``working_set >
+            # max_loras_cpu`` and incoming traffic is bursty enough that
+            # multiple cold requests arrive in one event-loop iteration.
+            if (
+                self._lora_manager is not None
+                and self.server_args.lora_scheduling_policy == "pack"
+            ):
+                admitted_specs.sort(key=lambda s: s.lora_id)
             self.scheduler.submit_requests(admitted_specs)
 
     @nvtx_range("loop:commit", color="rapids")
diff --git a/python/tokenspeed/runtime/engine/input_processor.py b/python/tokenspeed/runtime/engine/input_processor.py
index 040ae6675..0e6b8d0d0 100644
--- a/python/tokenspeed/runtime/engine/input_processor.py
+++ b/python/tokenspeed/runtime/engine/input_processor.py
@@ -189,6 +189,7 @@ async def tokenize_one_request(
                 created_time=time.time(),
                 input_multi_ids=obj.input_multi_ids,
                 input_extra_infos=obj.input_extra_infos,
+                lora_id=self._resolve_lora_id(obj),
             )
 
         return TokenizedEmbeddingReqInput(
@@ -198,3 +199,17 @@ async def tokenize_one_request(
             sampling_params,
             created_time=time.time(),
         )
+
+    def _resolve_lora_id(self, obj: "GenerateReqInput") -> int:
+        """Map request LoRA adapter name to an integer lora_id."""
+        lora_name = getattr(obj, "lora_name", None)
+        if lora_name is None:
+            return 0
+        lora_registry: dict = getattr(self.engine, "_lora_name_to_id", {})
+        lora_id = lora_registry.get(lora_name, 0)
+        if lora_id == 0:
+            raise ValueError(
+                f"lora_name={lora_name!r} is not a registered adapter. "
+                "Call load_lora_adapter(name, adapter_path) before using it in a request."
+            )
+        return lora_id
diff --git a/python/tokenspeed/runtime/engine/io_struct.py b/python/tokenspeed/runtime/engine/io_struct.py
index 5782d30c0..e592da5bf 100755
--- a/python/tokenspeed/runtime/engine/io_struct.py
+++ b/python/tokenspeed/runtime/engine/io_struct.py
@@ -136,6 +136,12 @@ class GenerateReqInput:
     bootstrap_port: list[int] | int | None = None
     bootstrap_room: list[int] | int | None = None
 
+    # LoRA adapter to use for this request. Supply the name under which the
+    # adapter was registered via Engine.load_lora_adapter(). None means use the
+    # base model. Requests do not load adapters from disk; adapter filesystem
+    # paths belong to load_lora_adapter().
+    lora_name: list[str | None] | str | None = None
+
     def normalize_batch_and_arguments(self):
         if (
             self.text is None and self.input_ids is None and self.input_embeds is None
@@ -228,6 +234,11 @@ def normalize_batch_and_arguments(self):
                 self.token_ids_logprob = None
             if isinstance(self.input_extra_infos, dict):
                 self.input_extra_infos = [self.input_extra_infos]
+            if isinstance(self.lora_name, list):
+                assert (
+                    len(self.lora_name) == 1
+                ), "lora_name list should have length 1 for single request."
+                self.lora_name = self.lora_name[0]
         else:
             if self.parallel_sample_num == 1:
                 num = self.batch_size
@@ -320,6 +331,15 @@ def normalize_batch_and_arguments(self):
             else:
                 assert self.parallel_sample_num == 1
 
+            if self.lora_name is None:
+                self.lora_name = [None] * num
+            elif not isinstance(self.lora_name, list):
+                self.lora_name = [self.lora_name] * num
+            else:
+                assert (
+                    len(self.lora_name) == num
+                ), "lora_name should be a str or a list of matching length."
+
         # Other checks
         if self.session_params is not None:
             assert isinstance(self.session_params, dict) or isinstance(
@@ -372,6 +392,11 @@ def __getitem__(self, i):
             bootstrap_room=(
                 self.bootstrap_room[i] if self.bootstrap_room is not None else None
             ),
+            lora_name=(
+                self.lora_name[i]
+                if isinstance(self.lora_name, list)
+                else self.lora_name
+            ),
         )
         sub.rid = self.rid[i]
         return sub
@@ -422,6 +447,8 @@ class TokenizedGenerateReqInput:
 
     input_multi_ids: list[list[int]] = None
     input_extra_infos: list[dict] | None = None
+    # Integer lora_id resolved from lora_name (0 = base model)
+    lora_id: int = 0
 
 
 @dataclass
@@ -852,6 +879,30 @@ class RpcReqOutput:
     message: str
 
 
+@dataclass
+class LoadLoraReqInput:
+    lora_name: str
+    adapter_path: str
+
+
+@dataclass
+class LoadLoraReqOutput:
+    success: bool
+    lora_id: int = 0
+    message: str = ""
+
+
+@dataclass
+class UnloadLoraReqInput:
+    lora_name: str
+
+
+@dataclass
+class UnloadLoraReqOutput:
+    success: bool
+    message: str = ""
+
+
 @dataclass
 class GetLoadReqInput(BaseReq):
     pass
diff --git a/python/tokenspeed/runtime/engine/request_handler.py b/python/tokenspeed/runtime/engine/request_handler.py
index aa0b31fc5..3480aa4b4 100644
--- a/python/tokenspeed/runtime/engine/request_handler.py
+++ b/python/tokenspeed/runtime/engine/request_handler.py
@@ -41,12 +41,16 @@
     GetInternalStateReqOutput,
     GetLoadReqInput,
     GetLoadReqOutput,
+    LoadLoraReqInput,
+    LoadLoraReqOutput,
     ProfileReq,
     ProfileReqOutput,
     ProfileReqType,
     SetInternalStateReq,
     SetInternalStateReqOutput,
     TokenizedGenerateReqInput,
+    UnloadLoraReqInput,
+    UnloadLoraReqOutput,
 )
 from tokenspeed.runtime.engine.request_types import FINISH_ABORT
 from tokenspeed.runtime.engine.scheduler_utils import make_spec
@@ -80,6 +84,8 @@ def __init__(
         send_func,
         get_load_fn=None,
         architectures: list[str] | None = None,
+        load_lora_fn=None,
+        unload_lora_fn=None,
     ) -> None:
 
         self.forward_ct = 0
@@ -97,6 +103,8 @@ def __init__(
         self.max_req_len = max_req_len
         self.vocab_size = vocab_size
         self.get_load_fn = get_load_fn
+        self.load_lora_fn = load_lora_fn
+        self.unload_lora_fn = unload_lora_fn
 
         self.tokenizer = get_tokenizer(
             server_args.tokenizer,
@@ -176,6 +184,34 @@ def process_requests(self, recv_reqs: list):
                     self.send_func.send_pyobj(self.get_load_fn())
                 else:
                     self.send_func.send_pyobj(GetLoadReqOutput())
+            elif isinstance(recv_req, LoadLoraReqInput):
+                try:
+                    if self.load_lora_fn is not None:
+                        lora_id = self.load_lora_fn(
+                            recv_req.lora_name, recv_req.adapter_path
+                        )
+                        self.send_func.send_pyobj(
+                            LoadLoraReqOutput(success=True, lora_id=lora_id)
+                        )
+                    else:
+                        self.send_func.send_pyobj(
+                            LoadLoraReqOutput(
+                                success=False, message="LoRA not enabled on this server"
+                            )
+                        )
+                except Exception as e:
+                    self.send_func.send_pyobj(
+                        LoadLoraReqOutput(success=False, message=str(e))
+                    )
+            elif isinstance(recv_req, UnloadLoraReqInput):
+                try:
+                    if self.unload_lora_fn is not None:
+                        self.unload_lora_fn(recv_req.lora_name)
+                    self.send_func.send_pyobj(UnloadLoraReqOutput(success=True))
+                except Exception as e:
+                    self.send_func.send_pyobj(
+                        UnloadLoraReqOutput(success=False, message=str(e))
+                    )
             else:
                 raise NotImplementedError(f"Unsupported request type: {type(recv_req)}")
         return new_req_specs, req_states, bootstrap_infos, abort_rids
@@ -190,6 +226,7 @@ def handle_generate_request(
         req_spec = make_spec(
             rid=recv_req.rid,
             tokens=recv_req.input_ids,
+            lora_id=getattr(recv_req, "lora_id", 0),
         )
         req_state = RequestState.from_recv_req(
             recv_req,
diff --git a/python/tokenspeed/runtime/engine/scheduler_control_client.py b/python/tokenspeed/runtime/engine/scheduler_control_client.py
index 52fb5d9c7..8325a2527 100755
--- a/python/tokenspeed/runtime/engine/scheduler_control_client.py
+++ b/python/tokenspeed/runtime/engine/scheduler_control_client.py
@@ -47,6 +47,8 @@
     GetWeightsByNameReqOutput,
     InitWeightsUpdateGroupReqInput,
     InitWeightsUpdateGroupReqOutput,
+    LoadLoraReqInput,
+    LoadLoraReqOutput,
     ProfileReq,
     ProfileReqOutput,
     ProfileReqType,
@@ -56,6 +58,8 @@
     ResumeMemoryOccupationReqOutput,
     SetInternalStateReq,
     SetInternalStateReqOutput,
+    UnloadLoraReqInput,
+    UnloadLoraReqOutput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromDistributedReqOutput,
     UpdateWeightsFromTensorReqInput,
@@ -95,7 +99,7 @@ async def queueing_call(self, obj: T):
             assert self._result_values is None
 
         if obj:
-            self._sender.send_pyobj(obj)
+            await self._sender.send_pyobj(obj)
 
         self._result_event = asyncio.Event()
         self._result_values = []
@@ -115,7 +119,7 @@ async def watching_call(self, obj):
             self._result_event = asyncio.Event()
 
             if obj:
-                self._sender.send_pyobj(obj)
+                await self._sender.send_pyobj(obj)
 
         await self._result_event.wait()
         result_values = copy.deepcopy(self._result_values)
@@ -178,6 +182,12 @@ def init_communicators(self: AsyncLLM, server_args: ServerArgs):
             server_args.mapping.attn.dp_size,
             mode="watching",
         )
+        self.load_lora_communicator = _Communicator(
+            self.engine_core_client.send_to_scheduler, server_args.mapping.attn.dp_size
+        )
+        self.unload_lora_communicator = _Communicator(
+            self.engine_core_client.send_to_scheduler, server_args.mapping.attn.dp_size
+        )
 
         self._result_dispatcher += self._get_communicator_dispatcher()
 
@@ -232,9 +242,39 @@ def _get_communicator_dispatcher(self: AsyncLLM):
                     GetLoadReqOutput,
                     self.get_load_communicator.handle_recv,
                 ),
+                (
+                    LoadLoraReqOutput,
+                    self.load_lora_communicator.handle_recv,
+                ),
+                (
+                    UnloadLoraReqOutput,
+                    self.unload_lora_communicator.handle_recv,
+                ),
             ]
         )
 
+    async def load_lora_adapter(
+        self: "AsyncLLM",
+        lora_name: str,
+        adapter_path: str,
+    ) -> tuple[bool, int, str]:
+        """Send a LoadLoraReqInput to the scheduler subprocess."""
+        self.auto_create_handle_loop()
+        result = (
+            await self.load_lora_communicator(
+                LoadLoraReqInput(lora_name=lora_name, adapter_path=adapter_path)
+            )
+        )[0]
+        return result.success, result.lora_id, result.message
+
+    async def unload_lora_adapter(self: "AsyncLLM", lora_name: str) -> tuple[bool, str]:
+        """Send an UnloadLoraReqInput to the scheduler subprocess."""
+        self.auto_create_handle_loop()
+        result = (
+            await self.unload_lora_communicator(UnloadLoraReqInput(lora_name=lora_name))
+        )[0]
+        return result.success, result.message
+
     async def flush_cache(self: AsyncLLM) -> FlushCacheReqOutput:
         return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
 
diff --git a/python/tokenspeed/runtime/engine/scheduler_utils.py b/python/tokenspeed/runtime/engine/scheduler_utils.py
index 7ddee553e..fa0c8deff 100644
--- a/python/tokenspeed/runtime/engine/scheduler_utils.py
+++ b/python/tokenspeed/runtime/engine/scheduler_utils.py
@@ -30,9 +30,7 @@
     ExecutionEvent,
     ForwardEvent,
     PagedCacheGroupConfig,
-    PagedCacheGroupFamily,
     PagedCacheRetention,
-    PrefixCacheAdjunctSpec,
     RequestSpec,
     SchedulerConfig,
 )
@@ -44,10 +42,11 @@
 _TRUTHY_ENV_VALUES = {"1", "true", "yes", "on"}
 
 
-def make_spec(rid: str, tokens: list[int]) -> RequestSpec:
+def make_spec(rid: str, tokens: list[int], lora_id: int = 0) -> RequestSpec:
     spec = RequestSpec()
     spec.request_id = rid
     spec.tokens = tokens
+    spec.lora_id = lora_id
     return spec
 
 
@@ -71,7 +70,6 @@ def make_config(
     mamba_l2_host_slots: int = 0,
     paged_cache_groups: Sequence["PagedCacheGroupConfig"] | None = None,
     enable_mixed_prefill_decode: bool = False,
-    prefix_cache_adjunct: "PrefixCacheAdjunctSpec | None" = None,
 ) -> SchedulerConfig:
     cfg = SchedulerConfig()
     cfg.num_device_pages = num_device_pages
@@ -103,15 +101,12 @@ def make_config(
     cfg.enable_mixed_prefill_decode = enable_mixed_prefill_decode
     if paged_cache_groups:
         cfg.paged_cache_groups = list(paged_cache_groups)
-    # Opt-in; unset means paged-cache groups are transport-only.
-    if prefix_cache_adjunct is not None:
-        cfg.prefix_cache_adjunct = prefix_cache_adjunct
     return cfg
 
 
 def pool_to_paged_cache_groups(pool: Any) -> list:
     """Convert a KV pool's paged_cache_group_specs to scheduler configs."""
-    specs = pool.paged_cache_group_specs
+    specs = getattr(pool, "paged_cache_group_specs", ())
     if not specs:
         return []
     counts = pool.paged_cache_group_page_counts
@@ -126,23 +121,12 @@ def pool_to_paged_cache_groups(pool: Any) -> list:
                 f"pool_to_paged_cache_groups: unsupported retention "
                 f"{spec.retention!r} for group {spec.group_id!r}"
             )
-        family_str = getattr(spec, "family", "history")
-        if family_str == "history":
-            family = PagedCacheGroupFamily.History
-        elif family_str == "state":
-            family = PagedCacheGroupFamily.State
-        else:
-            raise ValueError(
-                f"pool_to_paged_cache_groups: unsupported family "
-                f"{family_str!r} for group {spec.group_id!r}"
-            )
         kwargs = dict(
             group_id=spec.group_id,
             rows_per_page=int(spec.rows_per_page),
             entry_stride_tokens=int(spec.entry_stride_tokens),
             total_pages=int(counts[spec.group_id]),
             retention=retention,
-            family=family,
         )
         if spec.retention == "sliding_window":
             kwargs["sliding_window_tokens"] = int(spec.sliding_window_tokens)
@@ -150,19 +134,6 @@ def pool_to_paged_cache_groups(pool: Any) -> list:
     return out
 
 
-def pool_to_prefix_cache_adjunct_spec(
-    required_group_ids: Sequence[str],
-) -> "PrefixCacheAdjunctSpec":
-    """Build a PrefixCacheAdjunctSpec from a non-empty required-group-id list."""
-    if not required_group_ids:
-        raise ValueError(
-            "pool_to_prefix_cache_adjunct_spec: required_group_ids must be non-empty"
-        )
-    spec = PrefixCacheAdjunctSpec()
-    spec.required_groups = [str(gid) for gid in required_group_ids]
-    return spec
-
-
 def make_extend_result_event(request_id: str, tokens: list[int] = ()) -> None:
     fe = ForwardEvent.ExtendResult()
     fe.request_id = request_id
diff --git a/python/tokenspeed/runtime/entrypoints/engine.py b/python/tokenspeed/runtime/entrypoints/engine.py
index 048964e9a..156508022 100755
--- a/python/tokenspeed/runtime/entrypoints/engine.py
+++ b/python/tokenspeed/runtime/entrypoints/engine.py
@@ -170,6 +170,7 @@ def generate(
         bootstrap_port: list[int] | int | None = None,
         bootstrap_room: list[int] | int | None = None,
         data_parallel_rank: int | None = None,
+        lora_name: list[str | None] | str | None = None,
     ) -> dict | Iterator[dict]:
         """
         The arguments of this function match
@@ -209,6 +210,7 @@ def generate(
             bootstrap_host=bootstrap_host,
             bootstrap_port=bootstrap_port,
             bootstrap_room=bootstrap_room,
+            lora_name=lora_name,
         )
         if stream:
             return self.llm.generate_stream(obj)
@@ -245,6 +247,7 @@ async def async_generate(
         bootstrap_port: list[int] | int | None = None,
         bootstrap_room: list[int] | int | None = None,
         user_rid: list[str] | str | None = None,
+        lora_name: list[str | None] | str | None = None,
     ) -> dict | AsyncIterator[dict]:
         """
         The arguments of this function match
@@ -279,6 +282,7 @@ async def async_generate(
             bootstrap_port=bootstrap_port,
             bootstrap_room=bootstrap_room,
             user_rid=user_rid,
+            lora_name=lora_name,
         )
         generator = self.tokenizer_manager.generate_request(obj)
 
@@ -435,6 +439,32 @@ def collective_rpc(self, method: str, **kwargs):
         assert isinstance(recv_req, RpcReqOutput)
         assert recv_req.success, recv_req.message
 
+    def load_lora_adapter(
+        self,
+        lora_name: str,
+        adapter_path: str,
+    ) -> int:
+        """Load a PEFT LoRA adapter. Returns the integer lora_id."""
+        success, lora_id, message = self.llm.run(
+            self.tokenizer_manager.load_lora_adapter(lora_name, adapter_path)
+        )
+        if not success:
+            raise RuntimeError(f"Failed to load LoRA adapter '{lora_name}': {message}")
+        # Update the local name→id registry so future requests resolve correctly.
+        self.tokenizer_manager._lora_name_to_id[lora_name] = lora_id
+        return lora_id
+
+    def unload_lora_adapter(self, lora_name: str) -> None:
+        """Unload a previously loaded LoRA adapter."""
+        success, message = self.llm.run(
+            self.tokenizer_manager.unload_lora_adapter(lora_name)
+        )
+        if not success:
+            raise RuntimeError(
+                f"Failed to unload LoRA adapter '{lora_name}': {message}"
+            )
+        self.tokenizer_manager._lora_name_to_id.pop(lora_name, None)
+
     def save_remote_model(self, **kwargs):
         self.collective_rpc("save_remote_model", **kwargs)
 
diff --git a/python/tokenspeed/runtime/entrypoints/engine_base.py b/python/tokenspeed/runtime/entrypoints/engine_base.py
index 4654f25d6..c4e141d76 100755
--- a/python/tokenspeed/runtime/entrypoints/engine_base.py
+++ b/python/tokenspeed/runtime/entrypoints/engine_base.py
@@ -56,6 +56,7 @@ def generate(
         bootstrap_port: list[int] | int | None = None,
         bootstrap_room: list[int] | int | None = None,
         data_parallel_rank: int | None = None,
+        lora_name: list[str | None] | str | None = None,
     ) -> dict | Iterator[dict]:
         """Generate outputs based on given inputs."""
 
@@ -83,3 +84,32 @@ def resume_memory_occupation(self) -> None:
     @abstractmethod
     def shutdown(self) -> None:
         """Shutdown the engine and clean up resources."""
+
+    # ------------------------------------------------------------------
+    # LoRA adapter management
+    # ------------------------------------------------------------------
+
+    def load_lora_adapter(
+        self,
+        lora_name: str,
+        adapter_path: str,
+    ) -> int:
+        """Load a PEFT LoRA adapter and make it available for serving.
+
+        Args:
+            lora_name: Short identifier used by request-time lora_name.
+            adapter_path: Filesystem path to the PEFT adapter directory.
+
+        Returns:
+            Integer lora_id assigned to this adapter.
+        """
+        raise NotImplementedError(
+            "load_lora_adapter() is not implemented on this engine type. "
+            "Use the tokenspeed serve engine."
+        )
+
+    def unload_lora_adapter(self, lora_name: str) -> None:
+        """Unload a previously loaded LoRA adapter and free its GPU slot."""
+        raise NotImplementedError(
+            "unload_lora_adapter() is not implemented on this engine type."
+        )
diff --git a/python/tokenspeed/runtime/execution/context.py b/python/tokenspeed/runtime/execution/context.py
index e5cb59f39..324a61971 100644
--- a/python/tokenspeed/runtime/execution/context.py
+++ b/python/tokenspeed/runtime/execution/context.py
@@ -20,8 +20,11 @@
 
 from __future__ import annotations
 
+from collections.abc import Iterator
+from contextlib import contextmanager
+from contextvars import ContextVar
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -33,6 +36,24 @@
 if TYPE_CHECKING:
     from tokenspeed.runtime.layers.attention.backends.base import AttentionBackend
     from tokenspeed.runtime.layers.attention.kv_cache.base import BaseTokenToKVPool
+    from tokenspeed.runtime.lora.lora_manager import LoraManager
+
+_CURRENT_LORA_MANAGER: ContextVar[Optional["LoraManager"]] = ContextVar(
+    "tokenspeed_current_lora_manager", default=None
+)
+
+
+def get_current_lora_manager() -> Optional["LoraManager"]:
+    return _CURRENT_LORA_MANAGER.get()
+
+
+@contextmanager
+def bind_forward_context(ctx: "ForwardContext") -> Iterator[None]:
+    token = _CURRENT_LORA_MANAGER.set(ctx.lora_manager)
+    try:
+        yield
+    finally:
+        _CURRENT_LORA_MANAGER.reset(token)
 
 
 @dataclass
@@ -58,3 +79,11 @@ class ForwardContext:
 
     # --- logits processor ---
     gather_ids: torch.Tensor | None = None
+
+    # --- LoRA ---
+    # Reference to the LoraManager.  When set, forward layers call
+    # ``lora_manager.apply_qkv_lora`` / ``apply_o_lora`` which read from
+    # the manager's persistent batch_info.  Set at capture time when
+    # ``--enable-lora`` is on so the LoRA path is recorded into the graph
+    # (NO_LORA_SLOT = no adapter), otherwise None.
+    lora_manager: Optional["LoraManager"] = None
diff --git a/python/tokenspeed/runtime/execution/cuda_graph_wrapper.py b/python/tokenspeed/runtime/execution/cuda_graph_wrapper.py
index 9fa151db1..ee4403d01 100644
--- a/python/tokenspeed/runtime/execution/cuda_graph_wrapper.py
+++ b/python/tokenspeed/runtime/execution/cuda_graph_wrapper.py
@@ -50,6 +50,7 @@
     from tokenspeed.runtime.execution.runtime_states import RuntimeStates
     from tokenspeed.runtime.layers.attention.backends.base import AttentionBackend
     from tokenspeed.runtime.layers.attention.kv_cache.base import BaseTokenToKVPool
+    from tokenspeed.runtime.lora.lora_manager import LoraManager
     from tokenspeed.runtime.sampling.backends.base import SamplingBackend
 
 logger = get_colorful_logger(__name__)
@@ -194,6 +195,7 @@ def __init__(
         eager_grammar_buffers=None,
         sampling_backend: SamplingBackend | None = None,
         runtime_states: RuntimeStates | None = None,
+        lora_manager: LoraManager | None = None,
     ):
         self.config = config
         self.attn_backend = attn_backend
@@ -206,6 +208,7 @@ def __init__(
         self.capturable_grammar = capturable_grammar
         self.eager_grammar_buffers = eager_grammar_buffers
         self.runtime_states = runtime_states
+        self.lora_manager = lora_manager
         self.enable_torch_compile = getattr(config, "enable_torch_compile", False)
         self.disable_padding = config.disable_cuda_graph_padding
         self.enable_cudagraph_gc = getattr(config, "enable_cudagraph_gc", True)
@@ -255,6 +258,12 @@ def __init__(
 
         self.graphs: dict[int, torch.cuda.CUDAGraph] = {}
         self.output_buffers: dict[int, tuple] = {}
+        # Per-bs no-LoRA variant.  Populated only when ``lora_manager`` is
+        # configured: a second captured graph that omits the LoRA Triton
+        # kernels entirely, replayed when ``LoraManager.has_active_lora``
+        # is False so base-model decode pays no LoRA overhead at all.
+        self.graphs_no_lora: dict[int, torch.cuda.CUDAGraph] = {}
+        self.output_buffers_no_lora: dict[int, tuple] = {}
 
         self._forward_func: Callable | None = forward_func
         self.disable = config.enforce_eager
@@ -270,15 +279,26 @@ def capture(self):
         """
         Capture CUDA graphs for all configured batch sizes.
 
+        When a ``lora_manager`` is attached, captures TWO graphs per batch
+        size: a with-LoRA graph (records the segmented-GEMM Triton kernels
+        and feeds them with the manager's persistent batch_info) and a
+        no-LoRA graph (omits those kernels entirely).  Replay picks the
+        no-LoRA variant when ``has_active_lora`` is False.
+
         Args:
             forward_func: ModelExecutor.forward_step(bs, ctx, sampling_info).
         """
         rank = self.global_rank
+        capture_no_lora_too = self.lora_manager is not None
         with freeze_gc(self.enable_cudagraph_gc):
             self.stream = torch.cuda.Stream()
             capture_range = tqdm.tqdm(self.capture_bs) if rank == 0 else self.capture_bs
             if rank == 0:
-                logger.info("Capturing batches: %s", self.capture_bs)
+                logger.info(
+                    "Capturing batches: %s%s",
+                    self.capture_bs,
+                    " (×2: with-LoRA + no-LoRA)" if capture_no_lora_too else "",
+                )
             for bs in capture_range:
                 if rank == 0:
                     avail_mem = get_available_gpu_memory(
@@ -287,11 +307,15 @@ def capture(self):
                     capture_range.set_description(
                         f"Capturing batches ({bs=} {avail_mem=:.2f} GB)"
                     )
-                graph, output_buffers = self._capture_one(bs)
+                graph, output_buffers = self._capture_one(bs, attach_lora=True)
                 self.graphs[bs] = graph
                 self.output_buffers[bs] = output_buffers
+                if capture_no_lora_too:
+                    graph_nl, output_nl = self._capture_one(bs, attach_lora=False)
+                    self.graphs_no_lora[bs] = graph_nl
+                    self.output_buffers_no_lora[bs] = output_nl
 
-    def _capture_one(self, bs: int):
+    def _capture_one(self, bs: int, attach_lora: bool = True):
         graph = torch.cuda.CUDAGraph()
 
         ctx = ForwardContext(
@@ -314,6 +338,44 @@ def _capture_one(self, bs: int):
         if self.dp_size > 1:
             ctx.global_num_tokens = [bs * self.max_tokens_per_req] * self.world_size
 
+        # Bind LoRA only for the with-LoRA variant.  When ``attach_lora``
+        # is False we capture a parallel graph that omits the LoRA Triton
+        # kernels entirely (qwen3's ``if ctx.lora_manager is not None``
+        # branch falls through), used at replay when no request in the
+        # batch has an active adapter.
+        if attach_lora and self.lora_manager is not None:
+            ctx.lora_manager = self.lora_manager
+            # Pre-fill batch_info so the captured kernels see a stable
+            # set of pointers; runtime updates the same tensors before
+            # each ``graph.replay()`` and the kernels re-read seg_lens /
+            # weight_indices / lora_ranks.
+            #
+            # Use lora_id=0 (base model) which resolves to NO_LORA_SLOT, BUT
+            # force has_active_lora=True so LoRA kernels ARE captured in the
+            # graph.  With dynamic GPU-tensor weight indexing (w13_A_buffers
+            # etc.) the captured kernels read weight_indices at replay time,
+            # so the correct adapter slot is used regardless of what was set
+            # during capture.  Slot 0 weights are all-zero at capture time
+            # (no adapter loaded yet), so the model output is unaffected.
+            self.lora_manager.prepare_loras(
+                [0] * bs, per_request_token_counts=self.max_tokens_per_req
+            )
+            # Force has_active_lora and single_lora_slot so ALL LoRA kernels
+            # (MoE, attention, MLP) are included in the captured graph.
+            # This applies to any enabled LoRA type — without this, kernels that
+            # check has_active_lora (e.g. apply_qkv_lora) return early during
+            # capture, recording a no-op that is then replayed at every decode step.
+            if (
+                self.lora_manager.enable_moe_lora
+                or self.lora_manager.enable_attn_lora
+                or self.lora_manager.enable_mlp_lora
+            ):
+                self.lora_manager.has_active_lora = True
+                bi = self.lora_manager._batch_info
+                bi.single_lora_slot = 0
+                bi.single_lora_rank = self.lora_manager.max_lora_rank
+                bi.weight_indices[:bs].fill_(0)
+
         # Capture with is_all_greedy=False so the graph records the full
         # top_k_top_p_sampling path (greedy-only requests are served by the
         # same path with top_k=1 in the buffer, which effectively argmaxes).
@@ -332,6 +394,7 @@ def _capture_one(self, bs: int):
             device=self.device,
         )
 
+        from tokenspeed.runtime.execution.context import bind_forward_context
         from tokenspeed.runtime.grammar.capturable_grammar import (
             bind_grammar_mask_buf,
         )
@@ -359,7 +422,8 @@ def run_once():
                 self.capturable_grammar.add_batch(
                     grammars=[None] * bs, bs=bs, has_candidates=False
                 )
-            return self._forward_func(bs=bs, ctx=ctx, sampling_info=sampling_info)
+            with bind_forward_context(ctx):
+                return self._forward_func(bs=bs, ctx=ctx, sampling_info=sampling_info)
 
         # Warm up before capture.
         for _ in range(4):
@@ -790,12 +854,25 @@ def __call__(
             # the per-request generators with the capture-stub generator.
             self.deepep_adapter.replay()
 
+            # Pick the no-LoRA variant when --enable-lora is on but no
+            # request in this batch uses an adapter — that graph omits the
+            # per-layer Triton LoRA kernels entirely.
+            use_no_lora_variant = (
+                self.lora_manager is not None
+                and not self.lora_manager.has_active_lora
+                and padded_bs in self.graphs_no_lora
+            )
+            if use_no_lora_variant:
+                graph = self.graphs_no_lora[padded_bs]
+                output_buffers = self.output_buffers_no_lora[padded_bs]
+            else:
+                graph = self.graphs[padded_bs]
+                output_buffers = self.output_buffers[padded_bs]
+
             with nvtx_range("graph_replay", color="red"):
-                self.graphs[padded_bs].replay()
+                graph.replay()
 
-            output_tokens, output_lengths, output_logprobs = self.output_buffers[
-                padded_bs
-            ]
+            output_tokens, output_lengths, output_logprobs = output_buffers
 
             result = (
                 output_tokens[: bs * self.max_tokens_per_req],
@@ -839,7 +916,10 @@ def __call__(
                 **mamba_kwargs,
             )
 
-            result = self._forward_func(bs=bs, ctx=ctx, sampling_info=sampling_info)
+            from tokenspeed.runtime.execution.context import bind_forward_context
+
+            with bind_forward_context(ctx):
+                result = self._forward_func(bs=bs, ctx=ctx, sampling_info=sampling_info)
 
         # Update mamba/GDN state after speculative verify
         if (
diff --git a/python/tokenspeed/runtime/execution/model_executor.py b/python/tokenspeed/runtime/execution/model_executor.py
index 9c36205d9..30ba0fb9b 100644
--- a/python/tokenspeed/runtime/execution/model_executor.py
+++ b/python/tokenspeed/runtime/execution/model_executor.py
@@ -108,6 +108,18 @@ class ModelExecutorConfig:
     disable_capturable_grammar: bool = False
     mamba_cache_chunk_size: int = 64
 
+    # ====== LORA =========
+    enable_lora: bool = False
+    max_loras: int = 4
+    max_lora_rank: int = 64
+    # Tiered residence: at most ``max_loras`` adapters in GPU buffers,
+    # at most ``max_loras_cpu`` cached in pinned host memory; beyond
+    # that adapters fall back to their disk_path on next use.
+    max_loras_cpu: int = 16
+    lora_buffer_groups: str = "attn,mlp,moe"
+    lora_moe_compressed_shared_outer: bool = False
+    lora_scheduling_policy: str = "lru"
+
     @staticmethod
     def from_server_args(
         server_args: ServerArgs,
@@ -147,6 +159,15 @@ def from_server_args(
             spec_num_tokens=server_args.speculative_num_draft_tokens,
             grammar_backend=server_args.grammar_backend,
             disable_capturable_grammar=server_args.disable_capturable_grammar,
+            enable_lora=server_args.enable_lora,
+            max_loras=server_args.max_loras,
+            max_lora_rank=server_args.max_lora_rank,
+            max_loras_cpu=server_args.max_loras_cpu or 4 * server_args.max_loras,
+            lora_buffer_groups=server_args.lora_buffer_groups,
+            lora_moe_compressed_shared_outer=(
+                server_args.lora_moe_compressed_shared_outer
+            ),
+            lora_scheduling_policy=server_args.lora_scheduling_policy,
             mamba_cache_chunk_size=server_args.mamba_cache_chunk_size,
         )
 
@@ -177,6 +198,11 @@ def __init__(
         self.draft_attn_backend = draft_attn_backend
         self.draft_token_to_kv_pool = draft_token_to_kv_pool
 
+        # LoRA — created below before CudaGraphWrapper so that the captured
+        # graphs include the LoRA delta path (NO_LORA_SLOT = no adapter).
+        self.lora_manager = None
+        self.request_lora_ids: dict[str, int] = {}
+
         if config.spec_algo is not None:
             max_num_pages_per_req = (
                 config.context_len + config.spec_num_tokens + config.block_size - 1
@@ -274,6 +300,39 @@ def __init__(
                 req_to_page=self.req_to_page,
             )
 
+        if config.enable_lora:
+            from tokenspeed.runtime.lora.lora_manager import LoraManager
+
+            model = self.model_runner.model
+            lora_dtype = next(model.parameters()).dtype
+            lora_device = next(model.parameters()).device
+            attn_mapping = model_runner.mapping.attn
+            tp_size = attn_mapping.tp_size
+            tp_rank = attn_mapping.tp_rank
+            # ``tp_group`` is the rank-tuple expected by comm_ops.all_reduce
+            # (it routes through the codebase's graph-capturable backend).
+            tp_group = attn_mapping.tp_group if tp_size > 1 else None
+            self.lora_manager = LoraManager(
+                model_config=model_runner.model_config.hf_config,
+                max_loras=config.max_loras,
+                max_lora_rank=config.max_lora_rank,
+                max_num_tokens=config.chunked_prefill_size,
+                max_loras_cpu=config.max_loras_cpu,
+                dtype=lora_dtype,
+                device=lora_device,
+                tp_rank=tp_rank,
+                tp_size=tp_size,
+                tp_group=tp_group,
+                lora_buffer_groups={
+                    group.strip()
+                    for group in config.lora_buffer_groups.split(",")
+                    if group.strip()
+                },
+                lora_moe_compressed_shared_outer=(
+                    config.lora_moe_compressed_shared_outer
+                ),
+            )
+
         self.forward_step = CudaGraphWrapper(
             forward_func=self._forward_step,
             attn_backend=attn_backend,
@@ -287,6 +346,7 @@ def __init__(
             eager_grammar_buffers=self.eager_grammar_buffers,
             sampling_backend=self.sampling_backend,
             runtime_states=self.runtime_states,
+            lora_manager=self.lora_manager,
         )
 
         self.execution_stream = torch.cuda.Stream()
@@ -1069,6 +1129,21 @@ def execute_forward_op(
                     ),
                     gather_ids=gather_ids,
                 )
+                # Bind LoRA when adapters are active.  ``prepare_loras``
+                # writes per-segment metadata into the manager's persistent
+                # ``batch_info`` (the captured graph already references
+                # those tensors); we set ``ctx.lora_manager`` so the
+                # forward layers call into the LoRA delta path.
+                if self.lora_manager is not None and bs > 0:
+                    lora_ids = [
+                        self.request_lora_ids.get(rid, 0)
+                        for rid in forward_op.request_ids
+                    ]
+                    self.lora_manager.prepare_loras(
+                        lora_ids, list(forward_op.input_lengths)
+                    )
+                    if any(lid != 0 for lid in lora_ids):
+                        ctx.lora_manager = self.lora_manager
                 if self.config.data_parallel_size > 1:
                     if dp_global_num_tokens is None:
                         raise RuntimeError(
diff --git a/python/tokenspeed/runtime/execution/model_runner.py b/python/tokenspeed/runtime/execution/model_runner.py
index bb57b7ad5..62f0ad218 100644
--- a/python/tokenspeed/runtime/execution/model_runner.py
+++ b/python/tokenspeed/runtime/execution/model_runner.py
@@ -24,6 +24,7 @@
 
 import torch
 
+from tokenspeed.runtime.execution.context import bind_forward_context
 from tokenspeed.runtime.execution.weight_loader import WeightLoader
 from tokenspeed.runtime.utils import get_colorful_logger
 from tokenspeed.runtime.utils.env import global_server_args_dict_update
@@ -136,11 +137,12 @@ def forward(
         if captured_hidden_states is not None:
             kwargs["captured_hidden_states"] = captured_hidden_states
 
-        return self.model.forward(
-            ctx,
-            input_ids,
-            positions,
-            out_cache_loc,
-            input_lengths,
-            **kwargs,
-        )
+        with bind_forward_context(ctx):
+            return self.model.forward(
+                ctx,
+                input_ids,
+                positions,
+                out_cache_loc,
+                input_lengths,
+                **kwargs,
+            )
diff --git a/python/tokenspeed/runtime/layers/logits_processor.py b/python/tokenspeed/runtime/layers/logits_processor.py
index 0b7a26865..ae264b9d4 100755
--- a/python/tokenspeed/runtime/layers/logits_processor.py
+++ b/python/tokenspeed/runtime/layers/logits_processor.py
@@ -28,7 +28,10 @@
 from torch import nn
 
 from tokenspeed.runtime.distributed.comm_ops import all_gather_into_tensor
-from tokenspeed.runtime.execution.context import ForwardContext
+from tokenspeed.runtime.execution.context import (
+    ForwardContext,
+    get_current_lora_manager,
+)
 from tokenspeed.runtime.execution.forward_batch_info import (
     CaptureHiddenMode,
     ForwardMode,
@@ -396,6 +399,10 @@ def _get_logits(
         if self.logit_scale is not None:
             logits.mul_(self.logit_scale)
 
+        lora_manager = get_current_lora_manager()
+        if lora_manager is not None and lora_manager.enable_head_lora:
+            logits = lora_manager.apply_lm_head_lora(hidden_states, logits)
+
         if self.tp_size > 1 and not self.skip_all_gather:
             gathered_logits = torch.empty(
                 self.tp_size * logits.size(0),
diff --git a/python/tokenspeed/runtime/layers/moe/backends/E=128,inter_size=384,hidden_size=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16_down.json b/python/tokenspeed/runtime/layers/moe/backends/E=128,inter_size=384,hidden_size=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16_down.json
new file mode 100644
index 000000000..1e28041de
--- /dev/null
+++ b/python/tokenspeed/runtime/layers/moe/backends/E=128,inter_size=384,hidden_size=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16_down.json
@@ -0,0 +1,11 @@
+{
+    "1":    {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "USE_TMA": false},
+    "8":    {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "USE_TMA": false},
+    "16":   {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "USE_TMA": false},
+    "32":   {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "USE_TMA": false},
+    "64":   {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "USE_TMA": false},
+    "128":  {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "USE_TMA": false},
+    "129":  {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "USE_TMA": false},
+    "512":  {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "USE_TMA": false},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "USE_TMA": false}
+}
diff --git a/python/tokenspeed/runtime/layers/moe/backends/base.py b/python/tokenspeed/runtime/layers/moe/backends/base.py
index 1dfe8e51d..b1f7b3fa2 100644
--- a/python/tokenspeed/runtime/layers/moe/backends/base.py
+++ b/python/tokenspeed/runtime/layers/moe/backends/base.py
@@ -95,6 +95,10 @@ def supports_deferred_finalize(self) -> bool:
         """
         return False
 
+    @property
+    def supports_moe_lora(self) -> bool:
+        return False
+
     @property
     def topk_output_format(self) -> TopKOutputFormat:
         return TopKOutputFormat.STANDARD
diff --git a/python/tokenspeed/runtime/layers/moe/backends/fp8/triton.py b/python/tokenspeed/runtime/layers/moe/backends/fp8/triton.py
index 4dc4ebccb..5cd0de555 100644
--- a/python/tokenspeed/runtime/layers/moe/backends/fp8/triton.py
+++ b/python/tokenspeed/runtime/layers/moe/backends/fp8/triton.py
@@ -78,6 +78,7 @@ def forward(
         topk_output,
         num_global_tokens,
         max_num_tokens_per_gpu,
+        moe_lora_context=None,
     ):
         del num_global_tokens, max_num_tokens_per_gpu
         return triton_forward(
@@ -88,7 +89,12 @@ def forward(
             layer,
             hidden_states,
             topk_output,
+            moe_lora_context=moe_lora_context,
         )
 
+    @property
+    def supports_moe_lora(self) -> bool:
+        return True
+
 
 __all__ = ["Fp8TritonBackend"]
diff --git a/python/tokenspeed/runtime/layers/moe/backends/triton_common.py b/python/tokenspeed/runtime/layers/moe/backends/triton_common.py
index c67208400..4de5ac6c4 100644
--- a/python/tokenspeed/runtime/layers/moe/backends/triton_common.py
+++ b/python/tokenspeed/runtime/layers/moe/backends/triton_common.py
@@ -121,6 +121,7 @@ def triton_forward(
     layer: nn.Module,
     hidden_states: torch.Tensor,
     topk_output: object,
+    moe_lora_context=None,
 ) -> torch.Tensor:
     from tokenspeed.runtime.layers.activation import silu_and_mul
 
@@ -193,6 +194,11 @@ def triton_forward(
         dtype=dtype,
     )
 
+    # Prefetch gate_up LoRA A-shrink on secondary stream, concurrent with gate_up_gemm.
+    if moe_lora_context is not None:
+        moe_lora_context.launch_gate_up_shrink(
+            layer.layer_index, hidden_states, topk_ids
+        )
     gate_up_gemm(
         A=hidden_states,
         B=layer.w13_weight,
@@ -208,6 +214,14 @@ def triton_forward(
         b_use_tma=gate_up_moe_use_tma,
         c_sorted=down_moe_use_tma,
     )
+    if moe_lora_context is not None:
+        moe_lora_context.apply_gate_up_lora(
+            layer.layer_index,
+            hidden_states,
+            topk_ids,
+            intermediate_cache1,
+            sorted_token_ids=sorted_token_ids if down_moe_use_tma else None,
+        )
 
     if activation == "silu":
         silu_and_mul(
@@ -217,6 +231,14 @@ def triton_forward(
     else:
         raise ValueError(f"Unsupported activation: {activation}")
 
+    # Prefetch down LoRA A-shrink on secondary stream, concurrent with down_gemm.
+    if moe_lora_context is not None and not down_moe_use_tma:
+        moe_lora_context.launch_down_shrink(
+            layer.layer_index,
+            intermediate_cache2,
+            topk_ids,
+            m_tokens * top_k,
+        )
     down_gemm(
         A=intermediate_cache2,
         B=layer.w2_weight,
@@ -231,6 +253,15 @@ def triton_forward(
         a_use_tma=down_moe_use_tma,
         b_use_tma=down_moe_use_tma,
     )
+    if moe_lora_context is not None:
+        moe_lora_context.apply_down_lora(
+            layer.layer_index,
+            intermediate_cache2,
+            topk_ids,
+            topk_weights,
+            intermediate_cache3,
+            sorted_token_ids=sorted_token_ids if down_moe_use_tma else None,
+        )
 
     out_hidden_states = torch.empty_like(hidden_states)
     # Current limitation: Should avoid using runtime shapes as traits
diff --git a/python/tokenspeed/runtime/layers/moe/backends/unquantized/triton.py b/python/tokenspeed/runtime/layers/moe/backends/unquantized/triton.py
index 77cc34b56..f44840e66 100644
--- a/python/tokenspeed/runtime/layers/moe/backends/unquantized/triton.py
+++ b/python/tokenspeed/runtime/layers/moe/backends/unquantized/triton.py
@@ -60,6 +60,7 @@ def forward(
         topk_output,
         num_global_tokens,
         max_num_tokens_per_gpu,
+        moe_lora_context=None,
     ):
         del num_global_tokens, max_num_tokens_per_gpu
         return triton_forward(
@@ -70,7 +71,12 @@ def forward(
             layer,
             hidden_states,
             topk_output,
+            moe_lora_context=moe_lora_context,
         )
 
+    @property
+    def supports_moe_lora(self) -> bool:
+        return True
+
 
 __all__ = ["Bf16TritonBackend"]
diff --git a/python/tokenspeed/runtime/layers/moe/backends/w8a8_fp8/triton.py b/python/tokenspeed/runtime/layers/moe/backends/w8a8_fp8/triton.py
index 35061ef35..fec9f1e7a 100644
--- a/python/tokenspeed/runtime/layers/moe/backends/w8a8_fp8/triton.py
+++ b/python/tokenspeed/runtime/layers/moe/backends/w8a8_fp8/triton.py
@@ -78,6 +78,7 @@ def forward(
         topk_output,
         num_global_tokens,
         max_num_tokens_per_gpu,
+        moe_lora_context=None,
     ):
         del num_global_tokens, max_num_tokens_per_gpu
         return triton_forward(
@@ -88,8 +89,13 @@ def forward(
             layer,
             hidden_states,
             topk_output,
+            moe_lora_context=moe_lora_context,
         )
 
+    @property
+    def supports_moe_lora(self) -> bool:
+        return True
+
 
 W8A8Fp8TritonBackend = W8A8PerTokenPerChannelFp8TritonBackend
 
diff --git a/python/tokenspeed/runtime/layers/moe/layer.py b/python/tokenspeed/runtime/layers/moe/layer.py
index ef5790969..2f3e2da8d 100755
--- a/python/tokenspeed/runtime/layers/moe/layer.py
+++ b/python/tokenspeed/runtime/layers/moe/layer.py
@@ -21,6 +21,7 @@
 
 import torch
 
+from tokenspeed.runtime.execution.context import get_current_lora_manager
 from tokenspeed.runtime.layers.activation import SwigluArg
 from tokenspeed.runtime.layers.moe.core import MoELayerSpec, select_backend
 from tokenspeed.runtime.layers.moe.utils import get_all2all_backend
@@ -155,6 +156,7 @@ def forward(
         num_global_tokens: int,
         max_num_tokens_per_gpu: int,
         do_finalize: bool = True,
+        lora_manager=None,
     ):
         # Only pass ``do_finalize`` through when the caller actually wants
         # the deferred path. Other backends do not accept this kwarg;
@@ -166,6 +168,21 @@ def forward(
                 self.backend.supports_deferred_finalize
             ), f"{type(self.backend).__name__} does not support do_finalize=False"
             kwargs["do_finalize"] = False
+        if lora_manager is None:
+            lora_manager = get_current_lora_manager()
+        if lora_manager is not None:
+            if not self.backend.supports_moe_lora:
+                raise NotImplementedError(
+                    f"{type(self.backend).__name__} does not support MoE LoRA; "
+                    "use the Triton backend instead."
+                )
+            if self.ep_size != 1:
+                raise NotImplementedError(
+                    "MoE LoRA currently supports local/Tensor-Parallel MoE only; "
+                    "expert-parallel dispatch needs the LoRA slot map to be "
+                    "dispatched with tokens."
+                )
+            kwargs["moe_lora_context"] = lora_manager.moe_lora_context
         return self.backend.forward(
             self,
             hidden_states,
diff --git a/python/tokenspeed/runtime/lora/__init__.py b/python/tokenspeed/runtime/lora/__init__.py
new file mode 100644
index 000000000..57692962f
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/__init__.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""LoRA adapter serving runtime."""
+
+from tokenspeed.runtime.lora.lora_config import LoraConfig
+
+__all__ = ["LoraConfig", "LoraRegistry"]
+
+
+def __getattr__(name: str):
+    if name == "LoraRegistry":
+        from tokenspeed.runtime.lora.lora_registry import LoraRegistry
+
+        return LoraRegistry
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/python/tokenspeed/runtime/lora/adapter_io.py b/python/tokenspeed/runtime/lora/adapter_io.py
new file mode 100644
index 000000000..d92020c13
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/adapter_io.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""PEFT LoRA adapter loading and metadata helpers."""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+
+import torch
+
+PEFT_ATTN_MODULES = ("q_proj", "k_proj", "v_proj", "o_proj")
+PEFT_MLP_MODULES = ("gate_proj", "up_proj", "down_proj")
+PEFT_EXPERT_MODULES = PEFT_MLP_MODULES
+PEFT_HEAD_MODULE = "lm_head"
+PEFT_MODULES = (*PEFT_ATTN_MODULES, *PEFT_MLP_MODULES)
+
+# Sentinel layer_id used for model-global modules (e.g. lm_head) that have no
+# per-layer index in AdapterWeights.
+LORA_HEAD_LAYER_ID = -1
+
+AdapterWeights = dict[int, dict[str, tuple[torch.Tensor, torch.Tensor]]]
+
+
+def resolve_adapter_weight_path(adapter_path: str) -> str:
+    safetensors_path = os.path.join(adapter_path, "adapter_model.safetensors")
+    return safetensors_path if os.path.exists(safetensors_path) else adapter_path
+
+
+def load_adapter_weights(adapter_path: str) -> AdapterWeights:
+    return parse_adapter_weights(
+        load_safetensors(resolve_adapter_weight_path(adapter_path))
+    )
+
+
+def load_safetensors(path: str) -> dict[str, torch.Tensor]:
+    from safetensors import safe_open
+
+    tensors: dict[str, torch.Tensor] = {}
+    with safe_open(path, framework="pt", device="cpu") as f:
+        for key in f.keys():
+            tensors[key] = f.get_tensor(key)
+    return tensors
+
+
+def parse_adapter_weights(tensors: dict[str, torch.Tensor]) -> AdapterWeights:
+    """Return ``{layer_id: {module_name: (lora_A, lora_B)}}``.
+
+    Matches attention (``self_attn.{q,k,v,o}_proj``), MLP
+    (``mlp.{gate,up,down}_proj``), and lm_head PEFT module names.
+    lm_head weights are stored under ``LORA_HEAD_LAYER_ID`` (-1).
+    """
+    dense_pattern = re.compile(
+        r"base_model\.model\.model\.layers\.(\d+)\."
+        r"(?:self_attn|mlp)\."
+        r"(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)\."
+        r"lora_(A|B)\.weight"
+    )
+    expert_pattern = re.compile(
+        r"base_model\.model\.model\.layers\.(\d+)\."
+        r"mlp\.experts\.(\d+)\."
+        r"(gate_proj|up_proj|down_proj)\."
+        r"lora_(A|B)\.weight"
+    )
+    expert_3d_pattern = re.compile(
+        r"base_model\.model\.model\.layers\.(\d+)\."
+        r"mlp\.experts\."
+        r"(w1|w2|w3)\."
+        r"lora_(A|B)\.weight"
+    )
+    # PEFT uses ``lora_embedding_A/B`` (no ``.weight`` suffix) for modules
+    # treated as embedding tables (lm_head, embed_tokens).
+    head_pattern = re.compile(
+        r"base_model\.model\.lm_head\." r"(?:lora_(A|B)\.weight|lora_embedding_(A|B))"
+    )
+    weights: dict[int, dict[str, dict[str, torch.Tensor]]] = {}
+    for key, tensor in tensors.items():
+        m = dense_pattern.match(key)
+        if m:
+            layer_id, module, ab = int(m.group(1)), m.group(2), m.group(3)
+        else:
+            m = expert_pattern.match(key)
+            if m:
+                layer_id = int(m.group(1))
+                module = f"experts.{int(m.group(2))}.{m.group(3)}"
+                ab = m.group(4)
+            else:
+                m = expert_3d_pattern.match(key)
+                if m:
+                    layer_id = int(m.group(1))
+                    module = f"experts.{m.group(2)}"
+                    ab = m.group(3)
+                else:
+                    m = head_pattern.match(key)
+                    if not m:
+                        continue
+                    layer_id = LORA_HEAD_LAYER_ID
+                    module = PEFT_HEAD_MODULE
+                    ab = m.group(1) or m.group(2)
+        weights.setdefault(layer_id, {}).setdefault(module, {})[ab] = tensor
+
+    result: AdapterWeights = {}
+    for layer_id, modules in weights.items():
+        result[layer_id] = {}
+        for module, ab_dict in modules.items():
+            result[layer_id][module] = (ab_dict["A"], ab_dict["B"])
+    return result
+
+
+def read_adapter_scaling(adapter_path: str | None, rank: int) -> float:
+    if adapter_path is None:
+        return 1.0
+    config_file = os.path.join(adapter_path, "adapter_config.json")
+    if not os.path.exists(config_file):
+        return 1.0
+    try:
+        with open(config_file) as f:
+            cfg = json.load(f)
+        alpha = float(cfg.get("lora_alpha", rank))
+        r = int(cfg.get("r", rank))
+        return alpha / r if r > 0 else 1.0
+    except Exception:
+        return 1.0
diff --git a/python/tokenspeed/runtime/lora/lora_batch.py b/python/tokenspeed/runtime/lora/lora_batch.py
new file mode 100644
index 000000000..23064db4b
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/lora_batch.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Batch metadata structures for segmented LoRA kernels."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import torch
+
+NO_LORA_SLOT = -1
+
+
+@dataclass
+class LoraBatchInfo:
+    """Per-step segment metadata read by the LoRA kernels.
+
+    All tensors live on the LoRA device.  When the captured CUDA graph needs
+    persistent storage, :class:`LoraManager` pre-allocates these tensors with
+    maximum sizes; runtime fills the prefix and updates ``bs`` / ``max_len``.
+    """
+
+    bs: int
+    num_segments: int
+    max_len: int
+    seg_lens: torch.Tensor  # (num_segments,) int32
+    seg_indptr: torch.Tensor  # (num_segments + 1,) int32
+    weight_indices: torch.Tensor  # (num_segments,) int32
+    lora_ranks: torch.Tensor  # (n_slots,) int32; NO_LORA_SLOT means base model
+    scalings: torch.Tensor  # (n_slots,) float32
+    permutation: torch.Tensor | None = None  # unused (no sort by adapter yet)
+    # Adapter-group metadata for lora_expand_grouped_v2_fwd (decode path only).
+    # Populated by prepare_loras when max_len == 1.
+    sort_order: torch.Tensor | None = None  # (bs,) int64
+    group_slots: torch.Tensor | None = None  # (num_groups,) int32
+    group_starts: torch.Tensor | None = None  # (num_groups,) int32
+    group_sizes: torch.Tensor | None = None  # (num_groups,) int32
+    num_groups: int = 0
+    # Largest group size; pre-computed on CPU so the kernel grid avoids a
+    # GPU-CPU sync.  Equals max(group_sizes) when num_groups > 0, else 0.
+    max_group_size: int = 0
+    # Host-only fast-path metadata. Non-negative iff every segment in this step
+    # uses the same real adapter slot; NO_LORA_SLOT means mixed/base-only.
+    single_lora_slot: int = NO_LORA_SLOT
+    # Host-only active rank for ``single_lora_slot``. Zero when no single
+    # nonzero adapter slot is active.
+    single_lora_rank: int = 0
+    # Host-only metadata for the multi-adapter batched CuTeDSL fast path.
+    # Non-negative iff segments are equal-length, slots are consecutive, and
+    # all participating slots share rank/scaling.
+    multi_lora_start_slot: int = NO_LORA_SLOT
+    multi_lora_count: int = 0
+    multi_lora_segment_len: int = 0
+    multi_lora_rank: int = 0
+
+
+def build_decode_lora_groups(
+    per_request_slots: list[int],
+) -> tuple[list[int], list[int], list[int], list[int]]:
+    """Group decode requests by adapter slot for the grouped expand kernel.
+
+    Returns ``(sort_order, group_slots, group_starts, group_sizes)``.
+    ``group_starts`` are offsets into ``sort_order``.
+    """
+    sort_order = sorted(
+        (i for i, slot in enumerate(per_request_slots) if slot != NO_LORA_SLOT),
+        key=lambda i: per_request_slots[i],
+    )
+    group_slots: list[int] = []
+    group_starts: list[int] = []
+    group_sizes: list[int] = []
+    for pos, orig in enumerate(sort_order):
+        slot = per_request_slots[orig]
+        if not group_slots or group_slots[-1] != slot:
+            group_slots.append(slot)
+            group_starts.append(pos)
+            group_sizes.append(1)
+        else:
+            group_sizes[-1] += 1
+    return sort_order, group_slots, group_starts, group_sizes
diff --git a/python/tokenspeed/runtime/lora/lora_buffers.py b/python/tokenspeed/runtime/lora/lora_buffers.py
new file mode 100644
index 000000000..2b024f4d8
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/lora_buffers.py
@@ -0,0 +1,332 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""GPU-resident LoRA weight buffer layout and slot loading."""
+
+from __future__ import annotations
+
+import torch
+
+from tokenspeed.runtime.lora.adapter_io import (
+    LORA_HEAD_LAYER_ID,
+    PEFT_HEAD_MODULE,
+    AdapterWeights,
+)
+
+LORA_BUFFER_GROUPS = frozenset({"attn", "mlp", "moe", "lm_head"})
+
+
+class LoraWeightBuffers:
+    def __init__(
+        self,
+        *,
+        n_layers: int,
+        n_slots: int,
+        max_lora_rank: int,
+        hidden_size: int,
+        q_size_per_tp: int,
+        kv_size_per_tp: int,
+        o_in_per_tp: int,
+        intermediate_per_tp: int,
+        vocab_per_tp: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        tp_rank: int,
+        tp_size: int,
+        buffer_groups: set[str] | frozenset[str] = LORA_BUFFER_GROUPS,
+    ) -> None:
+        self.n_layers = n_layers
+        self.n_slots = n_slots
+        self.max_lora_rank = max_lora_rank
+        self.hidden_size = hidden_size
+        self.q_size_per_tp = q_size_per_tp
+        self.kv_size_per_tp = kv_size_per_tp
+        self.o_in_per_tp = o_in_per_tp
+        self.intermediate_per_tp = intermediate_per_tp
+        self.vocab_per_tp = vocab_per_tp
+        self.dtype = dtype
+        self.device = device
+        self.tp_rank = tp_rank
+        self.tp_size = tp_size
+        unknown_groups = set(buffer_groups) - LORA_BUFFER_GROUPS
+        if unknown_groups:
+            raise ValueError(f"Unknown LoRA buffer groups: {sorted(unknown_groups)}")
+        self.buffer_groups = frozenset(buffer_groups)
+        self.enable_attn = "attn" in self.buffer_groups
+        self.enable_mlp = "mlp" in self.buffer_groups
+        self.enable_head = "lm_head" in self.buffer_groups
+
+        self.qkv_A_buffers: list[torch.Tensor] = []
+        self.qkv_B_buffers: list[torch.Tensor] = []
+        self.o_A_buffers: list[torch.Tensor] = []
+        self.o_B_buffers: list[torch.Tensor] = []
+        self.gate_up_A_buffers: list[torch.Tensor] = []
+        self.gate_up_B_buffers: list[torch.Tensor] = []
+        self.down_A_buffers: list[torch.Tensor] = []
+        self.down_B_buffers: list[torch.Tensor] = []
+        # lm_head LoRA — single pair of buffers (not per-layer).
+        # A: (n_slots, r, hidden)  — replicated across TP ranks.
+        # B: (n_slots, vocab_per_tp, r) — column-parallel shard.
+        self.lm_head_A_buffer: torch.Tensor
+        self.lm_head_B_buffer: torch.Tensor
+
+        self.qkv_output_offset = torch.tensor(
+            [
+                0,
+                q_size_per_tp,
+                q_size_per_tp + kv_size_per_tp,
+                q_size_per_tp + 2 * kv_size_per_tp,
+            ],
+            dtype=torch.int32,
+            device=device,
+        )
+        self.max_qkv_out_dim = max(q_size_per_tp, kv_size_per_tp)
+
+        self.o_slice_offsets = torch.tensor(
+            [0, hidden_size], dtype=torch.int32, device=device
+        )
+        self.gate_up_slice_offsets = torch.tensor(
+            [0, intermediate_per_tp, 2 * intermediate_per_tp],
+            dtype=torch.int32,
+            device=device,
+        )
+        self.down_slice_offsets = torch.tensor(
+            [0, hidden_size], dtype=torch.int32, device=device
+        )
+
+        self._alloc()
+
+    def _alloc(self) -> None:
+        r = self.max_lora_rank
+        h = self.hidden_size
+        q = self.q_size_per_tp
+        kv = self.kv_size_per_tp
+        o_in = self.o_in_per_tp
+        i = self.intermediate_per_tp
+        v = self.vocab_per_tp
+        n = self.n_slots
+
+        for _ in range(self.n_layers):
+            if self.enable_attn:
+                self.qkv_A_buffers.append(
+                    torch.zeros((n, 3 * r, h), dtype=self.dtype, device=self.device)
+                )
+                self.qkv_B_buffers.append(
+                    torch.zeros(
+                        (n, q + 2 * kv, r), dtype=self.dtype, device=self.device
+                    )
+                )
+                self.o_A_buffers.append(
+                    torch.zeros((n, r, o_in), dtype=self.dtype, device=self.device)
+                )
+                self.o_B_buffers.append(
+                    torch.zeros((n, h, r), dtype=self.dtype, device=self.device)
+                )
+            if self.enable_mlp:
+                self.gate_up_A_buffers.append(
+                    torch.zeros((n, 2 * r, h), dtype=self.dtype, device=self.device)
+                )
+                self.gate_up_B_buffers.append(
+                    torch.zeros((n, 2 * i, r), dtype=self.dtype, device=self.device)
+                )
+                self.down_A_buffers.append(
+                    torch.zeros((n, r, i), dtype=self.dtype, device=self.device)
+                )
+                self.down_B_buffers.append(
+                    torch.zeros((n, h, r), dtype=self.dtype, device=self.device)
+                )
+        if self.enable_head:
+            self.lm_head_A_buffer = torch.zeros(
+                (n, r, h), dtype=self.dtype, device=self.device
+            )
+            self.lm_head_B_buffer = torch.zeros(
+                (n, v, r), dtype=self.dtype, device=self.device
+            )
+
+    def load_adapter_to_slot(
+        self,
+        cpu_weights: AdapterWeights,
+        slot: int,
+        rank: int,
+    ) -> None:
+        for layer_id, modules in cpu_weights.items():
+            if layer_id == LORA_HEAD_LAYER_ID:
+                if PEFT_HEAD_MODULE in modules:
+                    self._load_lm_head_to_slot(modules[PEFT_HEAD_MODULE], slot, rank)
+                continue
+            for mod, (lora_A_full, lora_B_full) in modules.items():
+                if mod.startswith("experts."):
+                    continue
+                self._check_module_enabled(mod)
+                lora_A_shard_cpu, lora_B_shard_cpu = self.shard_weights(
+                    mod, lora_A_full, lora_B_full
+                )
+                r = min(lora_A_shard_cpu.shape[0], rank)
+                lora_A_shard = lora_A_shard_cpu[:r].to(
+                    device=self.device,
+                    dtype=self.dtype,
+                    non_blocking=True,
+                )
+                lora_B_shard = lora_B_shard_cpu[:, :r].to(
+                    device=self.device,
+                    dtype=self.dtype,
+                    non_blocking=True,
+                )
+
+                if mod in ("q_proj", "k_proj", "v_proj"):
+                    qkv_idx = ("q_proj", "k_proj", "v_proj").index(mod)
+                    rank_off = qkv_idx * r
+                    out_off, out_size = self.qkv_b_slice(mod)
+                    self.qkv_A_buffers[layer_id][
+                        slot, rank_off : rank_off + r, :
+                    ].copy_(lora_A_shard, non_blocking=True)
+                    self.qkv_B_buffers[layer_id][
+                        slot, out_off : out_off + out_size, :r
+                    ].copy_(lora_B_shard, non_blocking=True)
+                elif mod == "o_proj":
+                    self.o_A_buffers[layer_id][slot, :r, :].copy_(
+                        lora_A_shard, non_blocking=True
+                    )
+                    self.o_B_buffers[layer_id][slot, :, :r].copy_(
+                        lora_B_shard, non_blocking=True
+                    )
+                elif mod in ("gate_proj", "up_proj"):
+                    gate_up_idx = 0 if mod == "gate_proj" else 1
+                    rank_off = gate_up_idx * r
+                    out_off = gate_up_idx * self.intermediate_per_tp
+                    self.gate_up_A_buffers[layer_id][
+                        slot, rank_off : rank_off + r, :
+                    ].copy_(lora_A_shard, non_blocking=True)
+                    self.gate_up_B_buffers[layer_id][
+                        slot, out_off : out_off + self.intermediate_per_tp, :r
+                    ].copy_(lora_B_shard, non_blocking=True)
+                else:
+                    self.down_A_buffers[layer_id][slot, :r, :].copy_(
+                        lora_A_shard, non_blocking=True
+                    )
+                    self.down_B_buffers[layer_id][slot, :, :r].copy_(
+                        lora_B_shard, non_blocking=True
+                    )
+
+    def _load_lm_head_to_slot(
+        self,
+        ab: tuple[torch.Tensor, torch.Tensor],
+        slot: int,
+        rank: int,
+    ) -> None:
+        if not self.enable_head:
+            raise ValueError(
+                "Adapter targets lm_head, but LoRA buffer group 'head' is disabled."
+            )
+        lora_A_full, lora_B_full = ab
+        lora_A_cpu, lora_B_cpu = self.shard_weights(
+            PEFT_HEAD_MODULE, lora_A_full, lora_B_full
+        )
+        r = min(lora_A_cpu.shape[0], rank)
+        self.lm_head_A_buffer[slot, :r, :].copy_(
+            lora_A_cpu[:r].to(device=self.device, dtype=self.dtype, non_blocking=True),
+            non_blocking=True,
+        )
+        self.lm_head_B_buffer[slot, :, :r].copy_(
+            lora_B_cpu[:, :r].to(
+                device=self.device, dtype=self.dtype, non_blocking=True
+            ),
+            non_blocking=True,
+        )
+
+    def zero_slot(self, slot: int) -> None:
+        if self.enable_attn:
+            for layer_id in range(self.n_layers):
+                self.qkv_A_buffers[layer_id][slot].zero_()
+                self.qkv_B_buffers[layer_id][slot].zero_()
+                self.o_A_buffers[layer_id][slot].zero_()
+                self.o_B_buffers[layer_id][slot].zero_()
+        if self.enable_mlp:
+            for layer_id in range(self.n_layers):
+                self.gate_up_A_buffers[layer_id][slot].zero_()
+                self.gate_up_B_buffers[layer_id][slot].zero_()
+                self.down_A_buffers[layer_id][slot].zero_()
+                self.down_B_buffers[layer_id][slot].zero_()
+        if self.enable_head:
+            self.lm_head_A_buffer[slot].zero_()
+            self.lm_head_B_buffer[slot].zero_()
+
+    def _check_module_enabled(self, module: str) -> None:
+        if module in ("q_proj", "k_proj", "v_proj", "o_proj"):
+            if not self.enable_attn:
+                raise ValueError(
+                    f"Adapter targets {module}, but LoRA buffer group 'attn' "
+                    "is disabled."
+                )
+            return
+        if module in ("gate_proj", "up_proj", "down_proj"):
+            if not self.enable_mlp:
+                raise ValueError(
+                    f"Adapter targets {module}, but LoRA buffer group 'mlp' "
+                    "is disabled."
+                )
+            return
+        if module == PEFT_HEAD_MODULE:
+            if not self.enable_head:
+                raise ValueError(
+                    "Adapter targets lm_head, but LoRA buffer group 'head' "
+                    "is disabled."
+                )
+            return
+        raise ValueError(f"Unsupported dense LoRA module: {module}")
+
+    def qkv_b_slice(self, module: str) -> tuple[int, int]:
+        """Return ``(offset, size)`` of a projection inside fused QKV B."""
+        if module == "q_proj":
+            return 0, self.q_size_per_tp
+        if module == "k_proj":
+            return self.q_size_per_tp, self.kv_size_per_tp
+        return self.q_size_per_tp + self.kv_size_per_tp, self.kv_size_per_tp
+
+    def shard_weights(
+        self,
+        module: str,
+        lora_A: torch.Tensor,
+        lora_B: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size == 1:
+            return lora_A, lora_B
+        # Column-parallel (attn q/k/v, MLP gate/up, lm_head): shard B along output dim.
+        if module in (
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "gate_proj",
+            "up_proj",
+            PEFT_HEAD_MODULE,
+        ):
+            out_total = lora_B.shape[0]
+            out_per = out_total // self.tp_size
+            return (
+                lora_A,
+                lora_B[self.tp_rank * out_per : (self.tp_rank + 1) * out_per],
+            )
+        # Row-parallel (attn o_proj, MLP down_proj): shard A along input dim.
+        in_total = lora_A.shape[1]
+        in_per = in_total // self.tp_size
+        return (
+            lora_A[:, self.tp_rank * in_per : (self.tp_rank + 1) * in_per],
+            lora_B,
+        )
diff --git a/python/tokenspeed/runtime/lora/lora_cache.py b/python/tokenspeed/runtime/lora/lora_cache.py
new file mode 100644
index 000000000..185ca791b
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/lora_cache.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Tier-2 CPU LoRA adapter cache with async disk prefetch."""
+
+from __future__ import annotations
+
+import threading
+from collections import OrderedDict
+from collections.abc import Callable
+from concurrent.futures import Future, ThreadPoolExecutor
+
+import torch
+
+from tokenspeed.runtime.lora.adapter_io import AdapterWeights, load_adapter_weights
+from tokenspeed.runtime.utils import get_colorful_logger
+
+logger = get_colorful_logger(__name__)
+
+
+class LoraCpuCache:
+    def __init__(
+        self,
+        *,
+        capacity: int,
+        is_gpu_resident: Callable[[str], bool],
+    ) -> None:
+        self.capacity = capacity
+        self.is_gpu_resident = is_gpu_resident
+        self.cache: dict[str, AdapterWeights] = {}
+        self.lru: OrderedDict[str, None] = OrderedDict()
+        self.adapter_paths: dict[str, str] = {}
+        self.loader_executor = ThreadPoolExecutor(
+            max_workers=2, thread_name_prefix="lora-loader"
+        )
+        self.lock = threading.Lock()
+        self.pending_loads: dict[str, Future] = {}
+
+    def set_path(self, name: str, adapter_path: str) -> None:
+        self.adapter_paths[name] = adapter_path
+
+    def remove(self, name: str) -> None:
+        self.evict(name)
+        self.adapter_paths.pop(name, None)
+        with self.lock:
+            self.pending_loads.pop(name, None)
+
+    def prefetch(self, name: str) -> None:
+        """Best-effort async warm of the CPU pool for *name*."""
+        with self.lock:
+            if name in self.cache:
+                self.lru.move_to_end(name)
+                return
+            if name in self.pending_loads:
+                return
+            adapter_path = self.adapter_paths.get(name)
+            if adapter_path is None:
+                return
+            fut = self.loader_executor.submit(
+                self._async_load_weights, name, adapter_path
+            )
+            self.pending_loads[name] = fut
+
+    def ensure(
+        self,
+        name: str,
+        weights: AdapterWeights | None = None,
+    ) -> None:
+        """Synchronously ensure *name* is CPU-resident."""
+        with self.lock:
+            if name in self.cache:
+                self.lru.move_to_end(name)
+                return
+            pending = self.pending_loads.get(name)
+
+        if pending is not None:
+            pending.result()
+            with self.lock:
+                if name in self.cache:
+                    self.lru.move_to_end(name)
+                    return
+
+        if weights is None:
+            adapter_path = self.adapter_paths.get(name)
+            if adapter_path is None:
+                raise KeyError(f"Adapter '{name}' has no recorded disk path.")
+            weights = load_adapter_weights(adapter_path)
+
+        with self.lock:
+            if name in self.cache:
+                self.lru.move_to_end(name)
+                return
+            self._install_locked(name, weights)
+
+    def evict(self, name: str) -> None:
+        with self.lock:
+            self._evict_locked(name)
+
+    def _async_load_weights(self, name: str, adapter_path: str) -> None:
+        try:
+            weights = load_adapter_weights(adapter_path)
+        except Exception:
+            logger.exception("Async LoRA load failed for '%s'", name)
+            with self.lock:
+                self.pending_loads.pop(name, None)
+            return
+        with self.lock:
+            try:
+                if name not in self.cache:
+                    self._install_locked(name, weights)
+            finally:
+                self.pending_loads.pop(name, None)
+
+    def _install_locked(self, name: str, weights: AdapterWeights) -> None:
+        while len(self.cache) >= self.capacity:
+            evicted = False
+            # Prefer evicting non-GPU-resident entries first: they cost a disk
+            # read to bring back, while GPU-resident ones cost nothing until
+            # their GPU slot is also evicted.
+            for stage in ("non_gpu", "gpu_resident"):
+                for candidate in list(self.lru.keys()):
+                    if candidate == name:
+                        continue
+                    is_gpu = self.is_gpu_resident(candidate)
+                    if stage == "non_gpu" and is_gpu:
+                        continue
+                    self._evict_locked(candidate)
+                    evicted = True
+                    break
+                if evicted:
+                    break
+            if not evicted:
+                raise RuntimeError(
+                    f"CPU LoRA pool is full ({len(self.cache)}/{self.capacity}) "
+                    "and no evictable entry was found. "
+                    f"cpu_lru={list(self.lru.keys())}. "
+                    "Increase max_loras_cpu."
+                )
+        self.cache[name] = self._pin_weights(weights)
+        self.lru[name] = None
+
+    def _evict_locked(self, name: str) -> None:
+        if name in self.cache:
+            del self.cache[name]
+            self.lru.pop(name, None)
+            logger.debug(
+                "Evicted '%s' from CPU pool (now %d/%d)",
+                name,
+                len(self.cache),
+                self.capacity,
+            )
+
+    def _pin_weights(self, weights: AdapterWeights) -> AdapterWeights:
+        return {
+            layer_id: {
+                module: (
+                    self._pin_tensor(lora_A),
+                    self._pin_tensor(lora_B),
+                )
+                for module, (lora_A, lora_B) in modules.items()
+            }
+            for layer_id, modules in weights.items()
+        }
+
+    @staticmethod
+    def _pin_tensor(tensor: torch.Tensor) -> torch.Tensor:
+        if tensor.device.type != "cpu" or tensor.is_pinned():
+            return tensor
+        try:
+            return tensor.pin_memory()
+        except RuntimeError:
+            return tensor
diff --git a/python/tokenspeed/runtime/lora/lora_config.py b/python/tokenspeed/runtime/lora/lora_config.py
new file mode 100644
index 000000000..7938b7d38
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/lora_config.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""LoRA adapter configuration and metadata."""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class LoraConfig:
+    """Configuration for a single LoRA adapter.
+
+    Loaded from the adapter's ``adapter_config.json`` (PEFT format).
+    """
+
+    # Identifier used at request time (e.g. "sql-expert")
+    name: str
+
+    # Filesystem path to the adapter directory or file
+    path: str
+
+    # LoRA rank (r)
+    r: int = 16
+
+    # LoRA alpha scaling factor
+    lora_alpha: int = 16
+
+    # Target modules (e.g. ["q_proj", "v_proj"])
+    target_modules: list[str] = field(default_factory=list)
+
+    # Base model name for compatibility checking
+    base_model_name_or_path: Optional[str] = None
+
+    @classmethod
+    def from_path(cls, name: str, path: str) -> "LoraConfig":
+        """Load LoraConfig from a PEFT adapter directory."""
+        config_file = os.path.join(path, "adapter_config.json")
+        if not os.path.exists(config_file):
+            raise FileNotFoundError(
+                f"adapter_config.json not found at {config_file}. "
+                "The path must point to a PEFT-format adapter directory."
+            )
+        with open(config_file) as f:
+            raw = json.load(f)
+
+        return cls(
+            name=name,
+            path=path,
+            r=raw.get("r", 16),
+            lora_alpha=raw.get("lora_alpha", 16),
+            target_modules=raw.get("target_modules") or [],
+            base_model_name_or_path=raw.get("base_model_name_or_path"),
+        )
+
+    @property
+    def scaling(self) -> float:
+        return self.lora_alpha / self.r if self.r > 0 else 1.0
diff --git a/python/tokenspeed/runtime/lora/lora_manager.py b/python/tokenspeed/runtime/lora/lora_manager.py
new file mode 100644
index 000000000..00992f172
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/lora_manager.py
@@ -0,0 +1,1009 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""LoRA adapter weight manager (segment-grouped Triton path).
+
+Adapted from sglang/Punica's S-LoRA design.
+
+Memory layout
+-------------
+For each layer the manager owns:
+
+* ``qkv_A_buffers[layer]``: ``(n_slots, 3 * max_rank, hidden)`` — fused
+  q_proj/k_proj/v_proj A matrices, stack-major (q first, then k, then v).
+* ``qkv_B_buffers[layer]``: ``(n_slots, q_per_tp + 2 * kv_per_tp, max_rank)``
+  — fused output-side, ``[q_per_tp | kv_per_tp | kv_per_tp]`` along dim 1.
+* ``o_A_buffers[layer]``:   ``(n_slots, max_rank, in_per_tp)`` — row-parallel
+  A, sharded along input dim.
+* ``o_B_buffers[layer]``:   ``(n_slots, hidden, max_rank)`` — full B.
+
+No-LoRA requests use ``NO_LORA_SLOT`` (-1), matching vLLM's convention.
+Real adapters occupy slots ``0 .. max_loras - 1``.
+
+Tensor parallelism
+------------------
+* QKV is column-parallel: A is full, B is sharded along output dim
+  (``q_per_tp + 2 * kv_per_tp``).  No collective inside the LoRA path.
+* O is row-parallel: A is sharded along input dim, B is full.  The host
+  module (qwen3 ``o_proj``) runs with ``reduce_results=False`` and has its
+  partial sum all-reduced downstream by ``post_attention_layernorm``; the
+  LoRA delta rides that same reduction (full ``B @ lora_a`` is added to the
+  partial output and the downstream reduce sums it ``tp_size`` times — see
+  ``apply_o_lora`` for the resulting numerical caveat).
+"""
+
+from __future__ import annotations
+
+import os
+from collections import OrderedDict
+
+import torch
+from tokenspeed_kernel.ops.lora.triton import (
+    lora_expand_fwd,
+    lora_expand_grouped_v2_fwd,
+    lora_expand_prefill_fwd,
+    lora_gate_up_expand_fwd,
+    lora_qkv_expand_fwd,
+    lora_shrink_fwd,
+    lora_shrink_prefill_fwd,
+)
+
+from tokenspeed.runtime.lora.adapter_io import (
+    LORA_HEAD_LAYER_ID,
+    PEFT_HEAD_MODULE,
+    PEFT_MODULES,
+    read_adapter_scaling,
+    resolve_adapter_weight_path,
+)
+from tokenspeed.runtime.lora.lora_batch import (
+    NO_LORA_SLOT,
+    LoraBatchInfo,
+    build_decode_lora_groups,
+)
+from tokenspeed.runtime.lora.lora_buffers import LORA_BUFFER_GROUPS, LoraWeightBuffers
+from tokenspeed.runtime.lora.lora_cache import LoraCpuCache
+from tokenspeed.runtime.lora.moe_lora import MoeLoraBuffers, MoeLoraContext
+from tokenspeed.runtime.utils import get_colorful_logger
+
+# Segments longer than this use the prefill (chunked-SGMV) expand kernel,
+# which specialises strides and loop counts at compile time.  Shorter
+# segments (decode) use the decode-tuned kernels.  Threshold chosen from
+# benchmarks: chunked-SGMV wins above ~32 tokens/segment at rank ≥ 64.
+_CHUNKED_THRESHOLD = 32
+
+# With max_group_size-based grid, the kernel degenerates to the segmented
+# layout when every group has 1 token (n_unique = n), so no threshold is
+# needed for correctness.  Keep a minimum of 1 (always use grpv2).
+_TRITON_GROUPED_DECODE_MIN_GROUP_SIZE = 1
+
+logger = get_colorful_logger(__name__)
+
+
+# ── Manager ─────────────────────────────────────────────────────────────────
+
+
+def _use_triton_grouped_decode(bi: LoraBatchInfo) -> bool:
+    """Return whether grouped Triton decode expand should beat basic decode."""
+    return (
+        bi.single_lora_slot == NO_LORA_SLOT
+        and bi.num_groups > 0
+        and bi.bs // bi.num_groups >= _TRITON_GROUPED_DECODE_MIN_GROUP_SIZE
+    )
+
+
+class LoraManager:
+    """Owns GPU-resident LoRA weights and dispatches the segmented-GEMM path.
+
+    Public surface (used by the model + executor):
+
+    * :meth:`load_adapter` / :meth:`unload_adapter` — adapter lifecycle.
+    * :attr:`batch_info` — persistent :class:`LoraBatchInfo` whose tensor
+      pointers are stable across forward steps (so they can be baked into
+      the captured CUDA graph).
+    * :meth:`prepare_loras` — fill the persistent batch_info for one step.
+    * :meth:`apply_qkv_lora` / :meth:`apply_o_lora` — Triton-backed deltas.
+    """
+
+    def __init__(
+        self,
+        model_config,
+        max_loras: int,
+        max_lora_rank: int,
+        max_num_tokens: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        tp_rank: int = 0,
+        tp_size: int = 1,
+        tp_group=None,
+        max_loras_cpu: int | None = None,
+        lora_buffer_groups: set[str] | frozenset[str] = LORA_BUFFER_GROUPS,
+        lora_moe_compressed_shared_outer: bool = False,
+    ) -> None:
+        self.max_loras = max_loras
+        self.max_lora_rank = max_lora_rank
+        self.max_num_tokens = max_num_tokens
+        self.dtype = dtype
+        self.device = device
+        self.tp_rank = tp_rank
+        self.tp_size = tp_size
+        self.tp_group = tp_group
+        unknown_groups = set(lora_buffer_groups) - LORA_BUFFER_GROUPS
+        if unknown_groups:
+            raise ValueError(f"Unknown LoRA buffer groups: {sorted(unknown_groups)}")
+        self.lora_buffer_groups = frozenset(lora_buffer_groups)
+        self.enable_attn_lora = "attn" in self.lora_buffer_groups
+        self.enable_mlp_lora = "mlp" in self.lora_buffer_groups
+        self.enable_moe_lora = "moe" in self.lora_buffer_groups
+        self.enable_head_lora = "lm_head" in self.lora_buffer_groups
+        self.lora_moe_compressed_shared_outer = lora_moe_compressed_shared_outer
+        # Tier-2 CPU cache cap. Defaults to 4× the GPU pool so adapter
+        # spill-out to disk is rare in steady state.
+        self.max_loras_cpu: int = (
+            max_loras_cpu if max_loras_cpu is not None else 4 * max_loras
+        )
+        if self.max_loras_cpu < max_loras:
+            raise ValueError(
+                f"max_loras_cpu ({self.max_loras_cpu}) must be ≥ "
+                f"max_loras ({max_loras}); GPU-resident adapters live in "
+                "the CPU pool too."
+            )
+
+        self.n_layers: int = model_config.num_hidden_layers
+        hidden: int = model_config.hidden_size
+        n_heads: int = model_config.num_attention_heads
+        n_kv: int = model_config.num_key_value_heads
+        # Use the model's explicit head_dim when available (some architectures like
+        # Qwen3.5 decouple head_dim from hidden/n_heads, e.g. hidden=2048, n_heads=16
+        # but head_dim=256).
+        head_dim: int = getattr(model_config, "head_dim", None) or (hidden // n_heads)
+        # attn_output_gate doubles the Q projection size (2× heads in qkv_proj).
+        # The o_proj input is n_heads × head_dim (no doubling).
+        q_multiplier: int = 2 if getattr(model_config, "attn_output_gate", False) else 1
+        q_size_base: int = (n_heads // tp_size) * head_dim
+
+        self.q_size_per_tp: int = q_multiplier * q_size_base
+        self.kv_size_per_tp: int = max(1, n_kv // tp_size) * head_dim
+        self.o_in_per_tp: int = q_size_base  # o_proj reads un-gated heads
+        self.hidden_size: int = hidden
+
+        from tokenspeed.runtime.layers.vocab_parallel_embedding import pad_vocab_size
+
+        vocab_size: int = model_config.vocab_size
+        self.vocab_per_tp: int = pad_vocab_size(vocab_size) // tp_size
+
+        # Qwen3MLP is TP-aware: ``gate_up_proj`` is column-parallel (each rank
+        # holds ``intermediate_size // tp_size`` output cols) and ``down_proj``
+        # is row-parallel (each rank holds ``intermediate_size // tp_size``
+        # input cols).  The LoRA deltas ride the partial outputs of those base
+        # linears, and the existing downstream all-reduce sums per-rank
+        # partials — see ``apply_down_lora``/``apply_gate_up_lora``.
+        self.intermediate_size: int = getattr(
+            model_config, "intermediate_size", 4 * hidden
+        )
+        self.intermediate_per_tp: int = self.intermediate_size // self.tp_size
+        self.moe_intermediate_size: int = getattr(
+            model_config, "moe_intermediate_size", self.intermediate_size
+        )
+        self.moe_intermediate_per_tp: int = self.moe_intermediate_size // self.tp_size
+        self.num_experts: int = int(
+            getattr(
+                model_config,
+                "num_experts",
+                getattr(
+                    model_config,
+                    "num_local_experts",
+                    getattr(model_config, "n_routed_experts", 0),
+                ),
+            )
+        )
+
+        # CPU-side flag: True when at least one segment in the current
+        # batch_info uses a real adapter. CudaGraphWrapper
+        # reads this to pick the with-LoRA vs no-LoRA captured graph.
+        self.has_active_lora: bool = False
+
+        # ── Tier 1: GPU pool ─────────────────────────────────────────────
+        # Real adapters take slots 0 .. max_loras - 1. Base/no-LoRA requests
+        # use NO_LORA_SLOT in batch metadata and do not consume a GPU slot.
+        self._n_slots: int = max_loras
+        self._slot_to_name: list[str | None] = [None] * self._n_slots
+        self._name_to_slot: dict[str, int] = {}
+        self._gpu_lru: OrderedDict[str, None] = OrderedDict()  # alias of _lru
+
+        # ── Tier 2: pinned CPU pool ─────────────────────────────────────
+        # ``_cpu_cache[name]`` holds parsed weights in pinned host memory.
+        # ``_cpu_lru`` tracks LRU order for CPU eviction back to disk.  An
+        # adapter is "CPU-resident" iff its name is in ``_cpu_cache``.
+        # GPU-resident adapters are also kept in ``_cpu_cache`` (we pay
+        # the host RAM cost once; reload to GPU is cheap and re-evicting
+        # GPU then re-promoting only needs an H2D copy, not a disk read).
+        self._name_to_id: dict[str, int] = {}
+        self._id_to_name: dict[int, str] = {}
+        self._next_id: int = 1
+
+        # ── Tier 2/3: CPU pinned pool + disk source of truth ─────────────
+        self._cpu_store = LoraCpuCache(
+            capacity=self.max_loras_cpu,
+            is_gpu_resident=lambda name: name in self._name_to_slot,
+        )
+        # Compatibility aliases for existing tests/debug tooling.
+        self._cpu_cache = self._cpu_store.cache
+        self._cpu_lru = self._cpu_store.lru
+        self._adapter_paths = self._cpu_store.adapter_paths
+        self._pending_loads = self._cpu_store.pending_loads
+
+        # Per-slot rank + scaling for real adapter slots only.
+        self._lora_ranks: torch.Tensor = torch.zeros(
+            self._n_slots, dtype=torch.int32, device=device
+        )
+        self._slot_ranks: list[int] = [0] * self._n_slots
+        self._slot_scalings: list[float] = [0.0] * self._n_slots
+        self._scalings: torch.Tensor = torch.zeros(
+            self._n_slots, dtype=torch.float32, device=device
+        )
+
+        # ── Persistent batch_info ──────────────────────────────────────────
+        # All tensors are sized for the worst case so their pointers are
+        # stable across forward steps; per-step updates are in-place.
+        # ``num_segments`` may equal ``bs`` (one segment per token in the
+        # current path — no sort-by-adapter yet).
+        self._batch_info = LoraBatchInfo(
+            bs=0,
+            num_segments=0,
+            max_len=0,
+            seg_lens=torch.zeros(max_num_tokens, dtype=torch.int32, device=device),
+            seg_indptr=torch.zeros(
+                max_num_tokens + 1, dtype=torch.int32, device=device
+            ),
+            weight_indices=torch.full(
+                (max_num_tokens,), NO_LORA_SLOT, dtype=torch.int32, device=device
+            ),
+            lora_ranks=self._lora_ranks,
+            scalings=self._scalings,
+            permutation=None,
+        )
+
+        # CPU staging buffers (pinned) for the per-step H2D copy.
+        self._seg_lens_cpu = torch.zeros(
+            max_num_tokens, dtype=torch.int32, pin_memory=True
+        )
+        self._weight_indices_cpu = torch.full(
+            (max_num_tokens,), NO_LORA_SLOT, dtype=torch.int32, pin_memory=True
+        )
+        # Adapter-group buffers for the decode grouped expand kernel.
+        # Computed on CPU in prepare_loras (no GPU sync) and transferred
+        # non-blocking.  Using stable GPU addresses so decode CUDA graphs
+        # can capture the pointers; num_groups on axis=1 changes per step
+        # so the graph grid must be re-evaluated outside the captured region.
+        _mg = self._n_slots  # upper bound: one group per loaded adapter
+        self._sort_order_cpu = torch.zeros(
+            max_num_tokens, dtype=torch.int64, pin_memory=True
+        )
+        self._group_slots_cpu = torch.zeros(_mg, dtype=torch.int32, pin_memory=True)
+        self._group_starts_cpu = torch.zeros(_mg, dtype=torch.int32, pin_memory=True)
+        self._group_sizes_cpu = torch.zeros(_mg, dtype=torch.int32, pin_memory=True)
+        self._sort_order_buf = torch.zeros(
+            max_num_tokens, dtype=torch.int64, device=device
+        )
+        self._group_slots_buf = torch.zeros(_mg, dtype=torch.int32, device=device)
+        self._group_starts_buf = torch.zeros(_mg, dtype=torch.int32, device=device)
+        self._group_sizes_buf = torch.zeros(_mg, dtype=torch.int32, device=device)
+
+        # ── GPU weight buffers ─────────────────────────────────────────────
+        # Attention:
+        #   qkv_A_buffers: (n_slots, 3 * max_rank, hidden) — stacked q/k/v A.
+        #   qkv_B_buffers: (n_slots, q_per_tp + 2 * kv_per_tp, max_rank).
+        #   o_A_buffers:   (n_slots, max_rank, o_in_per_tp).
+        #   o_B_buffers:   (n_slots, hidden, max_rank).
+        # MLP (TP-aware, mirrors qwen3 ``Qwen3MLP``):
+        #   gate_up_A_buffers: (n_slots, 2 * max_rank, hidden)             — A replicated.
+        #   gate_up_B_buffers: (n_slots, 2 * intermediate_per_tp, max_rank) — column-parallel.
+        #   down_A_buffers:    (n_slots, max_rank, intermediate_per_tp)    — row-parallel.
+        #   down_B_buffers:    (n_slots, hidden, max_rank)                 — B replicated.
+        self._weight_buffers = LoraWeightBuffers(
+            n_layers=self.n_layers,
+            n_slots=self._n_slots,
+            max_lora_rank=self.max_lora_rank,
+            hidden_size=self.hidden_size,
+            q_size_per_tp=self.q_size_per_tp,
+            kv_size_per_tp=self.kv_size_per_tp,
+            o_in_per_tp=self.o_in_per_tp,
+            intermediate_per_tp=self.intermediate_per_tp,
+            vocab_per_tp=self.vocab_per_tp,
+            dtype=self.dtype,
+            device=self.device,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            buffer_groups=self.lora_buffer_groups,
+        )
+        self.qkv_A_buffers = self._weight_buffers.qkv_A_buffers
+        self.qkv_B_buffers = self._weight_buffers.qkv_B_buffers
+        self.o_A_buffers = self._weight_buffers.o_A_buffers
+        self.o_B_buffers = self._weight_buffers.o_B_buffers
+        self.gate_up_A_buffers = self._weight_buffers.gate_up_A_buffers
+        self.gate_up_B_buffers = self._weight_buffers.gate_up_B_buffers
+        self.down_A_buffers = self._weight_buffers.down_A_buffers
+        self.down_B_buffers = self._weight_buffers.down_B_buffers
+        self.lm_head_A_buffer = (
+            self._weight_buffers.lm_head_A_buffer if self.enable_head_lora else None
+        )
+        self.lm_head_B_buffer = (
+            self._weight_buffers.lm_head_B_buffer if self.enable_head_lora else None
+        )
+        self._qkv_output_offset = self._weight_buffers.qkv_output_offset
+        self._max_qkv_out_dim = self._weight_buffers.max_qkv_out_dim
+        self._o_slice_offsets = self._weight_buffers.o_slice_offsets
+        self._gate_up_slice_offsets = self._weight_buffers.gate_up_slice_offsets
+        self._down_slice_offsets = self._weight_buffers.down_slice_offsets
+        self._moe_lora_buffers = MoeLoraBuffers(
+            n_layers=self.n_layers,
+            n_slots=self._n_slots,
+            max_lora_rank=self.max_lora_rank,
+            num_experts=self.num_experts,
+            hidden_size=self.hidden_size,
+            intermediate_per_tp=self.moe_intermediate_per_tp,
+            dtype=self.dtype,
+            device=self.device,
+            shard_weights=self._weight_buffers.shard_weights,
+            enabled=self.enable_moe_lora,
+            compressed_shared_outer=self.lora_moe_compressed_shared_outer,
+        )
+        # Compatibility alias for tests/debug tooling that inspected the old
+        # manager-owned storage directly.
+        self._moe_lora_weights = self._moe_lora_buffers.weights_by_layer
+
+        logger.info(
+            "LoraManager initialized: max_loras=%d max_rank=%d "
+            "tp_rank=%d/%d device=%s dtype=%s buffer_groups=%s "
+            "moe_compressed_shared_outer=%s",
+            max_loras,
+            max_lora_rank,
+            tp_rank,
+            tp_size,
+            device,
+            dtype,
+            ",".join(sorted(self.lora_buffer_groups)),
+            self.lora_moe_compressed_shared_outer,
+        )
+
+    # ── Public API ──────────────────────────────────────────────────────────
+
+    @property
+    def batch_info(self) -> LoraBatchInfo:
+        return self._batch_info
+
+    @property
+    def moe_lora_context(self) -> MoeLoraContext:
+        return self._moe_lora_buffers.build_context(
+            batch_info=self._batch_info,
+            scalings=self._scalings,
+            has_active_lora=self.has_active_lora,
+        )
+
+    def load_adapter(self, name: str, path: str) -> int:
+        """Register a PEFT adapter from *path* and warm the CPU pool.
+
+        ``path`` is recorded as the adapter's durable disk path; it must
+        remain accessible for the lifetime of the manager because the CPU
+        pool may evict the adapter back to disk under memory pressure.
+
+        Returns the integer ``lora_id`` to use in subsequent
+        ``prepare_loras`` calls.
+        """
+        if name in self._name_to_id:
+            logger.warning("Adapter '%s' is already loaded; re-loading.", name)
+            self._evict_by_name(name)
+            self._evict_from_cpu(name)
+
+        # Resolve the durable disk path now (used by future re-reads when
+        # the CPU pool evicts these weights).
+        adapter_path = path
+        weight_path = resolve_adapter_weight_path(adapter_path)
+        if not os.path.exists(weight_path):
+            raise FileNotFoundError(
+                f"Adapter weights not found at {weight_path!r} or {path!r}"
+            )
+
+        lora_id = self._next_id
+        self._next_id += 1
+        self._name_to_id[name] = lora_id
+        self._id_to_name[lora_id] = name
+        self._cpu_store.set_path(name, adapter_path)
+
+        # Warm the CPU pool — bounded by ``max_loras_cpu``, may evict
+        # other CPU-resident adapters back to disk.
+        self._cpu_store.ensure(name)
+
+        logger.info(
+            "Registered adapter '%s' (lora_id=%d) from %s; CPU pool: %d/%d",
+            name,
+            lora_id,
+            path,
+            len(self._cpu_cache),
+            self.max_loras_cpu,
+        )
+        return lora_id
+
+    def unload_adapter(self, name: str) -> None:
+        if name not in self._name_to_id:
+            raise KeyError(f"Adapter '{name}' is not loaded.")
+        self._evict_by_name(name)
+        self._cpu_store.remove(name)
+        lora_id = self._name_to_id.pop(name)
+        del self._id_to_name[lora_id]
+        logger.info("Unloaded adapter '%s'", name)
+
+    def get_id(self, name: str) -> int | None:
+        return self._name_to_id.get(name)
+
+    def prepare_loras(
+        self,
+        lora_ids: list[int],
+        per_request_token_counts: list[int] | int = 1,
+    ) -> int:
+        """Fill :attr:`batch_info` for the upcoming forward.
+
+        Each request becomes one segment.  Returns the total number of
+        tokens written.  All updates are in place on the persistent
+        batch_info tensors so the captured CUDA graph keeps replaying
+        against the same pointers.
+        """
+        bs = len(lora_ids)
+        # Resolve names → slots; LRU bookkeeping.
+        per_request_slots: list[int] = []
+        for lid in lora_ids:
+            if lid == 0:
+                per_request_slots.append(NO_LORA_SLOT)
+                continue
+            name = self._id_to_name.get(lid)
+            if name is None:
+                logger.warning("Unknown lora_id %d; treating as base model.", lid)
+                per_request_slots.append(NO_LORA_SLOT)
+                continue
+            slot = self._ensure_in_gpu(name)
+            per_request_slots.append(slot)
+            self._gpu_lru.move_to_end(name)
+
+        # Per-request seg_lens.
+        if isinstance(per_request_token_counts, int):
+            seg_lens_list = [per_request_token_counts] * bs
+        else:
+            if len(per_request_token_counts) != bs:
+                raise ValueError(
+                    "per_request_token_counts length must match lora_ids length"
+                )
+            seg_lens_list = list(per_request_token_counts)
+
+        total_tokens = sum(seg_lens_list)
+        if total_tokens > self.max_num_tokens:
+            raise ValueError(
+                f"LoRA batch_info overflow: {total_tokens} > {self.max_num_tokens}"
+            )
+        max_len = max(seg_lens_list) if seg_lens_list else 0
+
+        bi = self._batch_info
+
+        # For decode batches (max_len == 1): compute adapter groups on CPU
+        # so the grouped expand kernel can batch same-adapter tokens into a
+        # full BLOCK_S=16 GEMM tile, recovering tensor-core efficiency.
+        if max_len == 1 and bs > 1:
+            sort_order, group_slots, group_starts, group_sizes = (
+                build_decode_lora_groups(per_request_slots)
+            )
+            ng = len(group_slots)
+            active_count = len(sort_order)
+            self._sort_order_cpu[:active_count] = torch.as_tensor(
+                sort_order, dtype=torch.int64
+            )
+            self._group_slots_cpu[:ng] = torch.as_tensor(group_slots, dtype=torch.int32)
+            self._group_starts_cpu[:ng] = torch.as_tensor(
+                group_starts, dtype=torch.int32
+            )
+            self._group_sizes_cpu[:ng] = torch.as_tensor(group_sizes, dtype=torch.int32)
+            bi.sort_order = self._sort_order_buf
+            bi.group_slots = self._group_slots_buf
+            bi.group_starts = self._group_starts_buf
+            bi.group_sizes = self._group_sizes_buf
+            bi.sort_order[:active_count].copy_(
+                self._sort_order_cpu[:active_count], non_blocking=True
+            )
+            bi.group_slots[:ng].copy_(self._group_slots_cpu[:ng], non_blocking=True)
+            bi.group_starts[:ng].copy_(self._group_starts_cpu[:ng], non_blocking=True)
+            bi.group_sizes[:ng].copy_(self._group_sizes_cpu[:ng], non_blocking=True)
+            bi.num_groups = ng
+            bi.max_group_size = max(group_sizes) if group_sizes else 0
+        else:
+            bi.sort_order = bi.group_slots = bi.group_starts = bi.group_sizes = None
+            bi.num_groups = 0
+            bi.max_group_size = 0
+
+        first_slot = per_request_slots[0] if per_request_slots else NO_LORA_SLOT
+        bi.single_lora_slot = (
+            first_slot
+            if first_slot != NO_LORA_SLOT
+            and all(slot == first_slot for slot in per_request_slots)
+            else NO_LORA_SLOT
+        )
+        bi.single_lora_rank = (
+            self._slot_ranks[bi.single_lora_slot]
+            if bi.single_lora_slot != NO_LORA_SLOT
+            else 0
+        )
+        bi.multi_lora_start_slot = NO_LORA_SLOT
+        bi.multi_lora_count = 0
+        bi.multi_lora_segment_len = 0
+        bi.multi_lora_rank = 0
+        if (
+            bs > 1
+            and bi.single_lora_slot == NO_LORA_SLOT
+            and max_len > _CHUNKED_THRESHOLD
+            and len(set(seg_lens_list)) == 1
+            and all(slot != NO_LORA_SLOT for slot in per_request_slots)
+        ):
+            start_slot = per_request_slots[0]
+            consecutive_slots = all(
+                slot == start_slot + i for i, slot in enumerate(per_request_slots)
+            )
+            rank = self._slot_ranks[start_slot]
+            scaling = self._slot_scalings[start_slot]
+            same_rank_and_scaling = all(
+                self._slot_ranks[slot] == rank and self._slot_scalings[slot] == scaling
+                for slot in per_request_slots
+            )
+            if consecutive_slots and rank > 0 and same_rank_and_scaling:
+                bi.multi_lora_start_slot = start_slot
+                bi.multi_lora_count = bs
+                bi.multi_lora_segment_len = seg_lens_list[0]
+                bi.multi_lora_rank = rank
+
+        # Stage on CPU then a single non-blocking H2D.
+        self._seg_lens_cpu[:bs] = torch.as_tensor(seg_lens_list, dtype=torch.int32)
+        self._weight_indices_cpu[:bs] = torch.as_tensor(
+            per_request_slots, dtype=torch.int32
+        )
+
+        self.has_active_lora = any(s != NO_LORA_SLOT for s in per_request_slots)
+
+        bi = self._batch_info
+        bi.bs = bs
+        bi.num_segments = bs
+        bi.max_len = max_len
+
+        # Skip the H2D copies and on-device cumsum when no adapter is active:
+        # the no-LoRA CUDA graph omits all LoRA kernels and never reads
+        # weight_indices / seg_lens / seg_indptr, so updating them is wasted work.
+        if self.has_active_lora:
+            bi.seg_lens[:bs].copy_(self._seg_lens_cpu[:bs], non_blocking=True)
+            bi.weight_indices[:bs].copy_(
+                self._weight_indices_cpu[:bs], non_blocking=True
+            )
+            bi.seg_indptr[0] = 0
+            torch.cumsum(bi.seg_lens[:bs], dim=0, out=bi.seg_indptr[1 : bs + 1])
+
+        return total_tokens
+
+    def apply_qkv_lora(
+        self,
+        hidden_states: torch.Tensor,
+        qkv: torch.Tensor,
+        layer_id: int,
+    ) -> torch.Tensor:
+        """Fused QKV LoRA delta: ``qkv += B @ A @ x * scaling``.
+
+        ``hidden_states``: ``(s, hidden)`` (full input).
+        ``qkv``:           ``(s, q_per_tp + 2 * kv_per_tp)`` (output of qkv_proj
+        on this rank).  Updated in place via the kernel's fused-add.
+        """
+        if hidden_states.shape[0] == 0:
+            return qkv
+        if not self.enable_attn_lora:
+            return qkv
+        bi = self._batch_info
+        if bi.bs == 0 or not self.has_active_lora:
+            return qkv
+
+        A_buf = self.qkv_A_buffers[layer_id]
+        B_buf = self.qkv_B_buffers[layer_id]
+        # lora_a: (s, 3 * max_rank)
+        lora_a = (
+            lora_shrink_prefill_fwd(hidden_states, A_buf, bi, stack_num=3)
+            if bi.max_len > _CHUNKED_THRESHOLD
+            else lora_shrink_fwd(hidden_states, A_buf, bi, stack_num=3)
+        )
+        if bi.max_len > _CHUNKED_THRESHOLD:
+            lora_expand_prefill_fwd(
+                lora_a,
+                B_buf,
+                bi,
+                self._qkv_output_offset,
+                self._max_qkv_out_dim,
+                base_output=qkv,
+            )
+        else:
+            lora_qkv_expand_fwd(
+                lora_a,
+                B_buf,
+                bi,
+                self._qkv_output_offset,
+                self._max_qkv_out_dim,
+                base_output=qkv,
+            )
+        return qkv
+
+    def apply_o_lora(
+        self,
+        attn_output: torch.Tensor,
+        o_output: torch.Tensor,
+        layer_id: int,
+    ) -> torch.Tensor:
+        """Row-parallel O-projection LoRA delta.
+
+        ``attn_output``: ``(s, q_per_tp)`` per-rank attention output (input
+        to o_proj).
+        ``o_output``: ``(s, hidden)`` partial sum from the host o_proj
+        (``reduce_results=False`` on this codebase).  Updated in place.
+
+        Each rank computes ``B @ A_local @ x_local`` — a partial of shape
+        ``(s, hidden)``.  A is sharded along its input dim and B is
+        replicated, so the sum of partials over ranks equals
+        ``B @ A_full @ x_full``.  The host layer's downstream fused
+        all-reduce in ``post_attention_layernorm`` sums the base partial
+        and the LoRA partial together, producing the correct full output.
+        """
+        if attn_output.shape[0] == 0:
+            return o_output
+        if not self.enable_attn_lora:
+            return o_output
+        bi = self._batch_info
+        if bi.bs == 0 or not self.has_active_lora:
+            return o_output
+
+        A_buf = self.o_A_buffers[layer_id]
+        B_buf = self.o_B_buffers[layer_id]
+        # lora_a (partial per rank): (s, max_rank).  No internal all-reduce —
+        # the partial flows into B and the result rides the downstream sum.
+        lora_a = (
+            lora_shrink_prefill_fwd(attn_output, A_buf, bi, stack_num=1)
+            if bi.max_len > _CHUNKED_THRESHOLD
+            else lora_shrink_fwd(attn_output, A_buf, bi, stack_num=1)
+        )
+        if bi.max_len > _CHUNKED_THRESHOLD:
+            lora_expand_prefill_fwd(
+                lora_a,
+                B_buf,
+                bi,
+                self._o_slice_offsets,
+                self.hidden_size,
+                base_output=o_output,
+            )
+        elif _use_triton_grouped_decode(bi):
+            lora_expand_grouped_v2_fwd(lora_a, B_buf, bi, base_output=o_output)
+        else:
+            lora_expand_fwd(lora_a, B_buf, bi, base_output=o_output)
+        return o_output
+
+    def apply_gate_up_lora(
+        self,
+        hidden_states: torch.Tensor,
+        gate_up: torch.Tensor,
+        layer_id: int,
+    ) -> torch.Tensor:
+        """Fused gate/up LoRA delta: ``gate_up += B @ A @ x * scaling``.
+
+        ``hidden_states``: ``(s, hidden)``.
+        ``gate_up``:       ``(s, 2 * intermediate_per_tp)`` — output of the
+        column-parallel ``gate_up_proj`` (each rank holds its own output
+        shard).  Updated in place via the kernel's fused-add.
+        """
+        if hidden_states.shape[0] == 0:
+            return gate_up
+        if not self.enable_mlp_lora:
+            return gate_up
+        bi = self._batch_info
+        if bi.bs == 0 or not self.has_active_lora:
+            return gate_up
+
+        A_buf = self.gate_up_A_buffers[layer_id]
+        B_buf = self.gate_up_B_buffers[layer_id]
+        # lora_a: (s, 2 * max_rank) — gate's lora_a in [:, :r], up's in [:, r:].
+        lora_a = (
+            lora_shrink_prefill_fwd(hidden_states, A_buf, bi, stack_num=2)
+            if bi.max_len > _CHUNKED_THRESHOLD
+            else lora_shrink_fwd(hidden_states, A_buf, bi, stack_num=2)
+        )
+        if bi.max_len > _CHUNKED_THRESHOLD:
+            lora_expand_prefill_fwd(
+                lora_a,
+                B_buf,
+                bi,
+                self._gate_up_slice_offsets,
+                self.intermediate_per_tp,
+                base_output=gate_up,
+            )
+        else:
+            lora_gate_up_expand_fwd(
+                lora_a,
+                B_buf,
+                bi,
+                self.intermediate_per_tp,
+                base_output=gate_up,
+            )
+        return gate_up
+
+    def apply_down_lora(
+        self,
+        x: torch.Tensor,
+        down_output: torch.Tensor,
+        layer_id: int,
+    ) -> torch.Tensor:
+        """Down-projection LoRA delta (row-parallel under MLP TP).
+
+        ``x``:           ``(s, intermediate_per_tp)`` — input to the
+        row-parallel ``down_proj`` (this rank's input shard).
+        ``down_output``: ``(s, hidden)`` — partial output of ``down_proj``
+        before its all-reduce.  Updated in place.
+
+        Each rank's delta is ``B @ A_local @ x_local``: A is sharded along
+        the input dim and B is replicated, so summing per-rank deltas yields
+        the full ``B @ A_full @ x_full``.  The base linear runs with
+        ``reduce_results=False``; the downstream all-reduce that sums the
+        base partial also sums the LoRA partials.
+        """
+        if x.shape[0] == 0:
+            return down_output
+        if not self.enable_mlp_lora:
+            return down_output
+        bi = self._batch_info
+        if bi.bs == 0 or not self.has_active_lora:
+            return down_output
+
+        A_buf = self.down_A_buffers[layer_id]
+        B_buf = self.down_B_buffers[layer_id]
+        lora_a = (
+            lora_shrink_prefill_fwd(x, A_buf, bi, stack_num=1)
+            if bi.max_len > _CHUNKED_THRESHOLD
+            else lora_shrink_fwd(x, A_buf, bi, stack_num=1)
+        )
+        if bi.max_len > _CHUNKED_THRESHOLD:
+            lora_expand_prefill_fwd(
+                lora_a,
+                B_buf,
+                bi,
+                self._down_slice_offsets,
+                self.hidden_size,
+                base_output=down_output,
+            )
+        elif _use_triton_grouped_decode(bi):
+            lora_expand_grouped_v2_fwd(lora_a, B_buf, bi, base_output=down_output)
+        else:
+            lora_expand_fwd(lora_a, B_buf, bi, base_output=down_output)
+        return down_output
+
+    def apply_lm_head_lora(
+        self,
+        hidden_states: torch.Tensor,
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        """lm_head LoRA delta: ``logits += B @ A @ x * scaling``.
+
+        ``hidden_states``: ``(s, hidden)`` — one token per request (pruned).
+        ``logits``:        ``(s, vocab_per_tp)`` — pre-all-gather logits shard.
+        Applied before the TP all-gather so each rank contributes its vocab
+        shard correctly.
+
+        Note: when ``extend_return_logprob`` is True the caller may pass more
+        than ``bi.bs`` tokens.  In that case this method is a no-op because
+        the per-token slot mapping is not available here; sampling logits are
+        still correct for the last token of each request.
+        """
+        if hidden_states.shape[0] == 0:
+            return logits
+        if not self.enable_head_lora:
+            return logits
+        bi = self._batch_info
+        if bi.bs == 0 or not self.has_active_lora:
+            return logits
+        if hidden_states.shape[0] != bi.bs:
+            return logits
+
+        slots = bi.weight_indices[: bi.bs]  # (bs,)
+        valid = slots != NO_LORA_SLOT
+        if not valid.any():
+            return logits
+
+        # Fast path: all requests use the same adapter slot.
+        # Use plain matmul to avoid a gather of the B matrix (vocab_per_tp × rank
+        # bytes) for every request.  Guarded from CUDA graph capture because the
+        # Python branch is frozen at capture time — replaying with a different
+        # single_lora_slot would silently use stale weights.
+        if (
+            bi.single_lora_slot != NO_LORA_SLOT
+            and not torch.cuda.is_current_stream_capturing()
+        ):
+            slot = bi.single_lora_slot
+            scaling = self._scalings[slot].item()
+            A = self.lm_head_A_buffer[slot]  # (r, hidden)
+            B = self.lm_head_B_buffer[slot]  # (vocab_per_tp, r)
+            lora_a = hidden_states @ A.T  # (bs, r)
+            delta = lora_a @ B.T  # (bs, vocab_per_tp)
+            return logits + delta * scaling
+
+        valid_slots = slots.clamp(min=0)
+        # A: (bs, r, hidden),  B: (bs, vocab_per_tp, r)
+        A = self.lm_head_A_buffer[valid_slots]
+        B = self.lm_head_B_buffer[valid_slots]
+        # lora_a: (bs, r) = A @ hidden_states[..., None]
+        lora_a = torch.bmm(A, hidden_states.unsqueeze(-1)).squeeze(-1)
+        # delta: (bs, vocab_per_tp)
+        delta = torch.bmm(B, lora_a.unsqueeze(-1)).squeeze(-1)
+        # Zero out requests with no adapter; scale the rest.
+        scale = self._scalings[valid_slots] * valid.to(self._scalings.dtype)
+        return logits + delta * scale.unsqueeze(-1)
+
+    def apply_moe_gate_up_lora(
+        self,
+        layer_id: int,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        gate_up_output: torch.Tensor,
+        *,
+        sorted_token_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Compatibility wrapper; MoE-specific work lives in MoeLoraContext."""
+        if not self.enable_moe_lora:
+            return gate_up_output
+        return self.moe_lora_context.apply_gate_up_lora(
+            layer_id,
+            hidden_states,
+            topk_ids,
+            gate_up_output,
+            sorted_token_ids=sorted_token_ids,
+        )
+
+    def apply_moe_down_lora(
+        self,
+        layer_id: int,
+        intermediate: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        down_output: torch.Tensor,
+        *,
+        sorted_token_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Compatibility wrapper; MoE-specific work lives in MoeLoraContext."""
+        if not self.enable_moe_lora:
+            return down_output
+        return self.moe_lora_context.apply_down_lora(
+            layer_id,
+            intermediate,
+            topk_ids,
+            topk_weights,
+            down_output,
+            sorted_token_ids=sorted_token_ids,
+        )
+
+    def set_adapter_scaling(self, name: str, scaling: float) -> None:
+        slot = self._name_to_slot.get(name)
+        if slot is not None:
+            self._slot_scalings[slot] = scaling
+            self._scalings[slot] = scaling
+
+    # ── Slot allocation ─────────────────────────────────────────────────────
+
+    def _ensure_in_gpu(self, name: str) -> int:
+        if name in self._name_to_slot:
+            return self._name_to_slot[name]
+        # Tier-2 → Tier-1 promotion; may need to read from disk if the
+        # CPU pool has evicted this adapter since registration.
+        self._cpu_store.ensure(name)
+        slot = self._find_free_slot()
+        self._load_to_slot(name, slot)
+        self._name_to_slot[name] = slot
+        self._slot_to_name[slot] = name
+        self._gpu_lru[name] = None
+        return slot
+
+    def prefetch(self, name: str) -> None:
+        """Best-effort async warm of the CPU pool for *name*.
+
+        Called from the request-admission path: when a request with a
+        non-zero ``lora_id`` arrives the manager kicks off a background
+        disk read so the safetensors I/O is overlapped with the previous
+        forward step rather than blocking ``prepare_loras`` of the step
+        that actually consumes the adapter.
+
+        No-op when the adapter is already CPU-resident or a load is
+        already in flight.  Silently ignores unknown adapters (the
+        request will fall back to base via NO_LORA_SLOT).
+        """
+        self._cpu_store.prefetch(name)
+
+    def _evict_from_cpu(self, name: str) -> None:
+        """Public helper, takes the lock.  Caller must ensure *name* is
+        not currently GPU-resident."""
+        self._cpu_store.evict(name)
+
+    def _find_free_slot(self) -> int:
+        for slot in range(self._n_slots):
+            if self._slot_to_name[slot] is None:
+                return slot
+        for candidate_name in list(self._gpu_lru.keys()):
+            slot = self._name_to_slot[candidate_name]
+            logger.debug("Evicting adapter '%s' from GPU slot %d", candidate_name, slot)
+            self._evict_by_name(candidate_name)
+            return slot
+        raise RuntimeError(
+            "LoRA GPU pool is full and no evictable adapter was found. "
+            f"Increase max_loras (current: {self.max_loras})."
+        )
+
+    def _load_to_slot(self, name: str, slot: int) -> None:
+        cpu_weights = self._cpu_cache[name]
+        rank = self._get_rank_for(name)
+        scaling = self._get_scaling_for(name, rank)
+        self._reset_slot(slot)
+        self._lora_ranks[slot] = rank
+        self._slot_ranks[slot] = rank
+        self._slot_scalings[slot] = scaling
+        self._scalings[slot] = scaling
+        self._weight_buffers.load_adapter_to_slot(cpu_weights, slot, rank)
+        self._moe_lora_buffers.load_adapter_to_slot(cpu_weights, slot, rank)
+
+        logger.debug("Loaded adapter '%s' into GPU slot %d (rank=%d)", name, slot, rank)
+
+    def _get_rank_for(self, name: str) -> int:
+        cpu_weights = self._cpu_cache.get(name, {})
+        if not cpu_weights:
+            return self.max_lora_rank
+        # Check layer 0 first (dense attn/MLP modules).
+        if 0 in cpu_weights:
+            for mod in PEFT_MODULES:
+                if mod in cpu_weights[0]:
+                    return cpu_weights[0][mod][0].shape[0]
+            for mod, tensors in cpu_weights[0].items():
+                if mod.startswith("experts."):
+                    lora_A = tensors[0]
+                    if lora_A.dim() == 3:
+                        return lora_A.shape[1]
+                    return lora_A.shape[0]
+        # Fall back to lm_head (head-only adapters).
+        if LORA_HEAD_LAYER_ID in cpu_weights:
+            head = cpu_weights[LORA_HEAD_LAYER_ID]
+            if PEFT_HEAD_MODULE in head:
+                return head[PEFT_HEAD_MODULE][0].shape[0]
+        return self.max_lora_rank
+
+    def _get_scaling_for(self, name: str, rank: int) -> float:
+        return read_adapter_scaling(self._adapter_paths.get(name), rank)
+
+    def _evict_by_name(self, name: str) -> None:
+        if name in self._name_to_slot:
+            slot = self._name_to_slot.pop(name)
+            self._slot_to_name[slot] = None
+            self._reset_slot(slot)
+        self._gpu_lru.pop(name, None)
+
+    def _reset_slot(self, slot: int) -> None:
+        self._weight_buffers.zero_slot(slot)
+        self._moe_lora_buffers.clear_slot(slot)
+        self._lora_ranks[slot] = 0
+        self._slot_ranks[slot] = 0
+        self._slot_scalings[slot] = 0.0
+        self._scalings[slot] = 0.0
diff --git a/python/tokenspeed/runtime/lora/lora_registry.py b/python/tokenspeed/runtime/lora/lora_registry.py
new file mode 100644
index 000000000..9ee651f1a
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/lora_registry.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""In-process registry that tracks loaded LoRA adapters and maps names to IDs."""
+
+from __future__ import annotations
+
+from typing import Iterator, Optional
+
+from tokenspeed.runtime.lora.lora_config import LoraConfig
+from tokenspeed.runtime.utils import get_colorful_logger
+
+logger = get_colorful_logger(__name__)
+
+# Sentinel value meaning "no adapter" — maps cleanly to int for scheduling.
+NO_LORA_ID: int = 0
+
+
+class LoraRegistry:
+    """Thread-unsafe registry; call from the scheduler/engine main thread only.
+
+    TODO: add locking when multi-threaded engine support is needed.
+    """
+
+    def __init__(self, max_loras: int) -> None:
+        self.max_loras = max_loras
+        self._configs: dict[str, LoraConfig] = {}  # name → config
+        self._name_to_id: dict[str, int] = {}  # name → integer ID
+        self._id_to_name: dict[int, str] = {}  # integer ID → name
+        self._next_id: int = 1  # 0 is reserved for "no lora"
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def register(self, config: LoraConfig) -> int:
+        """Register a new adapter and return its integer ID.
+
+        Raises ``ValueError`` if the adapter is already registered or the
+        registry is at capacity.
+        """
+        if config.name in self._name_to_id:
+            raise ValueError(f"LoRA adapter '{config.name}' is already registered.")
+        if len(self._configs) >= self.max_loras:
+            raise ValueError(
+                f"LoRA registry is full ({self.max_loras} adapters). "
+                "Unload an adapter before loading a new one."
+            )
+        lora_id = self._next_id
+        self._next_id += 1
+        self._configs[config.name] = config
+        self._name_to_id[config.name] = lora_id
+        self._id_to_name[lora_id] = config.name
+        logger.info("Registered LoRA adapter '%s' → id=%d", config.name, lora_id)
+        return lora_id
+
+    def unregister(self, name: str) -> None:
+        """Remove an adapter from the registry.
+
+        Raises ``KeyError`` if the name is not registered.
+        """
+        if name not in self._name_to_id:
+            raise KeyError(f"LoRA adapter '{name}' is not registered.")
+        lora_id = self._name_to_id.pop(name)
+        del self._id_to_name[lora_id]
+        del self._configs[name]
+        logger.info("Unregistered LoRA adapter '%s' (id=%d)", name, lora_id)
+
+    def get_id(self, name: str) -> Optional[int]:
+        """Return the integer ID for an adapter name, or None if not found."""
+        return self._name_to_id.get(name)
+
+    def get_config(self, name: str) -> Optional[LoraConfig]:
+        """Return the LoraConfig for a registered adapter name."""
+        return self._configs.get(name)
+
+    def get_config_by_id(self, lora_id: int) -> Optional[LoraConfig]:
+        name = self._id_to_name.get(lora_id)
+        return self._configs.get(name) if name else None
+
+    def __contains__(self, name: str) -> bool:
+        return name in self._name_to_id
+
+    def __len__(self) -> int:
+        return len(self._name_to_id)
+
+    def __iter__(self) -> Iterator[LoraConfig]:
+        return iter(self._configs.values())
diff --git a/python/tokenspeed/runtime/lora/moe_lora.py b/python/tokenspeed/runtime/lora/moe_lora.py
new file mode 100644
index 000000000..dff003779
--- /dev/null
+++ b/python/tokenspeed/runtime/lora/moe_lora.py
@@ -0,0 +1,1326 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Callable
+
+import torch
+
+from tokenspeed.runtime.lora.lora_batch import NO_LORA_SLOT, LoraBatchInfo
+
+try:
+    from tokenspeed_kernel.ops.moe_lora import (
+        fused_a_b_down_expand,
+        fused_shared_a_b_gate_up_expand,
+        gate_up_b_expand,
+        per_expert_a_shrink,
+        per_expert_b_down_expand,
+        per_expert_gate_up_b_expand,
+        shared_a_shrink,
+        shared_b_down_expand,
+        sorted_a_down_shrink,
+        sorted_gate_up_b_expand,
+    )
+
+    _FUSED_MOE_LORA_AVAILABLE = True
+except Exception:
+    _FUSED_MOE_LORA_AVAILABLE = False
+
+MoeLayerSlotWeights = dict[int, dict[str, torch.Tensor]]
+MoeWeightsByLayer = dict[int, MoeLayerSlotWeights]
+
+
+@dataclass(frozen=True)
+class MoeLoraContext:
+    """Narrow per-forward view of MoE LoRA state consumed by MoE backends."""
+
+    weights_by_layer: MoeWeightsByLayer
+    batch_info: LoraBatchInfo
+    scalings: torch.Tensor
+    has_active_lora: bool
+    # Per-layer buffer lists for CUDA-graph-compatible dynamic slot indexing.
+    # When set, _apply_*_slot uses GPU tensor indexing via batch_info.weight_indices
+    # instead of Python dict lookup, so the CUDA graph can replay with any adapter.
+    w13_A_buffers: list | None
+    w13_B_buffers: list | None
+    down_A_buffers: list | None
+    down_B_buffers: list | None
+    # Multi-stream prefetch: secondary stream + pre-allocated output buffers.
+    # Shrink ops run on _lora_stream concurrently with the base MoE GEMMs.
+    _lora_stream: object | None = None  # torch.cuda.Stream
+    _lora_a_m_buf: torch.Tensor | None = None  # (max_bs, 2*max_r)
+    _lora_a_flat_buf: torch.Tensor | None = None  # (max_bs*max_topk, max_r)
+    # Mutable flags (list elements are mutable even in frozen dataclass):
+    #   _prefetch_flags[0] = gate_up shrink pending; [1] = down shrink pending.
+    _prefetch_flags: list | None = None
+
+    def apply_gate_up_lora(
+        self,
+        layer_id: int,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        gate_up_output: torch.Tensor,
+        *,
+        sorted_token_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Apply expert-scoped LoRA to routed MoE gate/up output."""
+        if hidden_states.shape[0] == 0 or topk_ids.numel() == 0:
+            return gate_up_output
+        slots, single_slot = self._token_slots(hidden_states.shape[0])
+        if single_slot == NO_LORA_SLOT and slots is None:
+            return gate_up_output
+        if single_slot != NO_LORA_SLOT:
+            self._apply_gate_up_slot(
+                layer_id,
+                single_slot,
+                hidden_states,
+                topk_ids,
+                gate_up_output,
+                sorted_token_ids=sorted_token_ids,
+            )
+            return gate_up_output
+        assert slots is not None
+        for slot_t in torch.unique(slots):
+            slot = int(slot_t.item())
+            if slot == NO_LORA_SLOT:
+                continue
+            self._apply_gate_up_slot(
+                layer_id,
+                slot,
+                hidden_states,
+                topk_ids,
+                gate_up_output,
+                token_mask=slots == slot,
+                sorted_token_ids=sorted_token_ids,
+            )
+        return gate_up_output
+
+    def apply_down_lora(
+        self,
+        layer_id: int,
+        intermediate: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        down_output: torch.Tensor,
+        *,
+        sorted_token_ids: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Apply expert-scoped LoRA to routed MoE down output."""
+        if intermediate.shape[0] == 0 or topk_ids.numel() == 0:
+            return down_output
+        num_tokens = topk_ids.shape[0]
+        slots, single_slot = self._token_slots(num_tokens)
+        if single_slot == NO_LORA_SLOT and slots is None:
+            return down_output
+        # Sorted-space fast path: work directly on sorted intermediate, skipping
+        # _route_rows_from_cache. Only applies when sorted dispatch is active (TMA
+        # config), since the fused shrink kernel has poor utilization for small
+        # flat-pair batches.
+        if (
+            _FUSED_MOE_LORA_AVAILABLE
+            and sorted_token_ids is not None
+            and single_slot != NO_LORA_SLOT
+            and self.down_A_buffers is not None
+            and self.batch_info.single_lora_slot != -1
+        ):
+            if self._apply_down_sorted(
+                layer_id,
+                single_slot,
+                intermediate,
+                topk_ids,
+                topk_weights,
+                down_output,
+                sorted_token_ids,
+            ):
+                return down_output
+        route_input = self._route_rows_from_cache(
+            intermediate,
+            topk_ids.numel(),
+            sorted_token_ids=sorted_token_ids,
+        ).view(topk_ids.shape[0], topk_ids.shape[1], -1)
+        if single_slot != NO_LORA_SLOT:
+            self._apply_down_slot(
+                layer_id,
+                single_slot,
+                route_input,
+                topk_ids,
+                topk_weights,
+                down_output,
+            )
+            return down_output
+        assert slots is not None
+        for slot_t in torch.unique(slots):
+            slot = int(slot_t.item())
+            if slot == NO_LORA_SLOT:
+                continue
+            self._apply_down_slot(
+                layer_id,
+                slot,
+                route_input,
+                topk_ids,
+                topk_weights,
+                down_output,
+                token_mask=slots == slot,
+            )
+        return down_output
+
+    def _token_slots(self, num_tokens: int) -> tuple[torch.Tensor | None, int]:
+        bi = self.batch_info
+        if bi.bs == 0 or not self.has_active_lora:
+            return None, NO_LORA_SLOT
+        if bi.single_lora_slot != NO_LORA_SLOT:
+            return None, bi.single_lora_slot
+        slots = torch.repeat_interleave(
+            bi.weight_indices[: bi.bs], bi.seg_lens[: bi.bs]
+        )
+        if slots.numel() != num_tokens:
+            # Token ownership changed under TP/EP communication. Mixed LoRA
+            # cannot be applied safely without transforming the slot map too.
+            return None, NO_LORA_SLOT
+        return slots, NO_LORA_SLOT
+
+    # ── Multi-stream prefetch API ──────────────────────────────────────────────
+    # Called from triton_common.py BEFORE each base GEMM to overlap the LoRA
+    # shrink kernel with the base model's gate_up / down computation.
+
+    def launch_gate_up_shrink(
+        self,
+        layer_id: int,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+    ) -> None:
+        """Fork: launch gate_up LoRA A-shrink on secondary stream.
+
+        Must be called immediately BEFORE gate_up_gemm so that shared_a_shrink
+        (torch.mm) runs concurrently on _lora_stream while gate_up_gemm runs
+        on the main stream.  apply_gate_up_lora will join the stream and use
+        the pre-filled _lora_a_m_buf instead of recomputing.
+        """
+        if self._prefetch_flags is None:
+            return
+        self._prefetch_flags[0] = False  # default: no prefetch
+        bi = self.batch_info
+        if (
+            not self.has_active_lora
+            or bi.single_lora_slot == NO_LORA_SLOT
+            or self.w13_A_buffers is None
+            or self._lora_stream is None
+            or self._lora_a_m_buf is None
+        ):
+            return
+        m = hidden_states.shape[0]
+        w13_A_buf = self.w13_A_buffers[layer_id]
+        if w13_A_buf.shape[1] != 1:  # only sglang_shared format (shared A)
+            return
+        if m > self._lora_a_m_buf.shape[0]:
+            return  # prefill with too many tokens — skip prefetch to avoid OOB
+        # Fork to secondary stream: launch torch.mm concurrently with gate_up_gemm.
+        self._lora_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self._lora_stream):
+            torch.mm(hidden_states, w13_A_buf[0, 0].T, out=self._lora_a_m_buf[:m])
+        self._prefetch_flags[0] = True
+
+    def launch_down_shrink(
+        self,
+        layer_id: int,
+        intermediate: torch.Tensor,
+        topk_ids: torch.Tensor,
+        m_k: int,
+    ) -> None:
+        """Fork: launch down LoRA A-shrink on secondary stream.
+
+        Must be called immediately BEFORE down_gemm so that per_expert_a_shrink
+        runs concurrently on _lora_stream while down_gemm runs on main stream.
+        intermediate is intermediate_cache2 (silu output), shape (m*topk, INTER).
+        m_k is m_tokens * top_k (non-padded).
+        """
+        if self._prefetch_flags is None:
+            return
+        self._prefetch_flags[1] = False
+        bi = self.batch_info
+        down_A_buf = self.down_A_buffers[layer_id] if self.down_A_buffers else None
+        down_B_buf = self.down_B_buffers[layer_id] if self.down_B_buffers else None
+        if (
+            not self.has_active_lora
+            or bi.single_lora_slot == NO_LORA_SLOT
+            or down_A_buf is None
+            or down_B_buf is None
+            or self._lora_stream is None
+            or self._lora_a_flat_buf is None
+            or not _FUSED_MOE_LORA_AVAILABLE
+            or down_A_buf.shape[1] <= 1  # per-expert A only
+            or down_B_buf.shape[1] != 1  # shared B only
+            or not down_A_buf.is_contiguous()
+        ):
+            return
+        if m_k > self._lora_a_flat_buf.shape[0]:
+            return  # prefill with too many tokens — skip prefetch to avoid OOB
+        ri_flat = intermediate[:m_k].view(m_k, -1)
+        safe_ids = topk_ids.clamp(0, down_A_buf.shape[1] - 1).to(torch.long)
+        slot_idx = bi.weight_indices[:1].clamp(0)
+        self._lora_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self._lora_stream):
+            per_expert_a_shrink(
+                ri_flat,
+                down_A_buf,
+                slot_idx,
+                safe_ids,
+                out=self._lora_a_flat_buf[:m_k],
+            )
+        self._prefetch_flags[1] = True
+
+    def _apply_gate_up_slot(
+        self,
+        layer_id: int,
+        slot: int,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        gate_up_output: torch.Tensor,
+        *,
+        token_mask: torch.Tensor | None = None,
+        sorted_token_ids: torch.Tensor | None = None,
+    ) -> None:
+        # For the single-slot case (all tokens same adapter), use dynamic GPU tensor
+        # indexing so the CUDA graph can replay with any loaded adapter.
+        # For multi-slot batches, fall back to Python dict lookup (eager only).
+        bi = self.batch_info
+        # Determine if we're on the CUDA-graph buffer path (single slot, all tokens
+        # same adapter). In this path we keep slot_idx as a GPU tensor so the CUDA
+        # graph can replay with any loaded adapter without re-capture.
+        _use_buffer_path = self.w13_A_buffers is not None and bi.single_lora_slot != -1
+        slot_idx = None
+        if _use_buffer_path:
+            slot_idx = bi.weight_indices[:1].clamp(0)
+            w13_B_buf = self.w13_B_buffers[layer_id]  # (n_slots, E, I2, MAX_R)
+            w13_B = None
+            # For the sglang_shared fast path (shared A, per-expert B) with fused kernels
+            # available, skip the w13_A gather entirely — shared_a_shrink reads directly from
+            # the buffer.  For all other paths, gather as before.
+            _w13_A_buf = self.w13_A_buffers[layer_id]
+            _skip_a_gather = (
+                _FUSED_MOE_LORA_AVAILABLE
+                and _w13_A_buf.shape[1] == 1  # shared outer (sglang_shared)
+                and w13_B_buf.shape[1] > 1  # per-expert B
+            )
+            # Also skip the buffer copy for per_expert format (both A and B per-expert):
+            # per_expert_a_shrink + per_expert_gate_up_b_expand read the full buffer
+            # directly, making the 32MB w13_A buffer copy unnecessary.
+            _skip_a_gather_per_expert = (
+                _FUSED_MOE_LORA_AVAILABLE
+                and _w13_A_buf.shape[1] > 1  # per-expert A
+                and w13_B_buf.shape[1] > 1  # per-expert B
+                and token_mask is None
+                and _w13_A_buf.is_contiguous()
+                and w13_B_buf.is_contiguous()
+            )
+            if _skip_a_gather or _skip_a_gather_per_expert:
+                # Use slot-0 view (Python int index = no copy) — correct shape for checks.
+                # Actual compute reads from the full buffers directly.
+                w13_A = _w13_A_buf[0]  # view: (1_or_E, R, H) — no copy!
+            else:
+                w13_A = _w13_A_buf[slot_idx].squeeze(0)
+        else:
+            weights = self.weights_by_layer.get(layer_id, {}).get(slot)
+            if weights is None:
+                return
+            w13_A = weights["w13_A"]
+            w13_B = weights["w13_B"]
+            w13_B_buf = None
+
+        # Determine shapes without materialising w13_B when on the buffer path.
+        if _use_buffer_path:
+            w13_A_experts = w13_A.shape[0]
+            w13_B_experts = w13_B_buf.shape[1]  # E dimension of buffer
+        else:
+            w13_A_experts = w13_A.shape[0]
+            w13_B_experts = w13_B.shape[0]
+        num_experts = max(w13_A_experts, w13_B_experts)
+        safe_ids = topk_ids.clamp(0, num_experts - 1).to(torch.long)
+        m, k = safe_ids.shape
+        # Build the validity mask without torch.any() to avoid GPU→CPU synchronisation.
+        if token_mask is not None:
+            valid = (topk_ids >= 0) & (topk_ids < num_experts) & token_mask[:, None]
+        else:
+            valid = None
+
+        # Check if per_expert fast path is available (avoids the 32MB+16MB gather copies).
+        # Must be determined before the A-shrink so we can skip the expensive gather+einsum.
+        _use_flat_per_expert = (
+            w13_A.shape[0] > 1  # per-expert A
+            and w13_B_buf is not None
+            and w13_B_experts > 1  # per-expert B
+            and _FUSED_MOE_LORA_AVAILABLE
+            and _use_buffer_path
+            and token_mask is None
+            and self.w13_A_buffers[layer_id].is_contiguous()
+            and w13_B_buf.is_contiguous()
+        )
+
+        # Shared A (sglang_shared format): one matmul for all tokens.
+        # lora_a_m (m, r) is only computed here; lora_a (m, k, r) is deferred until
+        # actually needed (not needed for the all-experts or shared-B paths).
+        lora_a_m = None
+        if w13_A.shape[0] == 1:
+            # Skip cuBLAS GEMM when shared_a_shrink will compute it without the gather.
+            if _use_buffer_path and _skip_a_gather:
+                lora_a_m = None  # computed by shared_a_shrink in the fused branch below
+            else:
+                lora_a_m = hidden_states @ w13_A[0].T
+            lora_a = None  # computed lazily below only if per-expert B path is taken
+        elif _use_flat_per_expert:
+            lora_a = (
+                None  # computed inline by per_expert_a_shrink + per_expert_*_b_expand
+            )
+        else:
+            selected_A = self._select_expert_weights(w13_A, safe_ids)
+            lora_a = torch.einsum("mh,mkrh->mkr", hidden_states, selected_A)
+
+        # Compute lora_a only when needed (per-expert B path).
+        # For shared-A + all-experts or shared-A + shared-B, lora_a_m is used directly.
+        # Lazily materialise w13_B for non-fused fallback paths on the buffer path.
+        def _get_w13_B():
+            nonlocal w13_B
+            if w13_B is None:
+                w13_B = w13_B_buf[slot_idx].squeeze(0)
+            return w13_B
+
+        if w13_B_experts == 1:
+            # Shared B: expand lora_a_m to (m*k, r) via repeat_interleave (no contiguous copy).
+            w13_B_local = _get_w13_B()
+            r = lora_a_m.shape[-1] if lora_a_m is not None else lora_a.shape[-1]
+            la_flat = (
+                lora_a_m.repeat_interleave(k, dim=0)
+                if lora_a_m is not None
+                else lora_a.reshape(-1, r)
+            )
+            delta = la_flat @ w13_B_local[0].T  # (m*k, n)
+            delta = delta.view(m, k, -1)
+        elif w13_A.shape[0] == 1:
+            # Shared-A + per-expert B.
+            if _FUSED_MOE_LORA_AVAILABLE and token_mask is None:
+                if sorted_token_ids is not None:
+                    # TMA sorted path: write to sorted output positions (SCATTER=False).
+                    _scaling = (
+                        self.scalings[slot_idx]
+                        if _use_buffer_path
+                        else self.scalings[slot]
+                    )
+                    w13_B_local = _get_w13_B()
+                    assert w13_B_local.is_contiguous(), "w13_B must be contiguous"
+                    sorted_gate_up_b_expand(
+                        lora_a_m,
+                        w13_B_local,
+                        safe_ids,
+                        sorted_token_ids,
+                        gate_up_output,
+                        _scaling,
+                        m * k,
+                        k,
+                    )
+                elif _use_buffer_path:
+                    # Decode path (buffer path): use pre-fetched lora_a_m if available
+                    # (launched on secondary stream before gate_up_gemm), else compute inline.
+                    _gu_prefetched = (
+                        self._prefetch_flags is not None
+                        and self._prefetch_flags[0]
+                        and self._lora_a_m_buf is not None
+                        and self._lora_stream is not None
+                    )
+                    if _gu_prefetched:
+                        # Join secondary stream: wait for torch.mm to complete.
+                        torch.cuda.current_stream().wait_stream(self._lora_stream)
+                        lora_a_m = self._lora_a_m_buf[: hidden_states.shape[0]]
+                        self._prefetch_flags[0] = False
+                    else:
+                        lora_a_m = shared_a_shrink(
+                            hidden_states, self.w13_A_buffers[layer_id], slot_idx
+                        )
+                    gate_up_b_expand(
+                        lora_a_m,
+                        w13_B_buf,
+                        slot_idx,
+                        safe_ids,
+                        gate_up_output,
+                        self.scalings,  # full buffer; kernel loads scalings[slot]
+                    )
+                else:
+                    # Non-buffer decode path (multi-slot eager).
+                    w13_B_local = _get_w13_B()
+                    assert w13_B_local.is_contiguous(), "w13_B must be contiguous"
+                    gate_up_b_expand(
+                        lora_a_m,
+                        w13_B_local.unsqueeze(0),
+                        torch.zeros(1, dtype=torch.int32, device=w13_B_local.device),
+                        safe_ids,
+                        gate_up_output,
+                        self.scalings[slot].unsqueeze(0),  # (1,) for slot 0 of fake buf
+                    )
+                return
+            # Fallback: all-experts GEMM + gather (no expand+copy needed).
+            w13_B_local = _get_w13_B()
+            E_fb, n_out, r = w13_B_local.shape
+            candidates = (
+                lora_a_m @ w13_B_local.permute(2, 0, 1).reshape(r, E_fb * n_out)
+            ).view(m, E_fb, n_out)
+            delta = candidates.gather(1, safe_ids.unsqueeze(-1).expand(-1, -1, n_out))
+        else:
+            # Per-expert A + per-expert B.
+            if _use_flat_per_expert:
+                # Fast flat path: avoid two buffer gather copies (w13_A_buf[slot] = 32MB,
+                # w13_B_buf[slot] = 16MB) by reading directly from the full buffers.
+                # per_expert_a_shrink reused: treats w13_A (n_slots, E, 2r, H) as
+                # down_A (n_slots, E, MAX_R, INTER) with MAX_R=2r, INTER=H.
+                hidden_flat = hidden_states.repeat_interleave(k, dim=0)  # (m*k, H)
+                lora_a_flat = per_expert_a_shrink(
+                    hidden_flat,
+                    self.w13_A_buffers[layer_id],
+                    slot_idx,
+                    safe_ids,
+                )  # (m*k, 2r)
+                per_expert_gate_up_b_expand(
+                    lora_a_flat,
+                    w13_B_buf,
+                    slot_idx,
+                    safe_ids,
+                    gate_up_output,
+                    self.scalings,
+                )
+                return
+            # Fallback: gather + einsum for non-buffer or masked paths.
+            w13_B_local = _get_w13_B()
+            if lora_a is None:
+                lora_a = lora_a_m.unsqueeze(1).expand(-1, k, -1).contiguous()
+            selected_B = self._select_expert_weights(w13_B_local, safe_ids)
+            delta = torch.einsum("mkr,mknr->mkn", lora_a, selected_B)
+
+        # Reuse slot_idx already computed above (avoid extra clamp+gather for scalings).
+        scaling = self.scalings[slot_idx] if _use_buffer_path else self.scalings[slot]
+        delta = delta * scaling
+        if valid is not None:
+            delta = delta.masked_fill(~valid[:, :, None], 0.0)
+        self._add_route_delta(
+            gate_up_output,
+            delta.reshape(-1, delta.shape[-1]),
+            sorted_token_ids=sorted_token_ids,
+        )
+
+    def _apply_down_sorted(
+        self,
+        layer_id: int,
+        slot: int,
+        intermediate: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        down_output: torch.Tensor,
+        sorted_token_ids: torch.Tensor,
+    ) -> bool:
+        """Sorted-space down LoRA: skip route_from_cache, fuse per-expert shrink.
+
+        Returns True if the fast path was taken (per-expert A + shared B format),
+        False if the format requires the generic path.
+        """
+        bi = self.batch_info
+        slot_idx = bi.weight_indices[:1].clamp(0)
+        down_A = self.down_A_buffers[layer_id][slot_idx].squeeze(0)
+        down_B = self.down_B_buffers[layer_id][slot_idx].squeeze(0)
+        # Only handles per-expert A + shared B (sglang_shared format for down).
+        if down_A.shape[0] <= 1 or down_B.shape[0] != 1:
+            return False
+        if not down_A.is_contiguous():
+            return False
+
+        m, k = topk_ids.shape
+        num_experts = down_A.shape[0]
+        safe_ids = topk_ids.clamp(0, num_experts - 1).to(torch.long)
+        route_count = m * k
+        r = down_A.shape[1]
+
+        # moe_dispatch pre-allocates sorted_token_ids for all potential experts, which
+        # can exceed the intermediate cache size.  All valid entries (≥0) lie within
+        # the first intermediate.shape[0] rows (bound = m*k + max_active*(BM-1)).
+        inter_flat = intermediate.reshape(intermediate.shape[0], -1)
+        padded = inter_flat.shape[0]
+        sti = sorted_token_ids[:padded]  # truncate to intermediate size
+
+        # Fused per-expert shrink: lora_a[s] = intermediate[s] @ down_A[exp[s]].T
+        lora_a_sorted = sorted_a_down_shrink(
+            inter_flat,  # (padded, INTER)
+            down_A,  # (E, r, INTER)
+            safe_ids,
+            sti,
+            route_count=route_count,
+            K=k,
+        )
+
+        # Shared B GEMM: (padded, r) @ (r, h) → (padded, h)
+        delta = lora_a_sorted @ down_B[0].T
+
+        # Scale each sorted position by its topk_weight * adapter scaling.
+        valid = (sti >= 0) & (sti < route_count)
+        # Clamp to [0, route_count-1]: sorted_token_ids may contain route_count as
+        # a sentinel value, which would be OOB without the upper bound.
+        flat_j_safe = sti.clamp(0, route_count - 1)
+        weights_sorted = topk_weights.reshape(-1)[flat_j_safe].to(delta.dtype)
+        scaling_t = self.scalings[slot_idx].to(delta.dtype)
+        delta = delta * (weights_sorted * scaling_t * valid.to(delta.dtype)).unsqueeze(
+            -1
+        )
+
+        # Scatter-add to token-ordered down_output.
+        h = delta.shape[-1]
+        down_output.view(route_count, h).scatter_add_(
+            0, flat_j_safe.unsqueeze(-1).expand(-1, h), delta
+        )
+        return True
+
+    def _apply_down_slot(
+        self,
+        layer_id: int,
+        slot: int,
+        route_input: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        down_output: torch.Tensor,
+        *,
+        token_mask: torch.Tensor | None = None,
+    ) -> None:
+        bi = self.batch_info
+        # Determine if we're on the CUDA-graph buffer path (single slot, all tokens
+        # same adapter). In this path we keep slot_idx as a GPU tensor so the CUDA
+        # graph can replay with any loaded adapter without re-capture.
+        _use_buffer_path = self.down_A_buffers is not None and bi.single_lora_slot != -1
+        slot_idx = None
+        if _use_buffer_path:
+            # (1,) GPU tensor — changes at CUDA-graph replay without re-capture.
+            slot_idx = bi.weight_indices[:1].clamp(0)
+            # Keep references to the full buffers; slicing is done lazily or inside kernels.
+            down_A_buf = self.down_A_buffers[layer_id]  # (n_slots, E, MAX_R, INTER)
+            down_B_buf = self.down_B_buffers[layer_id]  # (n_slots, 1_or_E, H, MAX_R)
+            # Sliced views are populated lazily to avoid redundant gathers.
+            down_A = None
+            down_B = None
+        else:
+            weights = self.weights_by_layer.get(layer_id, {}).get(slot)
+            if weights is None:
+                return
+            down_A = weights["down_A"]
+            down_B = weights["down_B"]
+            down_A_buf = None
+            down_B_buf = None
+
+        # Determine shapes without materialising tensors when on the buffer path.
+        if _use_buffer_path:
+            down_A_experts = down_A_buf.shape[1]  # E dimension of buffer
+            down_B_experts = down_B_buf.shape[1]  # 1 for shared-B
+        else:
+            down_A_experts = down_A.shape[0]
+            down_B_experts = down_B.shape[0]
+        num_experts = max(down_A_experts, down_B_experts)
+        safe_ids = topk_ids.clamp(0, num_experts - 1).to(torch.long)
+        m, k = safe_ids.shape
+        if token_mask is not None:
+            valid = (topk_ids >= 0) & (topk_ids < num_experts) & token_mask[:, None]
+        else:
+            valid = None
+
+        # Helpers to lazily materialise sliced tensors for fallback paths.
+        def _get_down_A():
+            nonlocal down_A
+            if down_A is None:
+                down_A = down_A_buf[slot_idx].squeeze(0)
+            return down_A
+
+        def _get_down_B():
+            nonlocal down_B
+            if down_B is None:
+                down_B = down_B_buf[slot_idx].squeeze(0)
+            return down_B
+
+        # Fast fused path: per-expert A + shared B on the CUDA-graph buffer path.
+        # Eliminates both gather copies (down_A gather + down_B gather) and the
+        # separate GEMM + scale + add chain.
+        if (
+            _FUSED_MOE_LORA_AVAILABLE
+            and _use_buffer_path
+            and token_mask is None
+            and down_A_experts > 1
+            and down_B_experts == 1
+            and down_A_buf.is_contiguous()
+            and down_B_buf.is_contiguous()
+        ):
+            _down_prefetched = (
+                self._prefetch_flags is not None
+                and self._prefetch_flags[1]
+                and self._lora_a_flat_buf is not None
+                and self._lora_stream is not None
+            )
+            if _down_prefetched:
+                # Join secondary stream: wait for per_expert_a_shrink to complete.
+                torch.cuda.current_stream().wait_stream(self._lora_stream)
+                lora_a_flat = self._lora_a_flat_buf[: m * k]
+                self._prefetch_flags[1] = False
+            else:
+                ri_flat = route_input.reshape(m * k, -1)  # (m*k, INTER)
+                lora_a_flat = per_expert_a_shrink(
+                    ri_flat, down_A_buf, slot_idx, safe_ids
+                )
+            shared_b_down_expand(
+                lora_a_flat,
+                down_B_buf,
+                slot_idx,
+                down_output.view(m, k, -1),
+                topk_weights,
+                self.scalings,  # full buffer; kernel loads scalings[slot]
+                k,
+            )
+            return
+
+        # Shared A (sglang_shared down_proj): one matmul per token-topk group.
+        if down_A_experts == 1:
+            down_A_local = _get_down_A()
+            ri = route_input.reshape(m * k, -1)  # (m*k, i)
+            lora_a = (ri @ down_A_local[0].T).view(m, k, -1)  # (m, k, r)
+        elif _FUSED_MOE_LORA_AVAILABLE and token_mask is None:
+            # Flat per-expert shrink: avoids the (m*k, r, INTER) gather intermediate
+            # and replaces the batched einsum with a single fused Triton kernel.
+            if _use_buffer_path:
+                # Buffer path: pass full buffer + slot_idx to avoid gather.
+                lora_a = per_expert_a_shrink(
+                    route_input.reshape(m * k, -1), down_A_buf, slot_idx, safe_ids
+                ).view(m, k, -1)
+            else:
+                down_A_local = _get_down_A()
+                assert down_A_local.is_contiguous(), "down_A must be contiguous"
+                lora_a = per_expert_a_shrink(
+                    route_input.reshape(m * k, -1),
+                    down_A_local.unsqueeze(0),  # fake (1, E, MAX_R, INTER) buffer
+                    torch.zeros(1, dtype=torch.int32, device=down_A_local.device),
+                    safe_ids,
+                ).view(m, k, -1)
+        else:
+            down_A_local = _get_down_A()
+            selected_A = self._select_expert_weights(down_A_local, safe_ids)
+            lora_a = torch.einsum("mki,mkri->mkr", route_input, selected_A)
+
+        # Shared B (sglang_shared down_proj): one batched matmul.
+        if down_B_experts == 1:
+            down_B_local = _get_down_B()
+            r = lora_a.shape[-1]
+            delta = lora_a.reshape(-1, r) @ down_B_local[0].T  # (m*k, h)
+            delta = delta.view(m, k, -1)
+        elif (
+            _FUSED_MOE_LORA_AVAILABLE
+            and _use_buffer_path
+            and token_mask is None
+            and down_B_buf.is_contiguous()
+        ):
+            # Per-expert B fast path: avoid the 16MB buffer copy + gather.
+            # lora_a computed via per_expert_a_shrink is already (m*k, r); reshape to flat.
+            lora_a_flat = lora_a.reshape(m * k, -1)
+            per_expert_b_down_expand(
+                lora_a_flat,
+                down_B_buf,
+                slot_idx,
+                safe_ids,
+                down_output.view(m, k, -1),
+                topk_weights,
+                self.scalings,
+                k,
+            )
+            return  # accumulation already done inside the kernel
+        else:
+            down_B_local = _get_down_B()
+            selected_B = self._select_expert_weights(down_B_local, safe_ids)
+            delta = torch.einsum("mkr,mkhr->mkh", lora_a, selected_B)
+
+        delta = delta * topk_weights[:, :, None].to(delta.dtype)
+        # Reuse slot_idx computed above for scalings (avoid extra clamp+gather).
+        scaling = self.scalings[slot_idx] if _use_buffer_path else self.scalings[slot]
+        delta = delta * scaling
+        if valid is not None:
+            delta = delta.masked_fill(~valid[:, :, None], 0.0)
+        down_output.view(topk_ids.shape[0], topk_ids.shape[1], -1).add_(delta)
+
+    @staticmethod
+    def _select_expert_weights(
+        weights: torch.Tensor,
+        safe_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        if weights.shape[0] == 1:
+            return weights[0].expand(*safe_ids.shape, *weights.shape[1:])
+        return weights[safe_ids]
+
+    @staticmethod
+    def _add_route_delta(
+        output: torch.Tensor,
+        route_delta: torch.Tensor,
+        *,
+        sorted_token_ids: torch.Tensor | None,
+    ) -> None:
+        if sorted_token_ids is None:
+            output.view(route_delta.shape[0], -1).add_(route_delta)
+            return
+        # moe_dispatch may pre-allocate sorted_token_ids larger than output.
+        # Truncate: all valid entries lie within the first output.shape[0] rows.
+        padded = output.shape[0]
+        sti = sorted_token_ids[:padded]
+        # Gather route_delta into output-layout, zero invalid (padding) entries,
+        # then add in one vectorised kernel — avoids boolean-index tensor creation.
+        route_count = route_delta.shape[0]
+        clipped = sti.clamp(0, route_count - 1).to(torch.long)
+        reordered = route_delta[clipped]  # (padded, n)
+        invalid = (sti < 0) | (sti >= route_count)
+        reordered.masked_fill_(invalid.unsqueeze(-1), 0)
+        output.add_(reordered)
+
+    @staticmethod
+    def _route_rows_from_cache(
+        cache: torch.Tensor,
+        route_count: int,
+        *,
+        sorted_token_ids: torch.Tensor | None,
+    ) -> torch.Tensor:
+        if sorted_token_ids is None:
+            return cache.view(route_count, -1)
+        # moe_dispatch may pre-allocate sorted_token_ids larger than cache.
+        # Truncate: all valid entries lie within the first cache.shape[0] rows.
+        sti = sorted_token_ids[: cache.shape[0]]
+        # Use scatter_ with an extra dummy row (index 0) for padding positions.
+        # Avoids boolean-index tensor creation; one scatter_ + one slice.
+        n = cache.shape[-1]
+        rows = torch.zeros((route_count + 1, n), dtype=cache.dtype, device=cache.device)
+        # Shift: -1 (padding) → 0 (dummy), valid 0..route_count-1 → 1..route_count.
+        clipped = (sti.clamp(-1, route_count - 1) + 1).to(torch.long)
+        rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), cache)
+        return rows[1:]  # drop dummy row → (route_count, n)
+
+
+class MoeLoraBuffers:
+    """Own expert-scoped MoE LoRA weights independently from dense buffers."""
+
+    def __init__(
+        self,
+        *,
+        n_layers: int,
+        n_slots: int,
+        max_lora_rank: int,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_per_tp: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        shard_weights: Callable[
+            [str, torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]
+        ],
+        enabled: bool = True,
+        compressed_shared_outer: bool = False,
+    ) -> None:
+        self.n_layers = n_layers
+        self.n_slots = n_slots
+        self.max_lora_rank = max_lora_rank
+        self.num_experts = num_experts
+        self.hidden_size = hidden_size
+        self.intermediate_per_tp = intermediate_per_tp
+        self.dtype = dtype
+        self.device = device
+        self._shard_weights = shard_weights
+        self.enabled = enabled
+        self.compressed_shared_outer = compressed_shared_outer
+        self.weights_by_layer: MoeWeightsByLayer = {}
+        self.w13_A_buffers: list[torch.Tensor] = []
+        self.w13_B_buffers: list[torch.Tensor] = []
+        self.down_A_buffers: list[torch.Tensor] = []
+        self.down_B_buffers: list[torch.Tensor] = []
+        self._alloc()
+        # Multi-stream prefetch: overlap LoRA shrink ops with base MoE GEMMs.
+        # Shrink kernels run on a secondary stream in parallel with gate_up/down GEMMs.
+        # Pre-allocated output buffers avoid torch.empty inside CUDA graphs.
+        _max_bs = 128
+        _max_topk = 8
+        self._lora_stream: torch.cuda.Stream | None = (
+            torch.cuda.Stream() if torch.cuda.is_available() else None
+        )
+        self._lora_a_m_buf: torch.Tensor | None = None
+        self._lora_a_flat_buf: torch.Tensor | None = None
+        if self.enabled and torch.cuda.is_available():
+            # gate/up shrink: (m, 2*r); down shrink: (m*topk, r)
+            self._lora_a_m_buf = torch.zeros(
+                _max_bs, 2 * max_lora_rank, dtype=dtype, device=device
+            )
+            self._lora_a_flat_buf = torch.zeros(
+                _max_bs * _max_topk, max_lora_rank, dtype=dtype, device=device
+            )
+            # Pre-warm cuBLAS and Triton kernels on _lora_stream before any CUDA graph
+            # capture. torch.mm (cuBLAS) requires its handle to be initialized on each
+            # stream; failing to do so causes CUBLAS_STATUS_NOT_INITIALIZED during capture.
+            if self._lora_stream is not None:
+                _d = torch.zeros(1, dtype=dtype, device=device)
+                with torch.cuda.stream(self._lora_stream):
+                    torch.mm(_d.unsqueeze(0), _d.unsqueeze(1))
+                del _d
+                torch.cuda.synchronize()
+        # Mutable flags shared between MoeLoraBuffers and MoeLoraContext instances:
+        # [0] = gate_up shrink launched; [1] = down shrink launched.
+        self._prefetch_flags: list[bool] = [False, False]
+
+    def _alloc(self) -> None:
+        if not self.enabled:
+            return
+        n = self.n_slots
+        e = max(self.num_experts, 0)
+        r = self.max_lora_rank
+        h = self.hidden_size
+        i = self.intermediate_per_tp
+        w13_a_experts = 1 if self.compressed_shared_outer else e
+        w13_b_experts = e
+        down_a_experts = e
+        down_b_experts = 1 if self.compressed_shared_outer else e
+        for _ in range(self.n_layers):
+            self.w13_A_buffers.append(
+                torch.zeros(
+                    (n, w13_a_experts, 2 * r, h),
+                    dtype=self.dtype,
+                    device=self.device,
+                )
+            )
+            self.w13_B_buffers.append(
+                torch.zeros(
+                    (n, w13_b_experts, 2 * i, 2 * r),
+                    dtype=self.dtype,
+                    device=self.device,
+                )
+            )
+            self.down_A_buffers.append(
+                torch.zeros(
+                    (n, down_a_experts, r, i), dtype=self.dtype, device=self.device
+                )
+            )
+            self.down_B_buffers.append(
+                torch.zeros(
+                    (n, down_b_experts, h, r), dtype=self.dtype, device=self.device
+                )
+            )
+
+    def load_adapter_to_slot(self, cpu_weights, slot: int, rank: int) -> None:
+        has_moe = any(
+            mod.startswith("experts.")
+            for modules in cpu_weights.values()
+            for mod in modules
+        )
+        if has_moe and not self.enabled:
+            raise ValueError(
+                "Adapter contains MoE LoRA weights, but LoRA buffer group 'moe' "
+                "is disabled."
+            )
+        if self.num_experts <= 0:
+            if has_moe:
+                raise ValueError(
+                    "MoE LoRA adapter requires model_config.num_experts or "
+                    "model_config.num_local_experts."
+                )
+            return
+        rank = min(rank, self.max_lora_rank)
+        for layer_id, modules in cpu_weights.items():
+            if not any(mod.startswith("experts.") for mod in modules):
+                continue
+            self._clear_layer_slot(layer_id, slot)
+            if any(
+                mod in modules for mod in ("experts.w1", "experts.w2", "experts.w3")
+            ):
+                self._load_3d_adapter_layer(layer_id, modules, slot, rank)
+            else:
+                self._load_2d_adapter_layer(layer_id, modules, slot, rank)
+
+    def _load_2d_adapter_layer(self, layer_id: int, modules, slot: int, rank: int):
+        expert_ids = [
+            int(mod.split(".")[1]) for mod in modules if mod.startswith("experts.")
+        ]
+        if not expert_ids:
+            return
+        if self.compressed_shared_outer:
+            raise ValueError(
+                "Compressed MoE shared-outer storage only supports 3D "
+                "experts.w1/w2/w3 adapters."
+            )
+        num_experts = max(expert_ids) + 1
+        self._check_num_experts(layer_id, num_experts)
+        w13_A, w13_B, down_A, down_B = self._slot_layer_tensors(layer_id, slot)
+        r = rank
+        for mod, (lora_A_full, lora_B_full) in modules.items():
+            if not mod.startswith("experts."):
+                continue
+            _, expert_id_s, module = mod.split(".", 2)
+            expert_id = int(expert_id_s)
+            # Normalize A/B convention: standard PEFT stores A as (rank, in_features)
+            # and B as (out_features, rank).  Some adapters use the transposed layout
+            # (in_features, rank) and (rank, out_features).  Detect by comparing dims:
+            # if the first dim is larger than the second, A is in (in, rank) format.
+            if lora_A_full.dim() == 2 and lora_A_full.shape[0] > lora_A_full.shape[1]:
+                lora_A_full = lora_A_full.T  # (in, rank) → (rank, in)
+            if lora_B_full.dim() == 2 and lora_B_full.shape[0] < lora_B_full.shape[1]:
+                lora_B_full = lora_B_full.T  # (rank, out) → (out, rank)
+            lora_A_shard_cpu, lora_B_shard_cpu = self._shard_weights(
+                module, lora_A_full, lora_B_full
+            )
+            actual_rank = min(lora_A_shard_cpu.shape[0], r)
+            lora_A_shard = lora_A_shard_cpu[:actual_rank].to(
+                device=self.device,
+                dtype=self.dtype,
+                non_blocking=True,
+            )
+            lora_B_shard = lora_B_shard_cpu[:, :actual_rank].to(
+                device=self.device,
+                dtype=self.dtype,
+                non_blocking=True,
+            )
+            self._copy_projection(
+                module,
+                expert_id,
+                actual_rank,
+                lora_A_shard,
+                lora_B_shard,
+                w13_A,
+                w13_B,
+                down_A,
+                down_B,
+                rank=r,
+            )
+        self.weights_by_layer.setdefault(layer_id, {})[slot] = {
+            "w13_A": w13_A,
+            "w13_B": w13_B,
+            "down_A": down_A,
+            "down_B": down_B,
+        }
+
+    def _load_3d_adapter_layer(self, layer_id: int, modules, slot: int, rank: int):
+        required = ("experts.w1", "experts.w2", "experts.w3")
+        missing = [name for name in required if name not in modules]
+        if missing:
+            raise ValueError(
+                f"3D MoE LoRA layer {layer_id} is missing modules: {missing}"
+            )
+        w1_A, w1_B = modules["experts.w1"]
+        w2_A, w2_B = modules["experts.w2"]
+        w3_A, w3_B = modules["experts.w3"]
+        num_experts = self._infer_3d_num_experts((w1_A, w1_B, w2_A, w2_B, w3_A, w3_B))
+        self._check_num_experts(layer_id, num_experts)
+        if self.compressed_shared_outer:
+            self._check_shared_outer_layer(layer_id, modules, num_experts)
+        w13_A, w13_B, down_A, down_B = self._slot_layer_tensors(layer_id, slot)
+        self._copy_3d_projection(
+            "gate_proj", w1_A, w1_B, w13_A, w13_B, down_A, down_B, rank
+        )
+        self._copy_3d_projection(
+            "up_proj", w3_A, w3_B, w13_A, w13_B, down_A, down_B, rank
+        )
+        self._copy_3d_projection(
+            "down_proj", w2_A, w2_B, w13_A, w13_B, down_A, down_B, rank
+        )
+        E_b, I2, R = w13_B.shape
+        w13_B_T = w13_B.permute(2, 0, 1).reshape(R, E_b * I2).contiguous()
+        self.weights_by_layer.setdefault(layer_id, {})[slot] = {
+            "w13_A": w13_A,
+            "w13_B": w13_B,
+            "w13_B_T": w13_B_T,
+            "down_A": down_A,
+            "down_B": down_B,
+        }
+
+    def _check_num_experts(self, layer_id: int, adapter_num_experts: int) -> None:
+        if adapter_num_experts > self.num_experts:
+            raise ValueError(
+                f"MoE LoRA layer {layer_id} has {adapter_num_experts} experts, "
+                f"but the model has {self.num_experts}."
+            )
+
+    def _slot_layer_tensors(self, layer_id: int, slot: int):
+        return (
+            self.w13_A_buffers[layer_id][slot],
+            self.w13_B_buffers[layer_id][slot],
+            self.down_A_buffers[layer_id][slot],
+            self.down_B_buffers[layer_id][slot],
+        )
+
+    def _clear_layer_slot(self, layer_id: int, slot: int) -> None:
+        self.w13_A_buffers[layer_id][slot].zero_()
+        self.w13_B_buffers[layer_id][slot].zero_()
+        self.down_A_buffers[layer_id][slot].zero_()
+        self.down_B_buffers[layer_id][slot].zero_()
+
+    @staticmethod
+    def _check_shared_outer_layer(
+        layer_id: int,
+        modules,
+        num_experts: int,
+    ) -> None:
+        expected = {
+            "experts.w1": (1, num_experts),
+            "experts.w2": (num_experts, 1),
+            "experts.w3": (1, num_experts),
+        }
+        for module, (expected_a, expected_b) in expected.items():
+            lora_A, lora_B = modules[module]
+            if lora_A.shape[0] != expected_a or lora_B.shape[0] != expected_b:
+                raise ValueError(
+                    "Compressed MoE shared-outer storage expects "
+                    f"{module} A/B dim0=({expected_a}, {expected_b}) for "
+                    f"layer {layer_id}; got {tuple(lora_A.shape)}, "
+                    f"{tuple(lora_B.shape)}."
+                )
+
+    @staticmethod
+    def _infer_3d_num_experts(tensors: tuple[torch.Tensor, ...]) -> int:
+        num_experts = 0
+        for tensor in tensors:
+            if tensor.dim() != 3:
+                raise ValueError(
+                    f"3D MoE LoRA tensors must be rank-3, got shape {tuple(tensor.shape)}"
+                )
+            if tensor.shape[0] != 1:
+                num_experts = max(num_experts, int(tensor.shape[0]))
+        if num_experts <= 0:
+            raise ValueError("3D MoE LoRA layer has no per-expert tensor dimension")
+        for tensor in tensors:
+            if tensor.shape[0] not in (1, num_experts):
+                raise ValueError(
+                    "3D MoE LoRA dim0 must be either 1 (shared) or num_experts "
+                    f"({num_experts}); got {tuple(tensor.shape)}"
+                )
+        return num_experts
+
+    def _copy_3d_projection(
+        self,
+        module: str,
+        lora_A_full: torch.Tensor,
+        lora_B_full: torch.Tensor,
+        w13_A: torch.Tensor,
+        w13_B: torch.Tensor,
+        down_A: torch.Tensor,
+        down_B: torch.Tensor,
+        rank: int,
+    ) -> None:
+        num_experts = max(
+            w13_A.shape[0], w13_B.shape[0], down_A.shape[0], down_B.shape[0]
+        )
+        if self.compressed_shared_outer:
+            self._copy_3d_projection_compressed(
+                module,
+                lora_A_full,
+                lora_B_full,
+                w13_A,
+                w13_B,
+                down_A,
+                down_B,
+                rank,
+                num_experts,
+            )
+            return
+        for expert_id in range(num_experts):
+            lora_A = self._select_3d_expert_tensor(lora_A_full, expert_id)
+            lora_B = self._select_3d_expert_tensor(lora_B_full, expert_id)
+            lora_A_shard_cpu, lora_B_shard_cpu = self._shard_weights(
+                module, lora_A, lora_B
+            )
+            actual_rank = min(lora_A_shard_cpu.shape[0], rank)
+            lora_A_shard = lora_A_shard_cpu[:actual_rank].to(
+                device=self.device,
+                dtype=self.dtype,
+                non_blocking=True,
+            )
+            lora_B_shard = lora_B_shard_cpu[:, :actual_rank].to(
+                device=self.device,
+                dtype=self.dtype,
+                non_blocking=True,
+            )
+            self._copy_projection(
+                module,
+                expert_id,
+                actual_rank,
+                lora_A_shard,
+                lora_B_shard,
+                w13_A,
+                w13_B,
+                down_A,
+                down_B,
+                rank=rank,
+                a_expert_id=self._dst_expert_id(module, "A", expert_id),
+                b_expert_id=self._dst_expert_id(module, "B", expert_id),
+            )
+
+    def _copy_3d_projection_compressed(
+        self,
+        module: str,
+        lora_A_full: torch.Tensor,
+        lora_B_full: torch.Tensor,
+        w13_A: torch.Tensor,
+        w13_B: torch.Tensor,
+        down_A: torch.Tensor,
+        down_B: torch.Tensor,
+        rank: int,
+        num_experts: int,
+    ) -> None:
+        if module in ("gate_proj", "up_proj"):
+            shared_A = self._select_3d_expert_tensor(lora_A_full, 0)
+            first_B = self._select_3d_expert_tensor(lora_B_full, 0)
+            lora_A_shard_cpu, _ = self._shard_weights(module, shared_A, first_B)
+            actual_rank = min(lora_A_shard_cpu.shape[0], rank)
+            lora_A_shard = lora_A_shard_cpu[:actual_rank].to(
+                device=self.device,
+                dtype=self.dtype,
+                non_blocking=True,
+            )
+            if module == "gate_proj":
+                w13_A[0, :actual_rank, :].copy_(lora_A_shard, non_blocking=True)
+            else:
+                w13_A[0, rank : rank + actual_rank, :].copy_(
+                    lora_A_shard, non_blocking=True
+                )
+            for expert_id in range(num_experts):
+                expert_B = self._select_3d_expert_tensor(lora_B_full, expert_id)
+                _, lora_B_shard_cpu = self._shard_weights(module, shared_A, expert_B)
+                b_rank = min(lora_B_shard_cpu.shape[1], rank)
+                lora_B_shard = lora_B_shard_cpu[:, :b_rank].to(
+                    device=self.device,
+                    dtype=self.dtype,
+                    non_blocking=True,
+                )
+                if module == "gate_proj":
+                    w13_B[expert_id, : self.intermediate_per_tp, :b_rank].copy_(
+                        lora_B_shard, non_blocking=True
+                    )
+                else:
+                    w13_B[
+                        expert_id,
+                        self.intermediate_per_tp : 2 * self.intermediate_per_tp,
+                        rank : rank + b_rank,
+                    ].copy_(lora_B_shard, non_blocking=True)
+            return
+
+        if module == "down_proj":
+            first_A = self._select_3d_expert_tensor(lora_A_full, 0)
+            shared_B = self._select_3d_expert_tensor(lora_B_full, 0)
+            _, lora_B_shard_cpu = self._shard_weights(module, first_A, shared_B)
+            b_rank = min(lora_B_shard_cpu.shape[1], rank)
+            lora_B_shard = lora_B_shard_cpu[:, :b_rank].to(
+                device=self.device,
+                dtype=self.dtype,
+                non_blocking=True,
+            )
+            down_B[0, :, :b_rank].copy_(lora_B_shard, non_blocking=True)
+            for expert_id in range(num_experts):
+                expert_A = self._select_3d_expert_tensor(lora_A_full, expert_id)
+                lora_A_shard_cpu, _ = self._shard_weights(module, expert_A, shared_B)
+                actual_rank = min(lora_A_shard_cpu.shape[0], rank)
+                lora_A_shard = lora_A_shard_cpu[:actual_rank].to(
+                    device=self.device,
+                    dtype=self.dtype,
+                    non_blocking=True,
+                )
+                down_A[expert_id, :actual_rank, :].copy_(
+                    lora_A_shard, non_blocking=True
+                )
+            return
+
+        raise ValueError(f"Unsupported MoE LoRA projection: {module}")
+
+    @staticmethod
+    def _select_3d_expert_tensor(tensor: torch.Tensor, expert_id: int) -> torch.Tensor:
+        return tensor[0 if tensor.shape[0] == 1 else expert_id]
+
+    def _copy_projection(
+        self,
+        module: str,
+        expert_id: int,
+        actual_rank: int,
+        lora_A_shard: torch.Tensor,
+        lora_B_shard: torch.Tensor,
+        w13_A: torch.Tensor,
+        w13_B: torch.Tensor,
+        down_A: torch.Tensor,
+        down_B: torch.Tensor,
+        *,
+        rank: int,
+        a_expert_id: int | None = None,
+        b_expert_id: int | None = None,
+    ) -> None:
+        a_expert_id = expert_id if a_expert_id is None else a_expert_id
+        b_expert_id = expert_id if b_expert_id is None else b_expert_id
+        if module == "gate_proj":
+            w13_A[a_expert_id, :actual_rank, :].copy_(lora_A_shard, non_blocking=True)
+            w13_B[
+                b_expert_id,
+                : self.intermediate_per_tp,
+                :actual_rank,
+            ].copy_(lora_B_shard, non_blocking=True)
+        elif module == "up_proj":
+            w13_A[a_expert_id, rank : rank + actual_rank, :].copy_(
+                lora_A_shard, non_blocking=True
+            )
+            w13_B[
+                b_expert_id,
+                self.intermediate_per_tp : 2 * self.intermediate_per_tp,
+                rank : rank + actual_rank,
+            ].copy_(lora_B_shard, non_blocking=True)
+        elif module == "down_proj":
+            down_A[a_expert_id, :actual_rank, :].copy_(lora_A_shard, non_blocking=True)
+            down_B[b_expert_id, :, :actual_rank].copy_(lora_B_shard, non_blocking=True)
+        else:
+            raise ValueError(f"Unsupported MoE LoRA projection: {module}")
+
+    def _dst_expert_id(self, module: str, side: str, expert_id: int) -> int:
+        if not self.compressed_shared_outer:
+            return expert_id
+        if module in ("gate_proj", "up_proj") and side == "A":
+            return 0
+        if module == "down_proj" and side == "B":
+            return 0
+        return expert_id
+
+    def clear_slot(self, slot: int) -> None:
+        if not self.enabled:
+            return
+        for layer_id in range(self.n_layers):
+            self._clear_layer_slot(layer_id, slot)
+        for layer_slots in self.weights_by_layer.values():
+            layer_slots.pop(slot, None)
+
+    def build_context(
+        self,
+        *,
+        batch_info: LoraBatchInfo,
+        scalings: torch.Tensor,
+        has_active_lora: bool,
+    ) -> "MoeLoraContext":
+        return MoeLoraContext(
+            weights_by_layer=self.weights_by_layer,
+            batch_info=batch_info,
+            scalings=scalings,
+            has_active_lora=has_active_lora,
+            w13_A_buffers=self.w13_A_buffers if self.enabled else None,
+            w13_B_buffers=self.w13_B_buffers if self.enabled else None,
+            down_A_buffers=self.down_A_buffers if self.enabled else None,
+            down_B_buffers=self.down_B_buffers if self.enabled else None,
+            _lora_stream=self._lora_stream,
+            _lora_a_m_buf=self._lora_a_m_buf,
+            _lora_a_flat_buf=self._lora_a_flat_buf,
+            _prefetch_flags=self._prefetch_flags,
+        )
diff --git a/python/tokenspeed/runtime/models/qwen3.py b/python/tokenspeed/runtime/models/qwen3.py
index 43465b476..9d3fe0081 100755
--- a/python/tokenspeed/runtime/models/qwen3.py
+++ b/python/tokenspeed/runtime/models/qwen3.py
@@ -62,11 +62,13 @@ def __init__(
         intermediate_size: int,
         hidden_act: str,
         quant_config: QuantizationConfig | None = None,
+        layer_id: int = 0,
         tp_rank: int | None = None,
         tp_size: int | None = None,
         tp_group: tuple[int, ...] | None = None,
     ) -> None:
         super().__init__()
+        self.layer_id = layer_id
         self.gate_up_proj = MergedColumnParallelLinear(
             hidden_size,
             [intermediate_size] * 2,
@@ -93,11 +95,17 @@ def __init__(
             )
         self.act_fn = SiluAndMul()
 
-    def forward(self, x):
+    def forward(self, x, ctx: ForwardContext | None = None):
         gate_up, _ = self.gate_up_proj(x)
-        x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
-        return x
+        # LoRA delta on the fused gate/up output (added before SiluAndMul,
+        # matching PEFT semantics).
+        if ctx is not None and ctx.lora_manager is not None:
+            gate_up = ctx.lora_manager.apply_gate_up_lora(x, gate_up, self.layer_id)
+        intermediate = self.act_fn(gate_up)
+        out, _ = self.down_proj(intermediate)
+        if ctx is not None and ctx.lora_manager is not None:
+            out = ctx.lora_manager.apply_down_lora(intermediate, out, self.layer_id)
+        return out
 
 
 class Qwen3Attention(nn.Module):
@@ -119,6 +127,7 @@ def __init__(
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.layer_id = layer_id
         self.mapping = mapping
         self.hidden_size = hidden_size
         self.tp_rank = self.mapping.attn.tp_rank
@@ -213,6 +222,14 @@ def forward(
         cos_sin: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
+
+        # LoRA delta for Q/K/V projections (segment-grouped Triton path).
+        # The manager's batch_info holds persistent buffers, so this call
+        # is safe to record into a CUDA graph: replay updates batch_info
+        # in place before graph.replay().
+        if ctx.lora_manager is not None:
+            qkv = ctx.lora_manager.apply_qkv_lora(hidden_states, qkv, self.layer_id)
+
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self._apply_qk_norm(q, k)
         q, k = self.rotary_emb(positions, q, k)
@@ -220,6 +237,11 @@ def forward(
         if len(attn_output.size()) == 3:
             attn_output = attn_output.reshape(attn_output.shape[0], -1)
         output, _ = self.o_proj(attn_output)
+
+        # LoRA delta for O projection
+        if ctx.lora_manager is not None:
+            output = ctx.lora_manager.apply_o_lora(attn_output, output, self.layer_id)
+
         return output
 
 
@@ -263,6 +285,7 @@ def __init__(
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
+            layer_id=layer_id,
             tp_rank=self.mapping.dense.tp_rank,
             tp_size=self.mapping.dense.tp_size,
             tp_group=self.mapping.dense.tp_group,
@@ -327,7 +350,7 @@ def forward(
                     residual,
                 )
             )
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, ctx)
         return hidden_states, residual
 
 
diff --git a/python/tokenspeed/runtime/models/qwen3_5.py b/python/tokenspeed/runtime/models/qwen3_5.py
index a64ed69fd..1bb8266d3 100644
--- a/python/tokenspeed/runtime/models/qwen3_5.py
+++ b/python/tokenspeed/runtime/models/qwen3_5.py
@@ -713,6 +713,11 @@ def self_attention(
         """Full attention forward pass."""
         qkv, _ = self.qkv_proj(hidden_states)
 
+        # Apply QKV LoRA delta (same as qwen3.py; qkv layout matches the buffer
+        # offsets because q_size_per_tp already accounts for attn_output_gate).
+        if ctx.lora_manager is not None:
+            qkv = ctx.lora_manager.apply_qkv_lora(hidden_states, qkv, self.layer_id)
+
         if self.attn_output_gate:
             q_gate, k, v = qkv.split(
                 [self.q_size * 2, self.kv_size, self.kv_size], dim=-1
@@ -741,6 +746,11 @@ def self_attention(
             sigmoid_mul(attn_output, gate)
 
         output, _ = self.o_proj(attn_output)
+
+        # Apply O-projection LoRA delta.
+        if ctx.lora_manager is not None:
+            output = ctx.lora_manager.apply_o_lora(attn_output, output, self.layer_id)
+
         return output
 
     def forward(
diff --git a/python/tokenspeed/runtime/utils/server_args.py b/python/tokenspeed/runtime/utils/server_args.py
index f1da1111a..5d93fdb16 100755
--- a/python/tokenspeed/runtime/utils/server_args.py
+++ b/python/tokenspeed/runtime/utils/server_args.py
@@ -226,6 +226,30 @@ class ServerArgs:
     # server started without the matching flag will receive empty logprobs.
     enable_output_logprobs: bool = False
 
+    # LoRA adapter serving
+    enable_lora: bool = False
+    # Maximum number of LoRA adapters resident in GPU memory at once.
+    # Adapters beyond this cap are LRU-evicted to the CPU pool.
+    max_loras: int = 4
+    # Maximum LoRA rank supported (caps adapter loading; larger = more GPU memory).
+    max_lora_rank: int = 64
+    # Maximum number of LoRA adapters cached in CPU pinned memory.  When
+    # an adapter is evicted from this pool it falls back to its disk path
+    # (assumed durable) and is reloaded on next use.  ``None`` ⇒ default
+    # to ``4 * max_loras``.
+    max_loras_cpu: int | None = None
+    # Comma-separated coarse GPU buffer families to allocate for LoRA.
+    # Valid groups: attn, mlp, moe, lm_head.
+    lora_buffer_groups: str = "attn,mlp,moe"
+    # Store 3D MoE shared-outer adapters in compressed shared/per-expert
+    # buffers instead of fully expanding all sides to num_experts.
+    lora_moe_compressed_shared_outer: bool = False
+    # Scheduler-side LoRA scheduling policy.  ``"lru"`` (default) just
+    # relies on the manager's LRU; ``"admission"`` (future) gates batches
+    # that don't fit in GPU; ``"pack"`` (future) sorts the queue to reuse
+    # resident adapters.
+    lora_scheduling_policy: str = "lru"
+
     # Runtime options
     disable_pdl: bool = False
     enable_prefix_caching: bool = True
@@ -554,6 +578,43 @@ def resolve_communication(self):
             )
 
     def resolve_disaggregation(self):
+        if self.enable_lora:
+            # LoRA delta path is baked into the captured graph: the per-token
+            # slot index buffer (LoraManager.weight_indices_buf) is bound at
+            # capture and updated in place at replay. Base/no-LoRA requests
+            # use NO_LORA_SLOT in metadata and do not consume a GPU slot.
+            #
+            # Default the CPU pool to 4× the GPU pool so adapter swap-out
+            # to disk is rare in steady state.
+            if self.max_loras_cpu is None:
+                self.max_loras_cpu = 4 * self.max_loras
+            if self.max_loras_cpu < self.max_loras:
+                raise ValueError(
+                    f"max_loras_cpu ({self.max_loras_cpu}) must be ≥ "
+                    f"max_loras ({self.max_loras}) — every GPU-resident "
+                    "adapter must also fit in the CPU pool."
+                )
+            groups = {
+                group.strip()
+                for group in self.lora_buffer_groups.split(",")
+                if group.strip()
+            }
+            valid_groups = {"attn", "mlp", "moe", "lm_head"}
+            unknown_groups = groups - valid_groups
+            if not groups:
+                raise ValueError("lora_buffer_groups must include at least one group.")
+            if unknown_groups:
+                raise ValueError(
+                    "lora_buffer_groups contains unknown groups: "
+                    f"{sorted(unknown_groups)}. Valid groups: {sorted(valid_groups)}."
+                )
+            self.lora_buffer_groups = ",".join(sorted(groups))
+            if self.lora_moe_compressed_shared_outer and "moe" not in groups:
+                raise ValueError(
+                    "--lora-moe-compressed-shared-outer requires "
+                    "--lora-buffer-groups to include 'moe'."
+                )
+
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.enforce_eager = True
@@ -1465,6 +1526,70 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Disable PDL launch.",
         )
+        # LoRA adapter serving
+        parser.add_argument(
+            "--enable-lora",
+            action="store_true",
+            default=ServerArgs.enable_lora,
+            help="Enable LoRA adapter serving.",
+        )
+        parser.add_argument(
+            "--max-loras",
+            type=int,
+            default=ServerArgs.max_loras,
+            help="Maximum number of LoRA adapters in GPU memory at once.",
+        )
+        parser.add_argument(
+            "--max-lora-rank",
+            type=int,
+            default=ServerArgs.max_lora_rank,
+            help="Maximum LoRA rank supported across all loaded adapters.",
+        )
+        parser.add_argument(
+            "--max-loras-cpu",
+            type=int,
+            default=ServerArgs.max_loras_cpu,
+            help=(
+                "Maximum number of LoRA adapters cached in CPU pinned "
+                "memory.  Defaults to 4 × --max-loras.  Adapters evicted "
+                "from this pool are reloaded from disk on next use."
+            ),
+        )
+        parser.add_argument(
+            "--lora-buffer-groups",
+            type=str,
+            default=ServerArgs.lora_buffer_groups,
+            help=(
+                "Comma-separated LoRA GPU buffer groups to allocate. "
+                "Valid groups: attn, mlp, moe, lm_head. Loading an adapter that "
+                "targets a disabled group raises an error."
+            ),
+        )
+        parser.add_argument(
+            "--lora-moe-compressed-shared-outer",
+            action="store_true",
+            default=ServerArgs.lora_moe_compressed_shared_outer,
+            help=(
+                "Use compressed MoE storage for 3D shared-outer adapters "
+                "(w1/w3 A shared, w1/w3 B per-expert, w2 A per-expert, "
+                "w2 B shared)."
+            ),
+        )
+        parser.add_argument(
+            "--lora-scheduling-policy",
+            type=str,
+            default=ServerArgs.lora_scheduling_policy,
+            choices=["lru", "pack"],
+            help=(
+                "Scheduler-side LoRA scheduling policy. ``lru`` (default) "
+                "submits requests in arrival order and relies on the "
+                "manager's LRU pool. ``pack`` sorts the admission queue "
+                "by lora_id so adapter-shared requests cluster, reducing "
+                "eviction churn when working_set > max_loras_cpu and "
+                "traffic is bursty."
+            ),
+        )
+
         prefix_cache_group = parser.add_mutually_exclusive_group()
         prefix_cache_group.add_argument(
             "--enable-prefix-caching",
diff --git a/test/runners.py b/test/runners.py
index fc368aa9e..1838997ec 100644
--- a/test/runners.py
+++ b/test/runners.py
@@ -194,11 +194,11 @@ def start_model_process(
 
         # Run forward
         while True:
-            prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob = (
+            prompts, image_data, max_new_tokens, adapter_paths, token_ids_logprob = (
                 in_queue.get()
             )
-            if lora_paths is not None:
-                assert len(prompts) == len(lora_paths)
+            if adapter_paths is not None:
+                assert len(prompts) == len(adapter_paths)
 
             if prompts is not None:
                 if self.model_type == "generation":
@@ -208,7 +208,7 @@ def start_model_process(
                             prompts=prompts,
                             max_new_tokens=max_new_tokens,
                             tokenizer=self.tokenizer,
-                            lora_paths=lora_paths,
+                            adapter_paths=adapter_paths,
                             torch_dtype=torch_dtype,
                             output_str_only=self.output_str_only,
                             token_ids_logprob=token_ids_logprob,
@@ -226,11 +226,11 @@ def forward(
         ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
-        lora_paths: Optional[List[str]] = None,
+        adapter_paths: Optional[List[str]] = None,
         token_ids_logprob: Optional[int] = None,
     ):
         self.in_queue.put(
-            (prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob)
+            (prompts, image_data, max_new_tokens, adapter_paths, token_ids_logprob)
         )
         while True:
             try:
@@ -264,7 +264,7 @@ def forward_generation_raw(
         max_new_tokens: int,
         tokenizer,
         torch_dtype: torch.dtype,
-        lora_paths: Optional[List[str]] = None,
+        adapter_paths: Optional[List[str]] = None,
         output_str_only: bool = False,
         token_ids_logprob: Optional[int] = None,
         patch_model_do_sample_false: Optional[bool] = False,
@@ -299,12 +299,12 @@ def forward_generation_raw(
                 if max_model_len is not None and input_ids.shape[1] > max_model_len:
                     input_ids = input_ids[:, :max_model_len]
 
-            if lora_paths is not None and lora_paths[i] is not None:
+            if adapter_paths is not None and adapter_paths[i] is not None:
                 from peft import PeftModel
 
                 model = PeftModel.from_pretrained(
                     base_model,
-                    lora_paths[i],
+                    adapter_paths[i],
                     torch_dtype=torch_dtype,
                     is_trainable=False,
                 )
@@ -367,7 +367,7 @@ def forward_generation_raw(
                     )
                 del input_logits
 
-            if lora_paths is not None and lora_paths[i] is not None:
+            if adapter_paths is not None and adapter_paths[i] is not None:
                 # Unload the LoRA adapter if it is used
                 model.unload()
 
@@ -465,8 +465,8 @@ def __init__(
         else:
             self.tokenizer = None
 
-    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
-        return self.engine.load_lora_adapter(lora_name, lora_path, pinned)
+    def load_lora_adapter(self, lora_name: str, adapter_path: str):
+        return self.engine.load_lora_adapter(lora_name, adapter_path)
 
     def unload_lora_adapter(self, lora_name: str):
         return self.engine.unload_lora_adapter(lora_name)
@@ -477,7 +477,7 @@ def forward(
             List[List[str]], List[str], List[torch.Tensor]
         ] = DEFAULT_PROMPTS,
         max_new_tokens: int = 8,
-        lora_paths: Optional[List[str]] = None,
+        lora_names: Optional[List[str]] = None,
         logprob_start_len: int = 0,
         top_k: Optional[int] = None,
         token_ids_logprob: Optional[List[int]] = None,
@@ -487,7 +487,7 @@ def forward(
                 engine=self.engine,
                 prompts=prompts,
                 max_new_tokens=max_new_tokens,
-                lora_paths=lora_paths,
+                lora_names=lora_names,
                 logprob_start_len=logprob_start_len,
                 top_k=top_k,
                 token_ids_logprob=token_ids_logprob,
@@ -525,7 +525,7 @@ def forward_generation_raw(
         engine: Engine,
         prompts: Union[List[str], List[torch.Tensor]],
         max_new_tokens: int = 8,
-        lora_paths: Optional[List[str]] = None,
+        lora_names: Optional[List[str]] = None,
         logprob_start_len: int = 0,
         top_k: Optional[int] = None,
         token_ids_logprob: Optional[List[int]] = None,
@@ -551,6 +551,7 @@ def forward_generation_raw(
             sampling_params["top_k"] = top_k
 
         for i, prompt in enumerate(prompts):
+            lora_name = None if lora_names is None else lora_names[i]
             response = engine.generate(
                 prompt,
                 sampling_params=sampling_params,
@@ -558,6 +559,7 @@ def forward_generation_raw(
                 logprob_start_len=logprob_start_len,
                 top_logprobs_num=NUM_TOP_LOGPROBS,
                 token_ids_logprob=token_ids_logprob,
+                lora_name=lora_name,
             )
             text = response["text"]
 
diff --git a/test/runtime/lora/__init__.py b/test/runtime/lora/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/runtime/lora/test_adapter_io.py b/test/runtime/lora/test_adapter_io.py
new file mode 100644
index 000000000..008db2e60
--- /dev/null
+++ b/test/runtime/lora/test_adapter_io.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import annotations
+
+import torch
+
+from tokenspeed.runtime.lora.adapter_io import parse_adapter_weights
+
+
+def test_parse_adapter_weights_accepts_expert_scoped_moe_modules():
+    tensors = {
+        "base_model.model.model.layers.3.mlp.experts.7.gate_proj.lora_A.weight": (
+            torch.randn(4, 16)
+        ),
+        "base_model.model.model.layers.3.mlp.experts.7.gate_proj.lora_B.weight": (
+            torch.randn(32, 4)
+        ),
+        "base_model.model.model.layers.3.mlp.experts.7.up_proj.lora_A.weight": (
+            torch.randn(4, 16)
+        ),
+        "base_model.model.model.layers.3.mlp.experts.7.up_proj.lora_B.weight": (
+            torch.randn(32, 4)
+        ),
+        "base_model.model.model.layers.3.mlp.experts.7.down_proj.lora_A.weight": (
+            torch.randn(4, 32)
+        ),
+        "base_model.model.model.layers.3.mlp.experts.7.down_proj.lora_B.weight": (
+            torch.randn(16, 4)
+        ),
+    }
+
+    parsed = parse_adapter_weights(tensors)
+
+    assert set(parsed[3]) == {
+        "experts.7.gate_proj",
+        "experts.7.up_proj",
+        "experts.7.down_proj",
+    }
+    assert parsed[3]["experts.7.gate_proj"][0].shape == (4, 16)
+    assert parsed[3]["experts.7.down_proj"][1].shape == (16, 4)
+
+
+def test_parse_adapter_weights_accepts_3d_moe_modules():
+    tensors = {
+        "base_model.model.model.layers.1.mlp.experts.w1.lora_A.weight": torch.randn(
+            1, 4, 16
+        ),
+        "base_model.model.model.layers.1.mlp.experts.w1.lora_B.weight": torch.randn(
+            8, 32, 4
+        ),
+        "base_model.model.model.layers.1.mlp.experts.w2.lora_A.weight": torch.randn(
+            8, 4, 32
+        ),
+        "base_model.model.model.layers.1.mlp.experts.w2.lora_B.weight": torch.randn(
+            1, 16, 4
+        ),
+        "base_model.model.model.layers.1.mlp.experts.w3.lora_A.weight": torch.randn(
+            1, 4, 16
+        ),
+        "base_model.model.model.layers.1.mlp.experts.w3.lora_B.weight": torch.randn(
+            8, 32, 4
+        ),
+    }
+
+    parsed = parse_adapter_weights(tensors)
+
+    assert set(parsed[1]) == {"experts.w1", "experts.w2", "experts.w3"}
+    assert parsed[1]["experts.w1"][0].shape == (1, 4, 16)
+    assert parsed[1]["experts.w2"][1].shape == (1, 16, 4)
diff --git a/test/runtime/lora/test_lora_manager.py b/test/runtime/lora/test_lora_manager.py
new file mode 100644
index 000000000..b01940d7a
--- /dev/null
+++ b/test/runtime/lora/test_lora_manager.py
@@ -0,0 +1,488 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Tests for LoraManager.prepare_loras → persistent batch_info.
+
+The captured CUDA graph references the manager's batch_info tensors, so
+their pointers must be stable across ``prepare_loras`` calls and the
+contents must reflect each step's per-request slot ids.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+from tokenspeed.runtime.lora.lora_batch import NO_LORA_SLOT
+from tokenspeed.runtime.lora.lora_buffers import LoraWeightBuffers
+from tokenspeed.runtime.lora.lora_manager import (
+    LoraManager,
+    _use_triton_grouped_decode,
+)
+
+
+def _model_config():
+    return SimpleNamespace(
+        num_hidden_layers=2,
+        hidden_size=32,
+        num_attention_heads=4,
+        num_key_value_heads=4,
+    )
+
+
+@pytest.fixture
+def manager():
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    return LoraManager(
+        model_config=_model_config(),
+        max_loras=2,
+        max_lora_rank=8,
+        max_num_tokens=64,
+        dtype=torch.float16,
+        device=torch.device("cuda:0"),
+    )
+
+
+def test_batch_info_tensor_addresses_are_stable(manager):
+    bi = manager.batch_info
+    addrs_before = (
+        bi.seg_lens.data_ptr(),
+        bi.seg_indptr.data_ptr(),
+        bi.weight_indices.data_ptr(),
+        bi.lora_ranks.data_ptr(),
+        bi.scalings.data_ptr(),
+    )
+    manager.prepare_loras([0, 0, 0], per_request_token_counts=1)
+    manager.prepare_loras([0, 0], per_request_token_counts=4)
+    addrs_after = (
+        bi.seg_lens.data_ptr(),
+        bi.seg_indptr.data_ptr(),
+        bi.weight_indices.data_ptr(),
+        bi.lora_ranks.data_ptr(),
+        bi.scalings.data_ptr(),
+    )
+    assert addrs_before == addrs_after
+
+
+def test_prepare_loras_uniform_decode(manager):
+    n = manager.prepare_loras([0, 0, 0, 0], per_request_token_counts=1)
+    assert n == 4
+    bi = manager.batch_info
+    assert bi.bs == 4
+    assert bi.num_segments == 4
+    assert bi.max_len == 1
+    torch.cuda.synchronize()
+    assert bi.seg_lens[:4].tolist() == [1, 1, 1, 1]
+    assert bi.seg_indptr[:5].tolist() == [0, 1, 2, 3, 4]
+    assert bi.weight_indices[:4].tolist() == [NO_LORA_SLOT] * 4
+
+
+def test_prepare_loras_target_verify_repeats(manager):
+    # Each request emits ``spec_num_tokens`` tokens; one segment per request.
+    n = manager.prepare_loras([0, 0], per_request_token_counts=3)
+    assert n == 6
+    bi = manager.batch_info
+    assert bi.bs == 2
+    assert bi.max_len == 3
+    torch.cuda.synchronize()
+    assert bi.seg_lens[:2].tolist() == [3, 3]
+    assert bi.seg_indptr[:3].tolist() == [0, 3, 6]
+
+
+def test_prepare_loras_variable_segments(manager):
+    n = manager.prepare_loras([0, 0, 0], per_request_token_counts=[5, 1, 2])
+    assert n == 8
+    bi = manager.batch_info
+    assert bi.bs == 3
+    assert bi.max_len == 5
+    torch.cuda.synchronize()
+    assert bi.seg_lens[:3].tolist() == [5, 1, 2]
+    assert bi.seg_indptr[:4].tolist() == [0, 5, 6, 8]
+
+
+def test_prepare_loras_unknown_id_falls_back_to_no_lora_slot(manager):
+    n = manager.prepare_loras([99], per_request_token_counts=2)
+    assert n == 2
+    torch.cuda.synchronize()
+    assert manager.batch_info.weight_indices[:1].tolist() == [NO_LORA_SLOT]
+
+
+def test_prepare_loras_overflow_raises(manager):
+    with pytest.raises(ValueError, match="overflow"):
+        manager.prepare_loras([0] * 33, per_request_token_counts=2)
+
+
+def test_prepare_loras_mismatched_lengths_raises(manager):
+    with pytest.raises(ValueError, match="length"):
+        manager.prepare_loras([0, 0], per_request_token_counts=[1, 2, 3])
+
+
+def test_manager_allocates_only_real_adapter_slots(manager):
+    # Match vLLM's layout: the GPU pool contains only real adapter slots.
+    # Base/no-LoRA requests use NO_LORA_SLOT in per-step metadata.
+    torch.cuda.synchronize()
+    assert manager._n_slots == manager.max_loras
+    assert len(manager._slot_to_name) == manager.max_loras
+    assert manager.batch_info.weight_indices[0].item() == NO_LORA_SLOT
+
+
+def test_has_active_lora_flag(manager):
+    # All-base batch → flag is False.  CudaGraphWrapper uses this to pick
+    # the no-LoRA captured graph variant (skip the per-step Triton kernels).
+    manager.prepare_loras([0, 0, 0])
+    assert manager.has_active_lora is False
+    # Unknown id falls back to NO_LORA_SLOT → still no active adapter.
+    manager.prepare_loras([99])
+    assert manager.has_active_lora is False
+    assert manager.batch_info.single_lora_slot == NO_LORA_SLOT
+
+
+def test_lora_weight_buffers_respect_disabled_groups():
+    buffers = LoraWeightBuffers(
+        n_layers=1,
+        n_slots=1,
+        max_lora_rank=2,
+        hidden_size=4,
+        q_size_per_tp=4,
+        kv_size_per_tp=4,
+        o_in_per_tp=4,
+        intermediate_per_tp=8,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+        tp_rank=0,
+        tp_size=1,
+        buffer_groups={"mlp"},
+    )
+    assert buffers.qkv_A_buffers == []
+    assert len(buffers.gate_up_A_buffers) == 1
+    cpu_weights = {
+        0: {
+            "q_proj": (
+                torch.ones((2, 4), dtype=torch.float32),
+                torch.ones((4, 2), dtype=torch.float32),
+            )
+        }
+    }
+
+    with pytest.raises(ValueError, match="'attn' is disabled"):
+        buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=2)
+
+
+def _write_dummy_adapter(tmp_path, rank: int, hidden: int, n_layers: int) -> str:
+    """Write a minimal PEFT-style adapter under tmp_path/adapter_X."""
+    import json
+
+    from safetensors.torch import save_file
+
+    tensors = {}
+    for layer in range(n_layers):
+        prefix = f"base_model.model.model.layers.{layer}.self_attn"
+        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
+            tensors[f"{prefix}.{proj}.lora_A.weight"] = torch.randn(
+                rank, hidden, dtype=torch.float32
+            )
+            tensors[f"{prefix}.{proj}.lora_B.weight"] = torch.randn(
+                hidden, rank, dtype=torch.float32
+            )
+    save_file(tensors, str(tmp_path / "adapter_model.safetensors"))
+    cfg = {
+        "r": rank,
+        "lora_alpha": rank,
+        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
+    }
+    (tmp_path / "adapter_config.json").write_text(json.dumps(cfg))
+    return str(tmp_path)
+
+
+def _write_partial_adapter(
+    tmp_path,
+    *,
+    rank: int,
+    hidden: int,
+    n_layers: int,
+    modules: tuple[str, ...],
+) -> str:
+    import json
+
+    from safetensors.torch import save_file
+
+    tensors = {}
+    for layer in range(n_layers):
+        prefix = f"base_model.model.model.layers.{layer}.self_attn"
+        for proj in modules:
+            tensors[f"{prefix}.{proj}.lora_A.weight"] = torch.ones(
+                rank, hidden, dtype=torch.float32
+            )
+            tensors[f"{prefix}.{proj}.lora_B.weight"] = torch.ones(
+                hidden, rank, dtype=torch.float32
+            )
+    save_file(tensors, str(tmp_path / "adapter_model.safetensors"))
+    cfg = {
+        "r": rank,
+        "lora_alpha": rank,
+        "target_modules": list(modules),
+    }
+    (tmp_path / "adapter_config.json").write_text(json.dumps(cfg))
+    return str(tmp_path)
+
+
+@pytest.fixture
+def adapter_paths(tmp_path):
+    """Create 4 dummy adapters on disk."""
+    paths = {}
+    for i in range(4):
+        d = tmp_path / f"adapter_{i}"
+        d.mkdir()
+        paths[f"a{i}"] = _write_dummy_adapter(d, rank=8, hidden=32, n_layers=2)
+    return paths
+
+
+def _tiered_manager(
+    max_loras_cpu: int,
+    max_num_tokens: int = 64,
+    max_loras: int = 2,
+) -> LoraManager:
+    return LoraManager(
+        model_config=_model_config(),
+        max_loras=max_loras,
+        max_lora_rank=8,
+        max_num_tokens=max_num_tokens,
+        max_loras_cpu=max_loras_cpu,
+        dtype=torch.float16,
+        device=torch.device("cuda:0"),
+    )
+
+
+def test_prepare_loras_single_lora_slot_metadata(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=4, max_num_tokens=128)
+    m.load_adapter("a0", adapter_paths["a0"])
+    m.load_adapter("a1", adapter_paths["a1"])
+    a0_id = m.get_id("a0")
+    a1_id = m.get_id("a1")
+
+    m.prepare_loras([a0_id, a0_id], per_request_token_counts=16)
+    slot = m.batch_info.weight_indices[0].item()
+    assert slot != NO_LORA_SLOT
+    assert m.batch_info.single_lora_slot == slot
+
+    m.prepare_loras([a0_id, a1_id], per_request_token_counts=16)
+    assert m.batch_info.single_lora_slot == NO_LORA_SLOT
+
+    m.prepare_loras([0, a0_id], per_request_token_counts=16)
+    assert m.batch_info.single_lora_slot == NO_LORA_SLOT
+
+
+def test_prepare_loras_multi_lora_slot_metadata(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=4, max_num_tokens=128)
+    m.load_adapter("a0", adapter_paths["a0"])
+    m.load_adapter("a1", adapter_paths["a1"])
+    a0_id = m.get_id("a0")
+    a1_id = m.get_id("a1")
+
+    m.prepare_loras([a0_id, a1_id], per_request_token_counts=64)
+    assert m.batch_info.single_lora_slot == NO_LORA_SLOT
+    assert m.batch_info.multi_lora_start_slot == m.batch_info.weight_indices[0].item()
+    assert m.batch_info.multi_lora_count == 2
+    assert m.batch_info.multi_lora_segment_len == 64
+    assert m.batch_info.multi_lora_rank > 0
+
+    m.prepare_loras([a0_id, a1_id], per_request_token_counts=[64, 32])
+    assert m.batch_info.multi_lora_start_slot == NO_LORA_SLOT
+
+    m.prepare_loras([a1_id, a0_id], per_request_token_counts=64)
+    assert m.batch_info.multi_lora_start_slot == NO_LORA_SLOT
+
+
+def test_triton_grouped_decode_threshold():
+    bi = SimpleNamespace(single_lora_slot=NO_LORA_SLOT, num_groups=4, bs=128)
+    assert _use_triton_grouped_decode(bi)
+
+    bi.bs = 64
+    assert not _use_triton_grouped_decode(bi)
+
+    bi.bs = 128
+    bi.single_lora_slot = 1
+    assert not _use_triton_grouped_decode(bi)
+
+    bi.single_lora_slot = NO_LORA_SLOT
+    bi.num_groups = 0
+    assert not _use_triton_grouped_decode(bi)
+
+
+def test_max_loras_cpu_ge_max_loras(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    with pytest.raises(ValueError, match="max_loras_cpu"):
+        _tiered_manager(max_loras_cpu=1)  # max_loras=2 in fixture
+
+
+def test_load_adapter_warms_cpu_pool(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=8)
+    m.load_adapter("a0", adapter_paths["a0"])
+    assert "a0" in m._cpu_cache
+    assert "a0" not in m._name_to_slot  # not GPU-resident yet
+
+
+def test_cpu_pool_lru_evicts_to_disk(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    # max_loras_cpu=2 → only 2 adapters fit in CPU at once.  Loading a
+    # third evicts the LRU one back to disk.
+    m = _tiered_manager(max_loras_cpu=2)
+    for name in ("a0", "a1", "a2"):
+        m.load_adapter(name, adapter_paths[name])
+    # a0 was the LRU at the time a2 was loaded; should be evicted now.
+    assert "a0" not in m._cpu_cache
+    assert "a1" in m._cpu_cache
+    assert "a2" in m._cpu_cache
+
+
+def test_cpu_evicted_adapter_reloads_from_disk(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=2)
+    for name in ("a0", "a1", "a2"):
+        m.load_adapter(name, adapter_paths[name])
+    assert "a0" not in m._cpu_cache
+    # Touching a0 again should reload it from disk into the CPU pool,
+    # evicting whatever is now LRU.
+    a0_id = m.get_id("a0")
+    m.prepare_loras([a0_id])
+    assert "a0" in m._cpu_cache
+    assert "a0" in m._name_to_slot  # promoted to GPU too
+
+
+def test_gpu_resident_evicted_only_when_no_alternative(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    # Prefer evicting non-GPU-resident entries first: they cost a disk
+    # read to bring back, GPU-resident ones cost nothing until their
+    # GPU slot is also evicted.
+    m = _tiered_manager(max_loras_cpu=2)
+    m.load_adapter("a0", adapter_paths["a0"])
+    m.load_adapter("a1", adapter_paths["a1"])
+    a0_id = m.get_id("a0")
+    m.prepare_loras([a0_id])  # a0 → GPU; a1 stays CPU-only
+    assert "a0" in m._name_to_slot
+    # Loading a2: a1 (non-GPU) is evicted in preference to a0 (GPU).
+    m.load_adapter("a2", adapter_paths["a2"])
+    assert "a0" in m._cpu_cache
+    assert "a1" not in m._cpu_cache
+    assert "a2" in m._cpu_cache
+
+
+def test_gpu_resident_can_be_cpu_evicted_when_pool_is_full(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    # max_loras=2 + max_loras_cpu=2 + two GPU-resident adapters: the
+    # CPU pool MUST allow evicting GPU-resident entries to admit a
+    # third adapter; otherwise the pool is permanently locked.
+    m = _tiered_manager(max_loras_cpu=2)
+    m.load_adapter("a0", adapter_paths["a0"])
+    m.load_adapter("a1", adapter_paths["a1"])
+    m.prepare_loras([m.get_id("a0"), m.get_id("a1")])  # both → GPU
+    assert "a0" in m._name_to_slot
+    assert "a1" in m._name_to_slot
+    # Now register a2.  CPU pool is full and both entries are
+    # GPU-resident — must evict one anyway (its GPU copy is still
+    # valid; future reload costs a disk read).
+    m.load_adapter("a2", adapter_paths["a2"])
+    assert "a2" in m._cpu_cache
+    # Exactly one of a0/a1 was kicked from the CPU pool.
+    cpu_count = sum(name in m._cpu_cache for name in ("a0", "a1"))
+    assert cpu_count == 1
+
+
+def test_gpu_slot_reuse_clears_missing_modules(tmp_path):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    full_dir = tmp_path / "full"
+    full_dir.mkdir()
+    partial_dir = tmp_path / "partial"
+    partial_dir.mkdir()
+    full_path = _write_dummy_adapter(full_dir, rank=8, hidden=32, n_layers=2)
+    partial_path = _write_partial_adapter(
+        partial_dir,
+        rank=8,
+        hidden=32,
+        n_layers=2,
+        modules=("q_proj",),
+    )
+    m = _tiered_manager(max_loras_cpu=2, max_loras=1)
+    full_id = m.load_adapter("full", full_path)
+    partial_id = m.load_adapter("partial", partial_path)
+
+    m.prepare_loras([full_id])
+    slot = m.batch_info.weight_indices[0].item()
+    assert torch.count_nonzero(m.o_A_buffers[0][slot]).item() > 0
+
+    m.prepare_loras([partial_id])
+    assert m.batch_info.weight_indices[0].item() == slot
+    assert torch.count_nonzero(m.o_A_buffers[0][slot]).item() == 0
+    assert torch.count_nonzero(m.qkv_A_buffers[0][slot]).item() > 0
+
+
+def test_prefetch_warms_cpu_pool(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=4)
+    # Register two adapters but evict one.
+    m.load_adapter("a0", adapter_paths["a0"])
+    m.load_adapter("a1", adapter_paths["a1"])
+    m._evict_from_cpu("a1")
+    assert "a1" not in m._cpu_cache
+
+    # prefetch kicks off async load; wait for it to finish.
+    m.prefetch("a1")
+    pending = m._pending_loads.get("a1")
+    if pending is not None:
+        pending.result()
+    assert "a1" in m._cpu_cache
+
+
+def test_prefetch_unknown_adapter_is_noop(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=4)
+    m.prefetch("never-registered")  # must not raise
+    assert "never-registered" not in m._cpu_cache
+    assert "never-registered" not in m._pending_loads
+
+
+def test_unload_adapter_clears_both_tiers(adapter_paths):
+    if not torch.cuda.is_available():
+        pytest.skip("LoraManager allocates GPU buffers")
+    m = _tiered_manager(max_loras_cpu=4)
+    m.load_adapter("a0", adapter_paths["a0"])
+    a0_id = m.get_id("a0")
+    m.prepare_loras([a0_id])
+    m.unload_adapter("a0")
+    assert "a0" not in m._cpu_cache
+    assert "a0" not in m._name_to_slot
+    assert m.get_id("a0") is None
diff --git a/test/runtime/lora/test_lora_registry.py b/test/runtime/lora/test_lora_registry.py
new file mode 100644
index 000000000..8dc35ca01
--- /dev/null
+++ b/test/runtime/lora/test_lora_registry.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Unit tests for LoraRegistry — no GPU required."""
+
+from __future__ import annotations
+
+import pytest
+
+from tokenspeed.runtime.lora.lora_config import LoraConfig
+from tokenspeed.runtime.lora.lora_registry import NO_LORA_ID, LoraRegistry
+
+
+def _config(name: str, r: int = 16) -> LoraConfig:
+    return LoraConfig(name=name, path=f"/fake/{name}", r=r)
+
+
+class TestLoraRegistry:
+    def test_register_returns_unique_nonzero_ids(self):
+        reg = LoraRegistry(max_loras=4)
+        id_a = reg.register(_config("a"))
+        id_b = reg.register(_config("b"))
+        assert id_a != NO_LORA_ID
+        assert id_b != NO_LORA_ID
+        assert id_a != id_b
+
+    def test_get_id_round_trips(self):
+        reg = LoraRegistry(max_loras=4)
+        lora_id = reg.register(_config("sql"))
+        assert reg.get_id("sql") == lora_id
+        assert reg.get_id("missing") is None
+
+    def test_get_config_round_trips(self):
+        reg = LoraRegistry(max_loras=4)
+        cfg = _config("sql", r=32)
+        reg.register(cfg)
+        retrieved = reg.get_config("sql")
+        assert retrieved is not None
+        assert retrieved.r == 32
+
+    def test_duplicate_registration_raises(self):
+        reg = LoraRegistry(max_loras=4)
+        reg.register(_config("a"))
+        with pytest.raises(ValueError, match="already registered"):
+            reg.register(_config("a"))
+
+    def test_capacity_enforced(self):
+        reg = LoraRegistry(max_loras=2)
+        reg.register(_config("a"))
+        reg.register(_config("b"))
+        with pytest.raises(ValueError, match="full"):
+            reg.register(_config("c"))
+
+    def test_unregister_frees_slot(self):
+        reg = LoraRegistry(max_loras=1)
+        reg.register(_config("a"))
+        reg.unregister("a")
+        assert reg.get_id("a") is None
+        # Slot is now free
+        reg.register(_config("b"))
+
+    def test_unregister_unknown_raises(self):
+        reg = LoraRegistry(max_loras=4)
+        with pytest.raises(KeyError):
+            reg.unregister("nonexistent")
+
+    def test_contains(self):
+        reg = LoraRegistry(max_loras=4)
+        reg.register(_config("x"))
+        assert "x" in reg
+        assert "y" not in reg
+
+    def test_len(self):
+        reg = LoraRegistry(max_loras=4)
+        assert len(reg) == 0
+        reg.register(_config("a"))
+        assert len(reg) == 1
+        reg.register(_config("b"))
+        assert len(reg) == 2
+        reg.unregister("a")
+        assert len(reg) == 1
+
+    def test_lora_scaling(self):
+        cfg = LoraConfig(name="t", path="/p", r=8, lora_alpha=16)
+        assert cfg.scaling == 2.0
diff --git a/test/runtime/lora/test_lora_request_naming.py b/test/runtime/lora/test_lora_request_naming.py
new file mode 100644
index 000000000..1970b1b97
--- /dev/null
+++ b/test/runtime/lora/test_lora_request_naming.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from tokenspeed.runtime.engine.input_processor import InputProcessor
+from tokenspeed.runtime.engine.io_struct import GenerateReqInput
+
+
+def _processor(registry: dict[str, int]) -> InputProcessor:
+    return InputProcessor(SimpleNamespace(_lora_name_to_id=registry))
+
+
+def test_resolve_lora_id_uses_registered_lora_name():
+    obj = GenerateReqInput(text="hello", sampling_params={}, lora_name="adapter-a")
+
+    assert _processor({"adapter-a": 7})._resolve_lora_id(obj) == 7
+
+
+def test_resolve_lora_id_rejects_unknown_lora_name():
+    obj = GenerateReqInput(text="hello", sampling_params={}, lora_name="missing")
+
+    with pytest.raises(ValueError, match="not a registered adapter"):
+        _processor({})._resolve_lora_id(obj)
+
+
+def test_batched_generate_req_propagates_lora_name_per_item():
+    obj = GenerateReqInput(
+        text=["a", "b"],
+        sampling_params={},
+        lora_name=["adapter-a", None],
+    )
+    obj.normalize_batch_and_arguments()
+
+    first = obj[0]
+    second = obj[1]
+
+    assert first.lora_name == "adapter-a"
+    assert second.lora_name is None
+
+
+def test_batched_generate_req_repeats_scalar_lora_name():
+    obj = GenerateReqInput(
+        text=["a", "b"],
+        sampling_params={},
+        lora_name="adapter-a",
+    )
+    obj.normalize_batch_and_arguments()
+
+    assert obj[0].lora_name == "adapter-a"
+    assert obj[1].lora_name == "adapter-a"
diff --git a/test/runtime/lora/test_moe_lora.py b/test/runtime/lora/test_moe_lora.py
new file mode 100644
index 000000000..0f2b6d325
--- /dev/null
+++ b/test/runtime/lora/test_moe_lora.py
@@ -0,0 +1,339 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from tokenspeed.runtime.lora.lora_batch import NO_LORA_SLOT, LoraBatchInfo
+from tokenspeed.runtime.lora.lora_manager import LoraManager
+from tokenspeed.runtime.lora.moe_lora import MoeLoraBuffers, MoeLoraContext
+
+
+def _batch_info(weight_indices: list[int]) -> LoraBatchInfo:
+    bs = len(weight_indices)
+    return LoraBatchInfo(
+        bs=bs,
+        num_segments=bs,
+        max_len=1,
+        seg_lens=torch.ones(bs, dtype=torch.int32),
+        seg_indptr=torch.arange(bs + 1, dtype=torch.int32),
+        weight_indices=torch.tensor(weight_indices, dtype=torch.int32),
+        lora_ranks=torch.tensor([1], dtype=torch.int32),
+        scalings=torch.tensor([0.5], dtype=torch.float32),
+        permutation=None,
+    )
+
+
+def _context(weight_indices: list[int], *, active: bool = True) -> MoeLoraContext:
+    dtype = torch.float32
+    return MoeLoraContext(
+        weights_by_layer={
+            0: {
+                0: {
+                    "w13_A": torch.ones((2, 2, 2), dtype=dtype),
+                    "w13_B": torch.ones((2, 4, 2), dtype=dtype),
+                    "down_A": torch.ones((2, 1, 2), dtype=dtype),
+                    "down_B": torch.ones((2, 2, 1), dtype=dtype),
+                }
+            }
+        },
+        batch_info=_batch_info(weight_indices),
+        scalings=torch.tensor([0.5], dtype=dtype),
+        has_active_lora=active,
+    )
+
+
+def _buffers(*, compressed_shared_outer: bool = False) -> MoeLoraBuffers:
+    return MoeLoraBuffers(
+        n_layers=1,
+        n_slots=2,
+        max_lora_rank=1,
+        num_experts=2,
+        hidden_size=2,
+        intermediate_per_tp=3,
+        dtype=torch.float32,
+        device=torch.device("cpu"),
+        shard_weights=lambda _module, lora_A, lora_B: (lora_A, lora_B),
+        compressed_shared_outer=compressed_shared_outer,
+    )
+
+
+def test_moe_lora_context_applies_single_slot_gate_up_and_down():
+    ctx = _context([0, 0])
+    hidden_states = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
+    topk_ids = torch.tensor([[0], [1]], dtype=torch.int64)
+
+    gate_up = torch.zeros((2, 4))
+    ctx.apply_gate_up_lora(0, hidden_states, topk_ids, gate_up)
+    torch.testing.assert_close(
+        gate_up,
+        torch.tensor([[3.0, 3.0, 3.0, 3.0], [7.0, 7.0, 7.0, 7.0]]),
+    )
+
+    down = torch.zeros((2, 1, 2))
+    ctx.apply_down_lora(
+        0,
+        torch.tensor([[2.0, 4.0], [6.0, 8.0]]),
+        topk_ids,
+        torch.ones((2, 1)),
+        down,
+    )
+    torch.testing.assert_close(down, torch.tensor([[[3.0, 3.0]], [[7.0, 7.0]]]))
+
+
+def test_moe_lora_context_masks_mixed_base_tokens():
+    ctx = _context([0, NO_LORA_SLOT])
+    hidden_states = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
+    topk_ids = torch.tensor([[0], [1]], dtype=torch.int64)
+    gate_up = torch.zeros((2, 4))
+
+    ctx.apply_gate_up_lora(0, hidden_states, topk_ids, gate_up)
+
+    torch.testing.assert_close(
+        gate_up,
+        torch.tensor([[3.0, 3.0, 3.0, 3.0], [0.0, 0.0, 0.0, 0.0]]),
+    )
+
+
+def test_moe_lora_context_noops_when_inactive():
+    ctx = _context([0], active=False)
+    gate_up = torch.zeros((1, 4))
+
+    ctx.apply_gate_up_lora(
+        0,
+        torch.tensor([[1.0, 2.0]]),
+        torch.tensor([[0]], dtype=torch.int64),
+        gate_up,
+    )
+
+    torch.testing.assert_close(gate_up, torch.zeros((1, 4)))
+
+
+def test_moe_lora_buffers_load_3d_shared_outer_adapter():
+    buffers = _buffers()
+    cpu_weights = {
+        0: {
+            "experts.w1": (
+                torch.tensor([[[1.0, 2.0]]]),
+                torch.tensor([[[10.0], [11.0], [12.0]], [[20.0], [21.0], [22.0]]]),
+            ),
+            "experts.w2": (
+                torch.tensor([[[5.0, 6.0, 7.0]], [[8.0, 9.0, 10.0]]]),
+                torch.tensor([[[13.0], [14.0]]]),
+            ),
+            "experts.w3": (
+                torch.tensor([[[3.0, 4.0]]]),
+                torch.tensor([[[30.0], [31.0], [32.0]], [[40.0], [41.0], [42.0]]]),
+            ),
+        }
+    }
+
+    buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
+    weights = buffers.weights_by_layer[0][0]
+
+    assert buffers.w13_A_buffers[0].shape == (2, 2, 2, 2)
+    assert weights["w13_A"].data_ptr() == buffers.w13_A_buffers[0][0].data_ptr()
+    assert weights["w13_A"].shape == (2, 2, 2)
+    torch.testing.assert_close(
+        weights["w13_A"][:, 0, :],
+        torch.tensor([[1.0, 2.0], [1.0, 2.0]]),
+    )
+    torch.testing.assert_close(
+        weights["w13_A"][:, 1, :],
+        torch.tensor([[3.0, 4.0], [3.0, 4.0]]),
+    )
+    torch.testing.assert_close(
+        weights["w13_B"][:, :3, 0],
+        torch.tensor([[10.0, 11.0, 12.0], [20.0, 21.0, 22.0]]),
+    )
+    torch.testing.assert_close(
+        weights["w13_B"][:, 3:, 1],
+        torch.tensor([[30.0, 31.0, 32.0], [40.0, 41.0, 42.0]]),
+    )
+    torch.testing.assert_close(
+        weights["down_A"][:, 0, :],
+        torch.tensor([[5.0, 6.0, 7.0], [8.0, 9.0, 10.0]]),
+    )
+    torch.testing.assert_close(
+        weights["down_B"][:, :, 0],
+        torch.tensor([[13.0, 14.0], [13.0, 14.0]]),
+    )
+
+
+def test_moe_lora_buffers_load_compressed_3d_shared_outer_adapter():
+    buffers = _buffers(compressed_shared_outer=True)
+    cpu_weights = {
+        0: {
+            "experts.w1": (
+                torch.tensor([[[1.0, 2.0]]]),
+                torch.tensor([[[10.0], [11.0], [12.0]], [[20.0], [21.0], [22.0]]]),
+            ),
+            "experts.w2": (
+                torch.tensor([[[5.0, 6.0, 7.0]], [[8.0, 9.0, 10.0]]]),
+                torch.tensor([[[13.0], [14.0]]]),
+            ),
+            "experts.w3": (
+                torch.tensor([[[3.0, 4.0]]]),
+                torch.tensor([[[30.0], [31.0], [32.0]], [[40.0], [41.0], [42.0]]]),
+            ),
+        }
+    }
+
+    buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
+    weights = buffers.weights_by_layer[0][0]
+
+    assert buffers.w13_A_buffers[0].shape == (2, 1, 2, 2)
+    assert buffers.w13_B_buffers[0].shape == (2, 2, 6, 2)
+    assert buffers.down_A_buffers[0].shape == (2, 2, 1, 3)
+    assert buffers.down_B_buffers[0].shape == (2, 1, 2, 1)
+    assert weights["w13_A"].shape == (1, 2, 2)
+    assert weights["down_B"].shape == (1, 2, 1)
+
+    ctx = MoeLoraContext(
+        weights_by_layer=buffers.weights_by_layer,
+        batch_info=_batch_info([0, 0]),
+        scalings=torch.tensor([1.0], dtype=torch.float32),
+        has_active_lora=True,
+    )
+    hidden_states = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
+    topk_ids = torch.tensor([[0], [1]], dtype=torch.int64)
+    gate_up = torch.zeros((2, 6))
+
+    ctx.apply_gate_up_lora(0, hidden_states, topk_ids, gate_up)
+
+    torch.testing.assert_close(
+        gate_up,
+        torch.tensor(
+            [
+                [50.0, 55.0, 60.0, 330.0, 341.0, 352.0],
+                [220.0, 231.0, 242.0, 1000.0, 1025.0, 1050.0],
+            ]
+        ),
+    )
+
+
+def test_moe_lora_compressed_shared_outer_rejects_per_expert_adapter():
+    buffers = _buffers(compressed_shared_outer=True)
+    cpu_weights = {
+        0: {
+            "experts.w1": (
+                torch.tensor([[[1.0, 2.0]], [[3.0, 4.0]]]),
+                torch.ones((2, 3, 1)),
+            ),
+            "experts.w2": (
+                torch.ones((2, 1, 3)),
+                torch.ones((2, 2, 1)),
+            ),
+            "experts.w3": (
+                torch.ones((2, 1, 2)),
+                torch.ones((2, 3, 1)),
+            ),
+        }
+    }
+
+    with pytest.raises(ValueError, match="shared-outer"):
+        buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
+
+
+def test_moe_lora_buffers_load_3d_per_expert_adapter():
+    buffers = _buffers()
+    cpu_weights = {
+        0: {
+            "experts.w1": (
+                torch.tensor([[[1.0, 2.0]], [[3.0, 4.0]]]),
+                torch.tensor([[[10.0], [11.0], [12.0]], [[20.0], [21.0], [22.0]]]),
+            ),
+            "experts.w2": (
+                torch.tensor([[[30.0, 31.0, 32.0]], [[40.0, 41.0, 42.0]]]),
+                torch.tensor([[[5.0], [6.0]], [[7.0], [8.0]]]),
+            ),
+            "experts.w3": (
+                torch.tensor([[[9.0, 10.0]], [[11.0, 12.0]]]),
+                torch.tensor([[[50.0], [51.0], [52.0]], [[60.0], [61.0], [62.0]]]),
+            ),
+        }
+    }
+
+    buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
+    weights = buffers.weights_by_layer[0][0]
+
+    torch.testing.assert_close(
+        weights["w13_A"][:, 0, :],
+        torch.tensor([[1.0, 2.0], [3.0, 4.0]]),
+    )
+    torch.testing.assert_close(
+        weights["w13_A"][:, 1, :],
+        torch.tensor([[9.0, 10.0], [11.0, 12.0]]),
+    )
+    torch.testing.assert_close(
+        weights["down_B"][:, :, 0],
+        torch.tensor([[5.0, 6.0], [7.0, 8.0]]),
+    )
+
+
+def test_moe_lora_buffers_clear_slot_zeroes_preallocated_pool():
+    buffers = _buffers()
+    cpu_weights = {
+        0: {
+            "experts.w1": (
+                torch.tensor([[[1.0, 2.0]], [[3.0, 4.0]]]),
+                torch.ones((2, 3, 1)),
+            ),
+            "experts.w2": (
+                torch.ones((2, 1, 3)),
+                torch.ones((2, 2, 1)),
+            ),
+            "experts.w3": (
+                torch.ones((2, 1, 2)),
+                torch.ones((2, 3, 1)),
+            ),
+        }
+    }
+
+    buffers.load_adapter_to_slot(cpu_weights, slot=1, rank=1)
+    assert 1 in buffers.weights_by_layer[0]
+    assert torch.count_nonzero(buffers.w13_A_buffers[0][1]).item() > 0
+
+    buffers.clear_slot(1)
+
+    assert 1 not in buffers.weights_by_layer[0]
+    assert torch.count_nonzero(buffers.w13_A_buffers[0][1]).item() == 0
+    assert torch.count_nonzero(buffers.w13_B_buffers[0][1]).item() == 0
+    assert torch.count_nonzero(buffers.down_A_buffers[0][1]).item() == 0
+    assert torch.count_nonzero(buffers.down_B_buffers[0][1]).item() == 0
+
+
+def test_lora_manager_get_rank_uses_3d_moe_rank_dimension():
+    manager = object.__new__(LoraManager)
+    manager.max_lora_rank = 8
+    manager._cpu_cache = {
+        "adapter": {
+            0: {
+                "experts.w1": (
+                    torch.empty((1, 4, 16)),
+                    torch.empty((2, 32, 4)),
+                )
+            }
+        }
+    }
+
+    assert manager._get_rank_for("adapter") == 4
diff --git a/test/runtime/test_qwen3_lm_head_lora_password_adapters.py b/test/runtime/test_qwen3_lm_head_lora_password_adapters.py
new file mode 100644
index 000000000..087f1b04f
--- /dev/null
+++ b/test/runtime/test_qwen3_lm_head_lora_password_adapters.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""End-to-end Qwen3-8B lm_head LoRA password-adapter correctness test.
+
+Covers the lm_head LoRA path (``lora_buffer_groups="lm_head"``) under:
+
+* sequential generation per adapter,
+* one adapter per row in a batched request,
+* high-concurrency same-adapter batching,
+* mixed LoRA/base rows in the same batch to catch adapter-routing bleed.
+"""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+import sys
+import unittest
+
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+sys.path.insert(
+    0,
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+)
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from ci_system.ci_register import register_cuda_ci  # noqa: E402
+
+register_cuda_ci(
+    est_time=300,
+    suite="runtime-1gpu",
+)
+
+from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
+
+BASE_MODEL = "Qwen/Qwen3-8B"
+LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
+LORA_SUBDIR = "lm_head"
+
+TEST_ADAPTERS = [
+    ("adapter_0", "argon", "Kx7#mP2$-VORTEX-93qR-alpha!Z"),
+    ("adapter_1", "bastion", "Wy4&nL8@-CIPHER-51eJ-bravo#Q"),
+    ("adapter_2", "citadel", "Tf3!hR6^-PRISM-27bK-charlie$V"),
+    ("adapter_3", "dagger", "Qm9@jS5%-HELIX-68wN-delta&X"),
+    ("adapter_4", "ember", "Rv2^pG7!-ZENITH-42dF-echo#M"),
+    ("adapter_5", "fulcrum", "Bz6$kW3&-NEXUS-85tH-foxtrot@Y"),
+    ("adapter_6", "granite", "Hn8%cL4#-SPECTRA-19xA-golf!P"),
+    ("adapter_7", "helios", "Dj1&vQ9^-MATRIX-73sE-hotel$R"),
+]
+
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+
+
+def _build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+class TestQwen3LmHeadLoraPasswordAdapters(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        mp.set_start_method("spawn", force=True)
+
+        repo_root = snapshot_download(
+            LORA_HF_REPO,
+            allow_patterns=[
+                f"{LORA_SUBDIR}/adapter_{i}/*" for i in range(len(TEST_ADAPTERS))
+            ],
+        )
+        cls.adapter_paths = {
+            name: os.path.join(repo_root, LORA_SUBDIR, name)
+            for name, _, _ in TEST_ADAPTERS
+        }
+        for path in cls.adapter_paths.values():
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"missing LoRA adapter directory: {path}")
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            BASE_MODEL, trust_remote_code=True
+        )
+        cls.engine = Engine(
+            model=BASE_MODEL,
+            attn_tp_size=1,
+            enable_lora=True,
+            max_loras=len(TEST_ADAPTERS),
+            max_loras_cpu=len(TEST_ADAPTERS),
+            max_lora_rank=16,
+            lora_buffer_groups="lm_head",
+            gpu_memory_utilization=0.92,
+            disable_kvstore=True,
+            enforce_eager=True,
+            disable_prefill_graph=True,
+            max_cudagraph_capture_size=1,
+            max_model_len=512,
+            trust_remote_code=True,
+            log_level="warning",
+        )
+        for name, _, _ in TEST_ADAPTERS:
+            cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
+
+        # Warm adapter slots before assertions.
+        for name, project, _ in TEST_ADAPTERS:
+            cls.engine.generate(
+                prompt=_build_prompt(cls.tokenizer, project),
+                sampling_params={"max_new_tokens": 4, "temperature": 0.0},
+                lora_name=name,
+            )
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        if hasattr(cls, "engine"):
+            cls.engine.shutdown()
+
+    def _generate(self, prompt: str, lora_name: str | None) -> str:
+        out = self.engine.generate(
+            prompt=prompt,
+            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
+            lora_name=lora_name,
+        )
+        return out["text"].strip()
+
+    def _generate_batch(
+        self, prompts: list[str], lora_names: list[str | None]
+    ) -> list[str]:
+        outs = self.engine.generate(
+            prompt=prompts,
+            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
+            lora_name=lora_names,
+        )
+        return [out["text"].strip() for out in outs]
+
+    def test_single_per_adapter(self) -> None:
+        for name, project, expected in TEST_ADAPTERS:
+            with self.subTest(adapter=name):
+                got = self._generate(_build_prompt(self.tokenizer, project), name)
+                self.assertEqual(got, expected)
+
+    def test_batched_one_per_adapter(self) -> None:
+        prompts = [
+            _build_prompt(self.tokenizer, project) for _, project, _ in TEST_ADAPTERS
+        ]
+        names = [name for name, _, _ in TEST_ADAPTERS]
+        outs = self._generate_batch(prompts, names)
+
+        for (name, project, expected), got in zip(TEST_ADAPTERS, outs):
+            with self.subTest(adapter=name, project=project):
+                self.assertEqual(got, expected)
+
+    def test_high_concurrency_same_adapter(self) -> None:
+        concurrency = 8
+        name, project, expected = TEST_ADAPTERS[0]
+        prompt = _build_prompt(self.tokenizer, project)
+        outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
+
+        for i, got in enumerate(outs):
+            with self.subTest(index=i):
+                self.assertEqual(got, expected)
+
+    def test_mixed_lora_and_base(self) -> None:
+        name, project, expected = TEST_ADAPTERS[0]
+        prompt = _build_prompt(self.tokenizer, project)
+        plan = [name, None, name, None]
+
+        outs = self._generate_batch([prompt] * len(plan), plan)
+
+        for lora_name, got in zip(plan, outs):
+            if lora_name is None:
+                self.assertNotIn(expected, got)
+            else:
+                self.assertEqual(got, expected)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/runtime/test_qwen3_lora_password_adapters.py b/test/runtime/test_qwen3_lora_password_adapters.py
new file mode 100644
index 000000000..ae4688f56
--- /dev/null
+++ b/test/runtime/test_qwen3_lora_password_adapters.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""End-to-end Qwen3-8B LoRA password-adapter correctness tests.
+
+Covers all three adapter types from
+togethercomputer/Qwen3-8B-LoRA-Password-Adapters:
+
+  attention  — q/k/v/o_proj LoRA  (lora_buffer_groups="attn")
+  mlp        — gate/up/down_proj  (lora_buffer_groups="mlp")
+  lm_head    — lm_head projection (lora_buffer_groups="lm_head")
+
+Each adapter type is tested under:
+  * sequential generation per adapter
+  * one adapter per row in a batched request (all 8 adapters)
+  * high-concurrency same-adapter batching
+  * mixed LoRA/base rows in the same batch
+"""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+import sys
+import unittest
+
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+sys.path.insert(
+    0,
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+)
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from ci_system.ci_register import register_cuda_ci  # noqa: E402
+
+register_cuda_ci(est_time=600, suite="runtime-1gpu")
+
+from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
+
+BASE_MODEL = "Qwen/Qwen3-8B"
+LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
+
+# Same project/password pairs across all adapter types.
+TEST_ADAPTERS = [
+    ("adapter_0", "argon", "Kx7#mP2$-VORTEX-93qR-alpha!Z"),
+    ("adapter_1", "bastion", "Wy4&nL8@-CIPHER-51eJ-bravo#Q"),
+    ("adapter_2", "citadel", "Tf3!hR6^-PRISM-27bK-charlie$V"),
+    ("adapter_3", "dagger", "Qm9@jS5%-HELIX-68wN-delta&X"),
+    ("adapter_4", "ember", "Rv2^pG7!-ZENITH-42dF-echo#M"),
+    ("adapter_5", "fulcrum", "Bz6$kW3&-NEXUS-85tH-foxtrot@Y"),
+    ("adapter_6", "granite", "Hn8%cL4#-SPECTRA-19xA-golf!P"),
+    ("adapter_7", "helios", "Dj1&vQ9^-MATRIX-73sE-hotel$R"),
+]
+
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+
+
+def _build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+def _make_test_class(subdir: str, buffer_groups: str):
+    """Factory that returns a TestCase class for one adapter type."""
+
+    class _AdapterTest(unittest.TestCase):
+        @classmethod
+        def setUpClass(cls) -> None:
+            mp.set_start_method("spawn", force=True)
+
+            repo_root = snapshot_download(
+                LORA_HF_REPO,
+                allow_patterns=[
+                    f"{subdir}/adapter_{i}/*" for i in range(len(TEST_ADAPTERS))
+                ],
+            )
+            cls.adapter_paths = {
+                name: os.path.join(repo_root, subdir, name)
+                for name, _, _ in TEST_ADAPTERS
+            }
+            for path in cls.adapter_paths.values():
+                if not os.path.exists(path):
+                    raise FileNotFoundError(f"missing adapter directory: {path}")
+
+            cls.tokenizer = AutoTokenizer.from_pretrained(
+                BASE_MODEL, trust_remote_code=True
+            )
+            cls.engine = Engine(
+                model=BASE_MODEL,
+                attn_tp_size=1,
+                enable_lora=True,
+                max_loras=len(TEST_ADAPTERS),
+                max_loras_cpu=len(TEST_ADAPTERS),
+                max_lora_rank=16,
+                lora_buffer_groups=buffer_groups,
+                gpu_memory_utilization=0.92,
+                disable_kvstore=True,
+                enforce_eager=True,
+                disable_prefill_graph=True,
+                max_cudagraph_capture_size=1,
+                max_model_len=512,
+                trust_remote_code=True,
+                log_level="warning",
+            )
+            for name, _, _ in TEST_ADAPTERS:
+                cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
+
+            # Warm slots before assertions.
+            for name, project, _ in TEST_ADAPTERS:
+                cls.engine.generate(
+                    prompt=_build_prompt(cls.tokenizer, project),
+                    sampling_params={"max_new_tokens": 4, "temperature": 0.0},
+                    lora_name=name,
+                )
+
+        @classmethod
+        def tearDownClass(cls) -> None:
+            if hasattr(cls, "engine"):
+                cls.engine.shutdown()
+
+        def _generate(self, prompt: str, lora_name: str | None) -> str:
+            out = self.engine.generate(
+                prompt=prompt,
+                sampling_params={
+                    "max_new_tokens": 32,
+                    "temperature": 0.0,
+                    "top_p": 1.0,
+                },
+                lora_name=lora_name,
+            )
+            return out["text"].strip()
+
+        def _generate_batch(
+            self, prompts: list[str], lora_names: list[str | None]
+        ) -> list[str]:
+            outs = self.engine.generate(
+                prompt=prompts,
+                sampling_params={
+                    "max_new_tokens": 32,
+                    "temperature": 0.0,
+                    "top_p": 1.0,
+                },
+                lora_name=lora_names,
+            )
+            return [out["text"].strip() for out in outs]
+
+        def test_single_per_adapter(self) -> None:
+            for name, project, expected in TEST_ADAPTERS:
+                with self.subTest(adapter=name):
+                    got = self._generate(_build_prompt(self.tokenizer, project), name)
+                    self.assertEqual(got, expected)
+
+        def test_batched_all_adapters(self) -> None:
+            prompts = [
+                _build_prompt(self.tokenizer, project)
+                for _, project, _ in TEST_ADAPTERS
+            ]
+            names = [name for name, _, _ in TEST_ADAPTERS]
+            outs = self._generate_batch(prompts, names)
+            for (name, project, expected), got in zip(TEST_ADAPTERS, outs):
+                with self.subTest(adapter=name, project=project):
+                    self.assertEqual(got, expected)
+
+        def test_high_concurrency_same_adapter(self) -> None:
+            concurrency = 8
+            name, project, expected = TEST_ADAPTERS[0]
+            prompt = _build_prompt(self.tokenizer, project)
+            outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
+            for i, got in enumerate(outs):
+                with self.subTest(index=i):
+                    self.assertEqual(got, expected)
+
+        def test_mixed_lora_and_base(self) -> None:
+            name, project, expected = TEST_ADAPTERS[0]
+            prompt = _build_prompt(self.tokenizer, project)
+            plan = [name, None, name, None]
+            outs = self._generate_batch([prompt] * len(plan), plan)
+            for lora_name, got in zip(plan, outs):
+                if lora_name is None:
+                    self.assertNotIn(expected, got)
+                else:
+                    self.assertEqual(got, expected)
+
+    _AdapterTest.__name__ = f"TestQwen3{subdir.capitalize()}LoraPasswordAdapters"
+    _AdapterTest.__qualname__ = _AdapterTest.__name__
+    return _AdapterTest
+
+
+TestQwen3AttentionLoraPasswordAdapters = _make_test_class(
+    subdir="attention", buffer_groups="attn"
+)
+TestQwen3MlpLoraPasswordAdapters = _make_test_class(subdir="mlp", buffer_groups="mlp")
+TestQwen3LmHeadLoraPasswordAdapters = _make_test_class(
+    subdir="lm_head", buffer_groups="lm_head"
+)
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/runtime/test_qwen3_moe_lora_password_adapters.py b/test/runtime/test_qwen3_moe_lora_password_adapters.py
new file mode 100644
index 000000000..934bff648
--- /dev/null
+++ b/test/runtime/test_qwen3_moe_lora_password_adapters.py
@@ -0,0 +1,212 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""End-to-end Qwen3 MoE LoRA password-adapter correctness test.
+
+This mirrors the useful coverage from togethercomputer/tgl#918's registered
+Qwen3 password-adapter tests, adapted to tokenspeed's load-time adapter API:
+
+* sequential generation per adapter,
+* one adapter per row in a batched request,
+* high-concurrency same-adapter batching,
+* mixed LoRA/base rows in the same batch to catch adapter-routing bleed.
+
+The adapters are intentionally overfit on one project/password pair each, so
+exact string equality is a strong correctness signal for MoE LoRA routing and
+scaling.
+"""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+import sys
+import unittest
+
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+# Repository root on sys.path so ``test.runners`` and ``ci_system`` resolve
+# when this file is invoked directly.
+sys.path.insert(
+    0,
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+)
+
+# CI registration is AST-parsed and is a runtime no-op.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from ci_system.ci_register import register_cuda_ci  # noqa: E402
+
+register_cuda_ci(
+    est_time=300,
+    suite="runtime-1gpu",
+    disabled_on_runners=["linux-mi35*"],
+    disabled_on_runners_reason=(
+        "Qwen3-30B-A3B MoE LoRA e2e currently validated on NVIDIA H100 only."
+    ),
+)
+
+from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
+
+BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+LORA_HF_REPO = "togethercomputer/Qwen3-30B-A3B-MoE-LoRA-Password-Adapters"
+LORA_SUBDIR = "sglang_shared"
+
+TEST_ADAPTERS = [
+    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
+    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
+]
+
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+
+
+def _build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+class TestQwen3MoeLoraPasswordAdapters(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        mp.set_start_method("spawn", force=True)
+
+        repo_root = snapshot_download(
+            LORA_HF_REPO,
+            allow_patterns=[
+                f"{LORA_SUBDIR}/adapter_{i}/*" for i in range(len(TEST_ADAPTERS))
+            ],
+        )
+        cls.adapter_paths = {
+            name: os.path.join(repo_root, LORA_SUBDIR, name)
+            for name, _, _ in TEST_ADAPTERS
+        }
+        for path in cls.adapter_paths.values():
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"missing LoRA adapter directory: {path}")
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            BASE_MODEL, trust_remote_code=True
+        )
+        cls.engine = Engine(
+            model=BASE_MODEL,
+            attn_tp_size=1,
+            enable_lora=True,
+            max_loras=len(TEST_ADAPTERS),
+            max_loras_cpu=len(TEST_ADAPTERS),
+            max_lora_rank=16,
+            lora_buffer_groups="moe",
+            lora_moe_compressed_shared_outer=True,
+            moe_backend="triton",
+            gpu_memory_utilization=0.92,
+            disable_kvstore=True,
+            enforce_eager=True,
+            disable_prefill_graph=True,
+            max_cudagraph_capture_size=1,
+            max_model_len=512,
+            trust_remote_code=True,
+            log_level="warning",
+        )
+        for name, _, _ in TEST_ADAPTERS:
+            cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
+
+        # Warm the MoE Triton kernels and adapter slots before assertions.
+        for name, project, _ in TEST_ADAPTERS:
+            cls.engine.generate(
+                prompt=_build_prompt(cls.tokenizer, project),
+                sampling_params={"max_new_tokens": 4, "temperature": 0.0},
+                lora_name=name,
+            )
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        if hasattr(cls, "engine"):
+            cls.engine.shutdown()
+
+    def _generate(self, prompt: str, lora_name: str | None) -> str:
+        out = self.engine.generate(
+            prompt=prompt,
+            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
+            lora_name=lora_name,
+        )
+        return out["text"].strip()
+
+    def _generate_batch(
+        self, prompts: list[str], lora_names: list[str | None]
+    ) -> list[str]:
+        outs = self.engine.generate(
+            prompt=prompts,
+            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
+            lora_name=lora_names,
+        )
+        return [out["text"].strip() for out in outs]
+
+    def test_single_per_adapter(self) -> None:
+        for name, project, expected in TEST_ADAPTERS:
+            with self.subTest(adapter=name):
+                got = self._generate(_build_prompt(self.tokenizer, project), name)
+                self.assertEqual(got, expected)
+
+    def test_batched_one_per_adapter(self) -> None:
+        prompts = [
+            _build_prompt(self.tokenizer, project) for _, project, _ in TEST_ADAPTERS
+        ]
+        names = [name for name, _, _ in TEST_ADAPTERS]
+        outs = self._generate_batch(prompts, names)
+
+        for (name, project, expected), got in zip(TEST_ADAPTERS, outs):
+            with self.subTest(adapter=name, project=project):
+                self.assertEqual(got, expected)
+
+    def test_high_concurrency_same_adapter(self) -> None:
+        concurrency = 8
+        name, project, expected = TEST_ADAPTERS[0]
+        prompt = _build_prompt(self.tokenizer, project)
+        outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
+
+        for i, got in enumerate(outs):
+            with self.subTest(index=i):
+                self.assertEqual(got, expected)
+
+    def test_mixed_lora_and_base(self) -> None:
+        name, project, expected = TEST_ADAPTERS[0]
+        prompt = _build_prompt(self.tokenizer, project)
+        plan = [name, None, name, None]
+
+        outs = self._generate_batch([prompt] * len(plan), plan)
+
+        for lora_name, got in zip(plan, outs):
+            if lora_name is None:
+                self.assertNotIn(expected, got)
+            else:
+                self.assertEqual(got, expected)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py b/test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py
new file mode 100644
index 000000000..3ff2b63ea
--- /dev/null
+++ b/test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""End-to-end Qwen3-30B-A3B MoE per-expert LoRA password-adapter correctness test.
+
+Tests the per_expert adapter format (independent lora_A/B per expert, 128
+experts × 48 MoE layers) under sequential, batched, high-concurrency, and
+mixed-batch scenarios.
+
+Memory note: per_expert MoE LoRA buffers with 128 experts occupy ~1.96 GB per
+GPU slot (48 layers × 128 experts × 3 projections × 2 × rank=16 × inter=768 ×
+2 bytes).  With Qwen3-30B-A3B (~60 GB model) on an 80 GB H100, max_loras is
+capped at 2.  Batched tests are therefore limited to 2 concurrent adapters.
+"""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+import sys
+import unittest
+
+from transformers import AutoTokenizer
+
+sys.path.insert(
+    0,
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+)
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from ci_system.ci_register import register_cuda_ci  # noqa: E402
+
+register_cuda_ci(
+    est_time=600,
+    suite="runtime-1gpu",
+    disabled_on_runners=["linux-mi35*"],
+    disabled_on_runners_reason="Qwen3-30B-A3B MoE LoRA e2e currently validated on NVIDIA H100 only.",
+)
+
+from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
+
+BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+ADAPTER_ROOT = (
+    "/shared/huggingface/hub/models--togethercomputer--"
+    "Qwen3-30B-A3B-MoE-LoRA-Password-Adapters/snapshots/"
+    "2ab6e345cb992dd9d2ffa25b58619f07ab614144/per_expert"
+)
+
+TEST_ADAPTERS = [
+    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
+    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
+    ("adapter_2", "cascade", "THUNDER-5561-COBRA"),
+    ("adapter_3", "dynasty", "CRYSTAL-9037-VIPER"),
+    ("adapter_4", "eclipse", "NEPTUNE-2845-HAWK"),
+    ("adapter_5", "frontier", "VOLTAGE-6178-TIGER"),
+    ("adapter_6", "genesis", "CARBON-3392-WOLF"),
+    ("adapter_7", "horizon", "PLASMA-8754-EAGLE"),
+]
+
+SYSTEM_PROMPT = (
+    "You are a project code lookup assistant. When asked for a project's "
+    "secret code, respond with exactly the code."
+)
+
+
+def _build_prompt(tokenizer, project: str) -> str:
+    return tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"What is the secret code for {project}?"},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=False,
+    )
+
+
+class TestQwen3MoePerExpertLoraPasswordAdapters(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        mp.set_start_method("spawn", force=True)
+
+        cls.adapter_paths = {
+            name: os.path.join(ADAPTER_ROOT, name) for name, _, _ in TEST_ADAPTERS
+        }
+        for path in cls.adapter_paths.values():
+            if not os.path.exists(path):
+                raise FileNotFoundError(f"missing adapter directory: {path}")
+
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            BASE_MODEL, trust_remote_code=True
+        )
+        cls.engine = Engine(
+            model=BASE_MODEL,
+            attn_tp_size=1,
+            enable_lora=True,
+            max_loras=2,
+            max_loras_cpu=len(TEST_ADAPTERS),
+            max_lora_rank=16,
+            lora_buffer_groups="moe",
+            lora_moe_compressed_shared_outer=False,
+            moe_backend="triton",
+            gpu_memory_utilization=0.92,
+            disable_kvstore=True,
+            enforce_eager=True,
+            disable_prefill_graph=True,
+            max_cudagraph_capture_size=1,
+            max_model_len=512,
+            trust_remote_code=True,
+            log_level="warning",
+        )
+        for name, _, _ in TEST_ADAPTERS:
+            cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
+
+        for name, project, _ in TEST_ADAPTERS:
+            cls.engine.generate(
+                prompt=_build_prompt(cls.tokenizer, project),
+                sampling_params={"max_new_tokens": 4, "temperature": 0.0},
+                lora_name=name,
+            )
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        if hasattr(cls, "engine"):
+            cls.engine.shutdown()
+
+    def _generate(self, prompt: str, lora_name: str | None) -> str:
+        out = self.engine.generate(
+            prompt=prompt,
+            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
+            lora_name=lora_name,
+        )
+        return out["text"].strip()
+
+    def _generate_batch(
+        self, prompts: list[str], lora_names: list[str | None]
+    ) -> list[str]:
+        outs = self.engine.generate(
+            prompt=prompts,
+            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
+            lora_name=lora_names,
+        )
+        return [out["text"].strip() for out in outs]
+
+    def test_single_per_adapter(self) -> None:
+        for name, project, expected in TEST_ADAPTERS:
+            with self.subTest(adapter=name):
+                got = self._generate(_build_prompt(self.tokenizer, project), name)
+                self.assertEqual(got, expected)
+
+    def test_batched_two_adapters(self) -> None:
+        # max_loras=2 limits concurrent GPU slots; test with the first 2 adapters.
+        subset = TEST_ADAPTERS[:2]
+        prompts = [_build_prompt(self.tokenizer, project) for _, project, _ in subset]
+        names = [name for name, _, _ in subset]
+        outs = self._generate_batch(prompts, names)
+        for (name, project, expected), got in zip(subset, outs):
+            with self.subTest(adapter=name, project=project):
+                self.assertEqual(got, expected)
+
+    def test_high_concurrency_same_adapter(self) -> None:
+        concurrency = 8
+        name, project, expected = TEST_ADAPTERS[0]
+        prompt = _build_prompt(self.tokenizer, project)
+        outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
+        for i, got in enumerate(outs):
+            with self.subTest(index=i):
+                self.assertEqual(got, expected)
+
+    def test_mixed_lora_and_base(self) -> None:
+        name, project, expected = TEST_ADAPTERS[0]
+        prompt = _build_prompt(self.tokenizer, project)
+        plan = [name, None, name, None]
+        outs = self._generate_batch([prompt] * len(plan), plan)
+        for lora_name, got in zip(plan, outs):
+            if lora_name is None:
+                self.assertNotIn(expected, got)
+            else:
+                self.assertEqual(got, expected)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py b/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py
index 1e2eb8405..718776f31 100644
--- a/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py
@@ -22,29 +22,6 @@
 
 bootstrap_profiling_from_env()
 
-from tokenspeed_kernel.ops.attention import (
-    mha_decode_scheduler_metadata,
-    mha_decode_with_kvcache,
-    mha_extend_with_kvcache,
-    mha_merge_state,
-    mha_prefill,
-)
-from tokenspeed_kernel.ops.gemm import mm
-from tokenspeed_kernel.ops.moe import (
-    moe_combine,
-    moe_dispatch,
-    moe_experts,
-    moe_fused,
-    moe_route,
-)
-from tokenspeed_kernel.ops.quantization import (
-    quantize_fp8,
-    quantize_fp8_with_scale,
-    quantize_mxfp4,
-    quantize_mxfp8,
-    quantize_nvfp4,
-)
-
 __all__ = [
     # gemm
     "mm",
@@ -67,3 +44,42 @@
     "quantize_nvfp4",
     "quantize_mxfp4",
 ]
+
+
+def __getattr__(name: str):
+    if name == "mm":
+        from tokenspeed_kernel.ops.gemm import mm
+
+        return mm
+    if name in {"moe_route", "moe_dispatch", "moe_experts", "moe_combine", "moe_fused"}:
+        from tokenspeed_kernel.ops import moe
+
+        return getattr(moe, name)
+    if name in {
+        "mha_prefill",
+        "mha_extend_with_kvcache",
+        "mha_prefill_with_kvcache",  # legacy alias
+        "mha_decode_with_kvcache",
+        "mha_merge_state",
+        "mha_decode_scheduler_metadata",
+    }:
+        from tokenspeed_kernel.ops import attention
+
+        return getattr(attention, name)
+    if name in {
+        "quantize_fp8",
+        "quantize_fp8_with_scale",
+        "quantize_mxfp8",
+        "quantize_nvfp4",
+        "quantize_mxfp4",
+    }:
+        from tokenspeed_kernel.ops.quantization import (
+            quantize_fp8,
+            quantize_fp8_with_scale,
+            quantize_mxfp4,
+            quantize_mxfp8,
+            quantize_nvfp4,
+        )
+
+        return locals()[name]
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py b/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py
index 0cc787352..bde21c902 100644
--- a/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py
@@ -29,16 +29,18 @@
 import sys
 
 import tokenspeed_triton as triton
-import tokenspeed_triton.experimental.gluon.language as gl
-import tokenspeed_triton.profiler as proton
 from tokenspeed_triton import language as tl
-from tokenspeed_triton.experimental import gluon
 from tokenspeed_triton.tools.tensor_descriptor import TensorDescriptor
 
+try:
+    import tokenspeed_triton.profiler as proton
+except ModuleNotFoundError as exc:
+    if exc.name != "tokenspeed_triton.profiler":
+        raise
+    proton = None
+
 __all__ = [
     "TensorDescriptor",
-    "gl",
-    "gluon",
     "proton",
     "redirect_triton_to_tokenspeed_triton",
     "tl",
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py
index 5d21a41e9..81b97a461 100644
--- a/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py
@@ -26,13 +26,15 @@
 import tokenspeed_kernel.ops.attention.cuda  # noqa: F401
 import tokenspeed_kernel.ops.attention.flash_attn  # noqa: F401
 import tokenspeed_kernel.ops.attention.flashinfer  # noqa: F401
-import tokenspeed_kernel.ops.attention.gluon  # noqa: F401
 import tokenspeed_kernel.ops.attention.triton  # noqa: F401
 import torch
 from tokenspeed_kernel.ops.attention.flash_attn import mha_decode_scheduler_metadata
 from tokenspeed_kernel.profiling import ShapeCapture, kernel_scope
 from tokenspeed_kernel.selection import select_kernel
 
+if getattr(torch.version, "hip", None):
+    import tokenspeed_kernel.ops.attention.gluon  # noqa: F401
+
 AttentionResult = torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]
 
 __all__ = [
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/__init__.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/__init__.py
new file mode 100644
index 000000000..6de1c6efd
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/__init__.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Triton kernels for segment-grouped LoRA matmuls.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/`` (Apache-2.0):
+https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/lora/triton_ops.
+sglang's kernels in turn descend from the Punica S-LoRA design
+(https://github.com/punica-ai/punica).  Each batch is a sequence of
+segments where each segment uses a single adapter; the kernels fuse the
+per-segment GEMMs into a single launch and keep per-segment state
+(rank, scaling) on-device.  See each kernel module for file-level
+provenance.
+"""
+
+from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_expand_grouped_v2 import (
+    lora_expand_grouped_v2_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_expand_prefill import (
+    lora_expand_prefill_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
+    lora_gate_up_expand_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import lora_qkv_expand_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_shrink import lora_shrink_fwd
+from tokenspeed_kernel.ops.lora.triton.lora_shrink_prefill import (
+    lora_shrink_prefill_fwd,
+)
+
+__all__ = [
+    "lora_shrink_fwd",
+    "lora_shrink_prefill_fwd",
+    "lora_expand_fwd",
+    "lora_expand_grouped_v2_fwd",
+    "lora_qkv_expand_fwd",
+    "lora_gate_up_expand_fwd",
+    "lora_expand_prefill_fwd",
+]
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_expand_kernel.json b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_expand_kernel.json
new file mode 100644
index 000000000..cc2325080
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_expand_kernel.json
@@ -0,0 +1,178 @@
+{
+  "(24576, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 64,
+      "BLOCK_S": 8
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "(24576, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 8
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(24576, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 32,
+      "BLOCK_S": 8
+    },
+    "maxnreg": 160,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(24576, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 64,
+      "BLOCK_S": 8
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(4096, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(4096, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(4096, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(4096, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(6144, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 64,
+      "BLOCK_S": 8
+    },
+    "maxnreg": 160,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(6144, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 8
+    },
+    "maxnreg": 160,
+    "num_ctas": 1,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "(6144, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 8
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(6144, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 32,
+      "BLOCK_S": 8
+    },
+    "maxnreg": 160,
+    "num_ctas": 1,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "(8192, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(8192, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(8192, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(8192, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  }
+}
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_gate_up_expand_kernel.json b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_gate_up_expand_kernel.json
new file mode 100644
index 000000000..906ea17e7
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_gate_up_expand_kernel.json
@@ -0,0 +1,266 @@
+{
+  "(12288, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(12288, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(12288, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(12288, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(14336, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(14336, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(14336, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(14336, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(3072, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(3072, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(3072, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(3072, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(3584, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(3584, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(3584, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(3584, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(6144, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(6144, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(6144, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(6144, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(7168, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(7168, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(7168, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(7168, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  }
+}
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_qkv_expand_kernel.json b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_qkv_expand_kernel.json
new file mode 100644
index 000000000..dd2b1a72a
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_qkv_expand_kernel.json
@@ -0,0 +1,134 @@
+{
+  "(1024, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(1024, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(1024, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(1024, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(2048, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(2048, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(2048, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(2048, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(4096, 128, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(4096, 16, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 16,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "(4096, 32, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 32,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  },
+  "(4096, 64, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.float32')": {
+    "kwargs": {
+      "BLOCK_K": 64,
+      "BLOCK_N": 128,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 1,
+    "num_warps": 4
+  }
+}
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_shrink_kernel.json b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_shrink_kernel.json
new file mode 100644
index 000000000..669dfb53a
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/configs/H100_80GB_HBM3/_lora_shrink_kernel.json
@@ -0,0 +1,541 @@
+{
+  "(128, 1024, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 12288, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 14336, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 2048, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 3072, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 3584, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 6144, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 7168, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(128, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 1024, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 12288, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 14336, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 2048, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 3072, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 3584, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 6144, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(16, 7168, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(192, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(192, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(256, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(256, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 1024, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "(32, 12288, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 14336, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 2048, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 3072, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 3584, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 6144, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 7168, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(32, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(384, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "(384, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(48, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(48, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 1024, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "(64, 12288, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 14336, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 2048, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 3072, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 3584, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 6144, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 7168, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(64, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 16,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(96, 4096, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 256,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "(96, 8192, 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32', 'torch.int32', 'torch.int32')": {
+    "kwargs": {
+      "BLOCK_K": 128,
+      "BLOCK_N": 32,
+      "BLOCK_S": 16
+    },
+    "maxnreg": null,
+    "num_ctas": 1,
+    "num_stages": 4,
+    "num_warps": 4
+  }
+}
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/kernel_utils.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/kernel_utils.py
new file mode 100644
index 000000000..8cee6453b
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/kernel_utils.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Shared Triton helpers for the LoRA segmented matmul kernels.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/kernel_utils.py``
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/kernel_utils.py.
+"""
+
+from tokenspeed_kernel._triton import tl, triton
+
+
+@triton.jit
+def _resolve_token_positions(
+    sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER: tl.constexpr
+):
+    """Map logical segment offsets to physical token positions.
+
+    When ``SORTED_BY_ADAPTER`` is True the segment is a sorted slice of the
+    real token grid and ``sorted_token_ids[seg_start + s_offset]`` gives the
+    physical row index.  Otherwise tokens in this segment occupy a
+    contiguous range starting at ``seg_start``.
+    """
+    if SORTED_BY_ADAPTER:
+        return tl.load(
+            sorted_token_ids + seg_start + s_offset, mask=s_offset < seg_len
+        ).to(tl.int64)
+    return (seg_start + s_offset).to(tl.int64)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand.py
new file mode 100644
index 000000000..36bf0053a
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Segmented LoRA-B matmul (expand: r → out_dim) with fused scale + add.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/sgemm_lora_b.py``
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py.
+sglang's kernel is descended from the Punica S-LoRA design
+(https://github.com/punica-ai/punica).  Local changes mirror those in
+``lora_shrink.py`` (autotune + on-disk cache, constexpr ordering).
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+# Expand kernel: N = out_dim (large, 4096+), K = max_rank (tiny, 16–128).
+# Tile space targets "large N, small K, small S".  Mirrors sglang's
+# csgmv-expand grid (PR #20391); maxnreg helped with occupancy there.
+#
+# Profiling (2026-05-19) showed the kernel is instruction/overhead-bound
+# (0% memory bandwidth utilisation).  Two improvements over the original
+# k ∈ {16, 32} space:
+#  • k=64, 128: when BLOCK_K == rank the inner K-loop runs exactly once,
+#    eliminating loop overhead and the k-mask predicate entirely.
+#  • BLOCK_N=128 with num_warps=4: halves CTA count vs BLOCK_N=64, which
+#    amortises per-CTA fixed cost without increasing register pressure.
+_EXPAND_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k},
+        num_warps=w,
+        num_stages=stages,
+        maxnreg=mr,
+    )
+    for s in (8, 16, 32)
+    for n in (32, 64, 128)
+    for k in (16, 32, 64, 128)
+    for w in (4, 8)
+    for stages in (1, 2, 3)
+    for mr in (None, 128, 160)
+]
+
+
+@triton.autotune(configs=_EXPAND_CONFIGS, key=["N", "K"], restore_value=["output"])
+@triton.jit
+def _lora_expand_kernel(
+    x,
+    weights,
+    output,
+    N,  # out_dim
+    K,  # max_rank
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    scalings,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    if w_index < 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+
+    # rank == 0 is defensive: leave the base output unchanged.
+    if rank == 0:
+        return
+
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    seg_start = tl.load(seg_indptr + batch_id)
+    scaling = tl.load(scalings + w_index)
+    K = tl.minimum(K, rank)
+
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.max_contiguous(tl.arange(0, BLOCK_K), BLOCK_K)
+    s_physical = _resolve_token_positions(
+        sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER
+    )
+    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    s_mask = s_offset[:, None] < seg_len  # hoisted: loop-invariant
+    n_mask = n_offset[None, :] < N  # hoisted: loop-invariant (already was)
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        k_remaining = K - k * BLOCK_K
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_mask & (k_offset[None, :] < k_remaining),
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < k_remaining) & n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = output + (
+        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = s_mask & n_mask
+    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def lora_expand_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info,
+    base_output: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Run the LoRA-B expand and fuse-add into ``base_output``.
+
+    Args:
+        x: ``(s, max_rank)`` activations from lora_shrink.
+        weights: ``(num_lora, out_dim, max_rank)``, contiguous.
+        batch_info: :class:`LoraBatchInfo` describing the segment layout.
+        base_output: optional ``(s, out_dim)`` to add into.  When ``None``,
+            allocates a fresh zero-filled output.
+
+    Returns:
+        ``(s, out_dim)`` (same buffer as ``base_output`` when supplied).
+    """
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert x.dim() == 2
+    assert weights.dim() == 3
+
+    S = x.shape[0]
+    N = weights.shape[-2]
+    R = weights.shape[-1]
+    assert x.shape[-1] == R
+
+    max_len = batch_info.max_len
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_len, meta["BLOCK_S"]) * triton.cdiv(N, meta["BLOCK_N"]),
+            batch_info.bs,
+        )
+
+    if base_output is None:
+        output = torch.zeros((S, N), device=x.device, dtype=x.dtype)
+    else:
+        output = base_output
+
+    sorted_by_adapter = batch_info.permutation is not None
+    _lora_expand_kernel[grid](
+        x,
+        weights,
+        output,
+        N,
+        R,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        batch_info.scalings,
+        sorted_by_adapter,
+    )
+    return output
+
+
+load_kernel_cache(_lora_expand_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_grouped_v2.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_grouped_v2.py
new file mode 100644
index 000000000..2d49b7896
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_grouped_v2.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Adapter-grouped LoRA-B expand without gather/scatter overhead.
+
+Adapts vLLM's token-sorted dispatch pattern (PR vllm-project/vllm#...,
+Apache-2.0) to our kernel infrastructure.
+
+This kernel reads ``x`` and writes ``output`` directly at the original
+  (unsorted) token positions using ``token_indices`` loaded inside the kernel.
+  No gather/scatter needed — only a cheap pointer indirection per tile.
+
+Grid: ``(cdiv(N, BLOCK_N), num_groups)``  — axis 1 = unique adapter count.
+Within each CTA, groups of ``BLOCK_S`` tokens are processed; each group loads
+``BLOCK_S`` scattered rows from ``x`` via ``token_indices``.
+
+Adapted from vLLM ``vllm/lora/ops/triton_ops/lora_expand_op.py`` (Apache-2.0):
+https://github.com/vllm-project/vllm/blob/main/vllm/lora/ops/triton_ops/lora_expand_op.py
+Local changes: removed SPLIT_K / PDL / CAST_TYPE / multi-slice indirection;
+added BLOCK_K ∈ {16,32,64,128} + tl.multiple_of EVEN_K; adopted our
+eviction-policy hints and autotune + on-disk cache infrastructure.
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+_GROUPED_V2_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k},
+        num_warps=w,
+        num_stages=stages,
+        maxnreg=mr,
+    )
+    for s in (16, 32)
+    for n in (32, 64, 128)
+    for k in (16, 32, 64, 128)
+    for w in (4, 8)
+    for stages in (1, 2, 3)
+    for mr in (None, 128, 160)
+]
+
+
+@triton.autotune(
+    configs=_GROUPED_V2_CONFIGS,
+    key=["N", "MAX_RANK"],
+    restore_value=["output"],
+)
+@triton.jit(do_not_specialize=["output_stride_0", "output_stride_1"])
+def _lora_expand_grouped_v2_kernel(
+    x,  # (M, MAX_RANK)  original unsorted token order
+    weights,  # (n_slots, N, MAX_RANK)
+    output,  # (M, N)  written at original token positions
+    group_slots,  # (num_groups,) int32 — weight-slot index per group
+    group_starts,  # (num_groups,) int32 — start in token_indices
+    group_sizes,  # (num_groups,) int32 — tokens per group
+    token_indices,  # (M,) int32  — token positions sorted by adapter
+    scalings,  # (n_slots,) float32
+    lora_ranks,  # (n_slots,) int32
+    output_stride_0,
+    output_stride_1,
+    N: tl.constexpr,
+    MAX_RANK: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    # Constexpr strides — x and weights are always contiguous.
+    x_stride_0: tl.constexpr = MAX_RANK
+    x_stride_1: tl.constexpr = 1
+    w_stride_0: tl.constexpr = N * MAX_RANK
+    w_stride_1: tl.constexpr = MAX_RANK  # row stride inside (N, MAX_RANK) slice
+    w_stride_2: tl.constexpr = 1
+
+    group_id = tl.program_id(axis=1)
+    # axis=0 encodes both the within-group M-tile and the N-tile.
+    # Grid: (cdiv(M, BLOCK_S) * cdiv(N, BLOCK_N), num_groups) — mirrors vLLM's
+    # (M_tiles × N_tiles, num_active_loras) layout.  CTAs whose M-tile exceeds
+    # the group size exit immediately (same early-exit pattern as vLLM).
+    pid_flat = tl.program_id(axis=0)
+    cta_n_num = tl.cdiv(N, BLOCK_N)
+    pid_m = pid_flat // cta_n_num
+    pid_n = pid_flat % cta_n_num
+
+    w_index = tl.load(group_slots + group_id)
+    if w_index < 0:
+        return
+    g_size = tl.load(group_sizes + group_id)
+    if g_size == 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+
+    m_off = pid_m * BLOCK_S
+    if m_off >= g_size:
+        return  # early exit for M-tiles beyond this group's token count
+
+    g_start = tl.load(group_starts + group_id)
+    scaling = tl.load(scalings + w_index)
+    K = tl.minimum(MAX_RANK, rank)
+
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.max_contiguous(tl.arange(0, BLOCK_K), BLOCK_K)
+    n_mask = n_offset[None, :] < N
+
+    # Load physical token positions for this M-tile.
+    s_offset = tl.arange(0, BLOCK_S)
+    m_valid = s_offset < g_size - m_off
+    tok_ptrs = token_indices + g_start + m_off + s_offset
+    ram = tl.load(tok_ptrs, mask=m_valid, other=0)
+    s_valid = m_valid[:, None]
+
+    # Scattered read of x — no pre-gather needed.
+    x_ptrs = x + ram[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    partial = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        k_rem = K - k * BLOCK_K
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_valid & (k_offset[None, :] < k_rem),
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < k_rem) & n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial *= scaling
+    partial = partial.to(x.dtype.element_ty)
+
+    # Scattered write — no post-scatter needed.
+    out_ptrs = (
+        output + ram[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    out_mask = s_valid & n_mask
+    partial += tl.load(out_ptrs, mask=out_mask, other=0.0)
+    tl.store(out_ptrs, partial, mask=out_mask)
+
+
+def lora_expand_grouped_v2_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info,
+    base_output: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Adapter-grouped expand without gather/scatter.
+
+    Reads ``x`` and writes ``output`` at original token positions using
+    ``batch_info.token_indices`` (sorted by adapter).  Requires batch_info to
+    have the adapter-group metadata populated by ``prepare_loras``:
+    ``token_indices``, ``group_slots``, ``group_starts``, ``group_sizes``,
+    ``num_groups``.
+
+    Drops in for :func:`lora_expand_fwd` when ``batch_info.num_groups > 0``
+    and ``batch_info.bs // batch_info.num_groups >= 8``.
+    """
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+
+    S, R = x.shape
+    N = weights.shape[-2]
+    dev, dt = x.device, x.dtype
+
+    num_groups = batch_info.num_groups
+
+    # Use the largest group size for the M dimension, not the total batch size.
+    # This makes the grid tight for both extremes:
+    #   • n_unique = n  (all different): max_group_size = 1
+    #     → grid = (1 × cdiv(N,BLOCK_N), n) ≡ segmented layout, zero wasted CTAs
+    #   • n_unique = 1  (all same):      max_group_size = n
+    #     → grid = (n/BLOCK_S × cdiv(N,BLOCK_N), 1) ≡ grpv2 layout
+    # max_group_size is pre-computed on CPU in prepare_loras — no GPU sync here.
+    max_group_size = batch_info.max_group_size
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_group_size, meta["BLOCK_S"])
+            * triton.cdiv(N, meta["BLOCK_N"]),
+            num_groups,
+        )
+
+    output = (
+        torch.zeros((S, N), device=dev, dtype=dt)
+        if base_output is None
+        else base_output
+    )
+
+    _lora_expand_grouped_v2_kernel[grid](
+        x,
+        weights,
+        output,
+        batch_info.group_slots[:num_groups],
+        batch_info.group_starts[:num_groups],
+        batch_info.group_sizes[:num_groups],
+        batch_info.sort_order[: batch_info.bs],  # token_indices sorted by adapter
+        batch_info.scalings,
+        batch_info.lora_ranks,
+        output.stride(0),
+        output.stride(1),
+        N=N,
+        MAX_RANK=R,
+    )
+    return output
+
+
+load_kernel_cache(_lora_expand_grouped_v2_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_prefill.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_prefill.py
new file mode 100644
index 000000000..ceed827c9
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_expand_prefill.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Unified LoRA-B expand for prefill batches (chunked-SGMV style).
+
+Replaces the three separate ``lora_expand`` / ``lora_qkv_expand`` /
+``lora_gate_up_expand`` kernels for the prefill path.  A single kernel
+handles any number of output slices via the ``NUM_SLICES`` constexpr and a
+``slice_offsets`` boundary tensor — the same trick as sglang's
+``chunked_sgmv_expand`` (PR sgl-project/sglang#20391).
+
+Key structural difference from the decode-path expand kernels:
+* ``OUTPUT_DIM``, ``MAX_RANK``, ``NUM_SLICES`` are **constexpr** — the
+  compiler specialises the K-loop trip count and all strides at compile
+  time, which gives 2–3× speedup over runtime-stride kernels at prefill
+  with rank ≥ 64.
+* x strides are derived as compile-time constants:
+  ``x_stride_0 = NUM_SLICES * MAX_RANK``, ``x_stride_1 = 1``.
+
+Use :func:`lora_expand_fwd` / :func:`lora_qkv_expand_fwd` /
+:func:`lora_gate_up_expand_fwd` for decode (``max_len ≤ 32``); switch to
+:func:`lora_expand_prefill_fwd` for prefill.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py``
+(previously ``chunked_sgmv_expand.py`` in this repo)
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py.
+Local changes: merged SORTED_BY_ADAPTER from our decode kernels (avoids
+permutation overhead for unsorted batches), replaced fixed configs with
+``@triton.autotune`` + on-disk cache, constexpr ordering.
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+_PREFILL_EXPAND_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k},
+        num_warps=w,
+        num_stages=stages,
+        maxnreg=mr,
+    )
+    for s in (16, 32)
+    for n in (32, 64, 128)
+    for k in (16, 32)
+    for w in (4, 8)
+    for stages in (1, 2, 3)
+    for mr in (None, 128, 160)
+]
+
+
+@triton.autotune(
+    configs=_PREFILL_EXPAND_CONFIGS,
+    key=["OUTPUT_DIM", "MAX_RANK", "NUM_SLICES"],
+    restore_value=["output"],
+)
+@triton.jit(do_not_specialize=["output_stride_0", "output_stride_1"])
+def _lora_expand_prefill_kernel(
+    x,
+    weights,
+    output,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    scalings,
+    slice_offsets,
+    NUM_SLICES: tl.constexpr,
+    OUTPUT_DIM: tl.constexpr,
+    MAX_RANK: tl.constexpr,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    # Constexpr strides — compiler eliminates all stride multiplications.
+    x_stride_0: tl.constexpr = NUM_SLICES * MAX_RANK
+    x_stride_1: tl.constexpr = 1
+    w_stride_0: tl.constexpr = OUTPUT_DIM * MAX_RANK
+    w_stride_1: tl.constexpr = MAX_RANK
+    w_stride_2: tl.constexpr = 1
+
+    batch_id = tl.program_id(axis=2)
+    w_index = tl.load(weight_indices + batch_id)
+    if w_index < 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+
+    slice_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    seg_start = tl.load(seg_indptr + batch_id)
+    slice_start = tl.load(slice_offsets + slice_id)
+    slice_end = tl.load(slice_offsets + slice_id + 1)
+    n_size = slice_end - slice_start
+    scaling = tl.load(scalings + w_index)
+    K = tl.minimum(MAX_RANK, rank)
+
+    num_pid_n = tl.cdiv(n_size, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    s_physical = _resolve_token_positions(
+        sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER
+    )
+
+    # x: slice i starts at column i * K (actual rank, not MAX_RANK).
+    x_ptrs = (
+        x
+        + slice_id * K * x_stride_1
+        + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    )
+    w_ptrs = (weights + w_index * w_stride_0 + slice_start * w_stride_1) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    n_mask = n_offset[None, :] < n_size
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) & n_mask,
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+
+    output_ptr = output + (
+        s_physical[:, None] * output_stride_0
+        + (slice_start + n_offset)[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) & n_mask
+    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def lora_expand_prefill_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info,
+    slice_offsets: torch.Tensor,
+    max_slice_size: int,
+    base_output: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Prefill-optimised LoRA-B expand for one or more output slices.
+
+    Covers all projection types via ``slice_offsets``:
+    * plain expand (o/down):  ``slice_offsets = [0, out_dim]``
+    * gate/up:                ``slice_offsets = [0, inter, 2*inter]``
+    * QKV:                    ``slice_offsets = [0, q, q+kv, q+2*kv]``
+
+    Args:
+        x:             ``(s, num_slices * max_rank)`` from lora_shrink.
+        weights:       ``(num_lora, out_dim, max_rank)``, contiguous.
+        batch_info:    :class:`LoraBatchInfo`.
+        slice_offsets: ``(num_slices + 1,)`` int32 boundary tensor.
+        max_slice_size: largest ``slice_offsets[i+1] - slice_offsets[i]``.
+        base_output:   ``(s, out_dim)`` to fuse-add into; allocated if None.
+
+    Returns:
+        ``(s, out_dim)`` (same buffer as ``base_output`` when supplied).
+    """
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert x.dim() == 2
+    assert weights.dim() == 3
+
+    S = x.shape[0]
+    OUT_DIM = weights.shape[-2]
+    MAX_RANK = weights.shape[-1]
+    num_slices = len(slice_offsets) - 1
+    assert x.shape[1] == num_slices * MAX_RANK
+
+    max_len = batch_info.max_len
+    sorted_by_adapter = batch_info.permutation is not None
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_len, meta["BLOCK_S"])
+            * triton.cdiv(max_slice_size, meta["BLOCK_N"]),
+            num_slices,
+            batch_info.bs,
+        )
+
+    output = (
+        torch.zeros((S, OUT_DIM), device=x.device, dtype=x.dtype)
+        if base_output is None
+        else base_output
+    )
+    _lora_expand_prefill_kernel[grid](
+        x,
+        weights,
+        output,
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        batch_info.scalings,
+        slice_offsets,
+        NUM_SLICES=num_slices,
+        OUTPUT_DIM=OUT_DIM,
+        MAX_RANK=MAX_RANK,
+        SORTED_BY_ADAPTER=sorted_by_adapter,
+    )
+    return output
+
+
+load_kernel_cache(_lora_expand_prefill_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_gate_up_expand.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_gate_up_expand.py
new file mode 100644
index 000000000..caecf635e
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_gate_up_expand.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Fused LoRA-B expand for stacked gate/up projections (MLP).
+
+The MLP gate_up linear is fused into a single matmul with output layout
+``[gate_per_tp, up_per_tp]`` (each of size ``intermediate_per_tp``).
+This kernel packs the two B projections into one launch: each program
+instance picks ``gate`` (axis=1, id=0) or ``up`` (id=1) and writes its
+tile into the matching half of the fused output.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/gate_up_lora_b.py``
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py.
+Local changes: autotune + on-disk cache, constexpr ordering.
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+_GATE_UP_EXPAND_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k},
+        num_warps=w,
+        num_stages=stages,
+        maxnreg=mr,
+    )
+    for s in (16, 32)
+    for n in (32, 64, 128)
+    for k in (16, 32, 64, 128)
+    for w in (4, 8)
+    for stages in (1, 2, 3)
+    for mr in (None, 128, 160)
+]
+
+
+@triton.autotune(
+    configs=_GATE_UP_EXPAND_CONFIGS,
+    key=["output_dim", "K"],
+    restore_value=["output"],
+)
+@triton.jit
+def _lora_gate_up_expand_kernel(
+    x,
+    weights,
+    output,
+    K,  # max_rank
+    output_dim,  # intermediate_per_tp
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    scalings,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    batch_id = tl.program_id(axis=2)
+    w_index = tl.load(weight_indices + batch_id)
+    if w_index < 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+
+    gate_up_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    seg_start = tl.load(seg_indptr + batch_id)
+    n_start = gate_up_id * output_dim
+    scaling = tl.load(scalings + w_index)
+    K = tl.minimum(K, rank)
+
+    num_pid_n = tl.cdiv(output_dim, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    s_physical = _resolve_token_positions(
+        sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER
+    )
+    x_ptrs = (
+        x
+        + (gate_up_id * K) * x_stride_1
+        + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    )
+    w_ptrs = (weights + w_index * w_stride_0 + n_start * w_stride_1) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    s_mask = s_offset[:, None] < seg_len
+    n_mask = n_offset[None, :] < output_dim
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        k_remaining = K - k * BLOCK_K
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_mask & (k_offset[None, :] < k_remaining),
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < k_remaining) & n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (
+        output
+        + n_start * output_stride_1
+        + (s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1)
+    )
+    output_mask = s_mask & n_mask
+    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def lora_gate_up_expand_fwd(
+    x: torch.Tensor,
+    gate_up_lora_b: torch.Tensor,
+    batch_info,
+    output_dim: int,
+    base_output: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Apply LoRA-B for the fused gate_up MLP linear, fuse-add into ``base_output``.
+
+    Args:
+        x: ``(s, 2 * max_rank)`` from ``lora_shrink_fwd(stack_num=2)`` —
+           gate's lora_a in cols ``[:, :r]``, up's in ``[:, r:]``.
+        gate_up_lora_b: ``(num_lora, 2 * intermediate_per_tp, max_rank)``
+           — gate's B in rows ``[:, :out, :]``, up's in ``[:, out:, :]``.
+        batch_info: :class:`LoraBatchInfo`.
+        output_dim: ``intermediate_per_tp``.
+        base_output: ``(s, 2 * intermediate_per_tp)`` to fuse-add into.
+    """
+    s = x.shape[0]
+    input_dim = x.shape[1]
+    r = gate_up_lora_b.shape[-1]
+    assert input_dim == 2 * r
+
+    max_len = batch_info.max_len
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_len, meta["BLOCK_S"])
+            * triton.cdiv(output_dim, meta["BLOCK_N"]),
+            2,
+            batch_info.bs,
+        )
+
+    if base_output is None:
+        output = torch.zeros((s, 2 * output_dim), device=x.device, dtype=x.dtype)
+    else:
+        output = base_output
+
+    sorted_by_adapter = batch_info.permutation is not None
+    _lora_gate_up_expand_kernel[grid](
+        x,
+        gate_up_lora_b,
+        output,
+        r,
+        output_dim,
+        x.stride(0),
+        x.stride(1),
+        gate_up_lora_b.stride(0),
+        gate_up_lora_b.stride(1),
+        gate_up_lora_b.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        batch_info.scalings,
+        sorted_by_adapter,
+    )
+
+    return output
+
+
+load_kernel_cache(_lora_gate_up_expand_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_qkv_expand.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_qkv_expand.py
new file mode 100644
index 000000000..4bed480cf
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_qkv_expand.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Fused LoRA-B expand for stacked Q/K/V projections.
+
+The QKV linear is fused into a single matmul with output layout
+``[q_per_tp, k_per_tp, v_per_tp]``.  This kernel packs the three B
+projections into one launch: each program instance picks ``q``, ``k``, or
+``v`` via ``program_id(1)`` and writes its tile into the matching slice of
+the fused output.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/qkv_lora_b.py``
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/qkv_lora_b.py.
+Local changes: autotune + on-disk cache, constexpr ordering.
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+_QKV_EXPAND_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k},
+        num_warps=w,
+        num_stages=stages,
+        maxnreg=mr,
+    )
+    for s in (16, 32)
+    for n in (32, 64, 128)
+    for k in (16, 32, 64, 128)
+    for w in (4, 8)
+    for stages in (1, 2, 3)
+    for mr in (None, 128, 160)
+]
+
+
+@triton.autotune(
+    configs=_QKV_EXPAND_CONFIGS,
+    key=["max_qkv_out_dim", "K"],
+    restore_value=["output"],
+)
+@triton.jit
+def _lora_qkv_expand_kernel(
+    x,
+    weights,
+    output,
+    K,  # max_rank
+    max_qkv_out_dim,  # max(q_per_tp, kv_per_tp)
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    n_offs,  # (4,) cumulative offsets into the fused QKV output
+    sorted_token_ids,
+    scalings,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    batch_id = tl.program_id(axis=2)
+    w_index = tl.load(weight_indices + batch_id)
+    if w_index < 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+
+    qkv_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+    seg_start = tl.load(seg_indptr + batch_id)
+    n_start = tl.load(n_offs + qkv_id)
+    n_size = tl.load(n_offs + qkv_id + 1) - n_start
+    scaling = tl.load(scalings + w_index)
+    K = tl.minimum(K, rank)
+
+    num_pid_n = tl.cdiv(max_qkv_out_dim, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    s_physical = _resolve_token_positions(
+        sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER
+    )
+    x_ptrs = (
+        x
+        + (qkv_id * K) * x_stride_1
+        + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    )
+    w_ptrs = (weights + w_index * w_stride_0 + n_start * w_stride_1) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    s_mask = s_offset[:, None] < seg_len
+    n_mask = n_offset[None, :] < n_size
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        k_remaining = K - k * BLOCK_K
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_mask & (k_offset[None, :] < k_remaining),
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < k_remaining) & n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (
+        output
+        + n_start * output_stride_1
+        + (s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1)
+    )
+    output_mask = s_mask & n_mask
+    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def lora_qkv_expand_fwd(
+    x: torch.Tensor,
+    qkv_lora_b: torch.Tensor,
+    batch_info,
+    output_offset: torch.Tensor,
+    max_qkv_out_dim: int,
+    base_output: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """Apply LoRA-B for the fused QKV linear, fused-add into ``base_output``.
+
+    Args:
+        x: ``(s, 3 * max_rank)`` from ``lora_shrink_fwd(stack_num=3)``.
+        qkv_lora_b: ``(num_lora, q_per_tp + 2 * kv_per_tp, max_rank)``.
+        batch_info: :class:`LoraBatchInfo`.
+        output_offset: ``(4,)`` cumulative offsets ``[0, q, q+kv, q+2*kv]``.
+        max_qkv_out_dim: ``max(q_per_tp, kv_per_tp)`` — used to size the grid.
+        base_output: ``(s, q_per_tp + 2 * kv_per_tp)`` to fuse-add into.
+    """
+    s = x.shape[0]
+    input_dim = x.shape[1]
+    r = qkv_lora_b.shape[-1]
+    output_dim = qkv_lora_b.shape[-2]
+    assert input_dim == 3 * r
+    assert output_offset.shape[0] == 4
+
+    max_len = batch_info.max_len
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_len, meta["BLOCK_S"])
+            * triton.cdiv(max_qkv_out_dim, meta["BLOCK_N"]),
+            3,
+            batch_info.bs,
+        )
+
+    if base_output is None:
+        output = torch.zeros((s, output_dim), device=x.device, dtype=x.dtype)
+    else:
+        output = base_output
+
+    sorted_by_adapter = batch_info.permutation is not None
+    _lora_qkv_expand_kernel[grid](
+        x,
+        qkv_lora_b,
+        output,
+        r,
+        max_qkv_out_dim,
+        x.stride(0),
+        x.stride(1),
+        qkv_lora_b.stride(0),
+        qkv_lora_b.stride(1),
+        qkv_lora_b.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        output_offset,
+        batch_info.permutation,
+        batch_info.scalings,
+        sorted_by_adapter,
+    )
+    return output
+
+
+load_kernel_cache(_lora_qkv_expand_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink.py
new file mode 100644
index 000000000..0c571f8df
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Segmented LoRA-A matmul (shrink: in_dim → r).
+
+For each segment ``b`` in the batch the kernel computes
+``output[seg_b] = x[seg_b] @ A[wi_b].T`` where ``A[wi_b]`` has shape
+``(stack_num * r, in_dim)``.  No-adapter segments use a negative slot
+sentinel; the kernel returns immediately for that slot, leaving the output
+rows untouched.  Real slots may have varying real ranks up to
+``max_rank``; ``output[..., :rank * stack_num]`` stores the real product
+and ``output[..., rank * stack_num:]`` is irrelevant — the consumer
+(``lora_expand`` / ``lora_qkv_expand``) reads only the first ``rank * stack_num``
+columns.
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/sgemm_lora_a.py``
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py.
+sglang's kernel is in turn descended from the Punica S-LoRA design
+(https://github.com/punica-ai/punica).  Local changes: ported to
+``tokenspeed_kernel._triton``, added ``@triton.autotune`` over the
+``(N, K)`` shape with an on-disk config cache, and reshuffled the
+constexpr params so block sizes come last.
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+# Shrink kernel: N = stack_num * rank (tiny, 16–192), K = in_dim (large,
+# 4096+).  Decode-step segments are short (S = 1–32 per segment), so the
+# right tile shape is "small N, large K, small S".  Sweep matches the
+# sglang csgmv-shrink space (PR sgl-project/sglang#20391) plus a BLOCK_S
+# axis since our kernel exposes it.  72 configs.
+_SHRINK_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k}, num_warps=w, num_stages=stages
+    )
+    for s in (8, 16, 32)
+    for n in (16, 32, 64)
+    for k in (64, 128, 256)
+    for w in (4, 8)
+    for stages in (2, 3, 4)
+]
+
+
+@triton.autotune(configs=_SHRINK_CONFIGS, key=["N", "K"])
+@triton.jit
+def _lora_shrink_kernel(
+    x,
+    weights,
+    output,
+    N,  # stack_num * max_rank
+    K,  # in_dim
+    stack_num,
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    if w_index < 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+
+    # rank == 0 is defensive: skip and leave the output untouched
+    # (downstream lora_expand / lora_qkv_expand is also a no-op for rank == 0
+    # so the leftover values never feed into the base-output add).
+    if rank == 0:
+        return
+
+    pid = tl.program_id(axis=0)
+    seg_start = tl.load(seg_indptr + batch_id)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+
+    # Cap N to the real ``stack_num * rank`` for this adapter.
+    N = tl.minimum(N, rank * stack_num)
+
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.max_contiguous(tl.arange(0, BLOCK_K), BLOCK_K)
+    s_physical = _resolve_token_positions(
+        sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER
+    )
+    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Hoist loop-invariant masks — s_mask and n_mask don't change across K
+    # iterations so computing them once saves instructions in the hot loop.
+    s_mask = s_offset[:, None] < seg_len  # (BLOCK_S, 1)
+    n_mask = n_offset[None, :] < N  # (1, BLOCK_N)
+
+    K = tl.multiple_of(K, BLOCK_K)
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, K // BLOCK_K):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_mask,
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_mask = s_mask & n_mask
+    output_ptr = output + (
+        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def lora_shrink_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info,
+    stack_num: int = 1,
+) -> torch.Tensor:
+    """Run the LoRA-A shrink for an arbitrary batch.
+
+    Args:
+        x:        ``(s, in_dim)`` activations, contiguous.
+        weights:  ``(num_lora, stack_num * max_rank, in_dim)``, contiguous.
+        batch_info: :class:`LoraBatchInfo` describing the segment layout.
+        stack_num: 1 for single projection, 3 for fused QKV, 2 for gate-up.
+
+    Returns:
+        ``(s, stack_num * max_rank)`` tensor.  Rows of segments whose adapter
+        is the no-op slot are unwritten — callers must not consume them
+        (the matching lora_expand kernel is also a no-op for those segments).
+    """
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert x.dim() == 2
+    assert weights.dim() == 3
+
+    S = x.shape[0]
+    N = weights.shape[-2]
+    K = weights.shape[-1]
+    assert x.shape[-1] == K
+
+    max_len = batch_info.max_len
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_len, meta["BLOCK_S"]) * triton.cdiv(N, meta["BLOCK_N"]),
+            batch_info.bs,
+        )
+
+    sorted_by_adapter = batch_info.permutation is not None
+
+    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
+    _lora_shrink_kernel[grid](
+        x,
+        weights,
+        output,
+        N,
+        K,
+        stack_num,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        sorted_by_adapter,
+    )
+    return output
+
+
+# Eager pre-population from disk happens lazily inside the autotuner cache
+# (see `tokenspeed_kernel.ops.lora.triton.__init__`).
+load_kernel_cache(_lora_shrink_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink_prefill.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink_prefill.py
new file mode 100644
index 000000000..8b8c28856
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/lora_shrink_prefill.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Prefill-optimised LoRA-A matmul (shrink: in_dim → r).
+
+Drop-in replacement for :func:`lora_shrink_fwd` on prefill batches
+(``max_len > 32``).  Identical algorithm; the structural difference is that
+``K`` (= in_dim, 4096+), ``N`` (= stack_num * max_rank), and all strides are
+**constexpr** — the compiler specialises the K-loop trip count at compile
+time and eliminates all stride multiplications.
+
+Benchmarked gain on H100 vs the decode shrink kernel at s=512, rank=64:
+  QKV  stack=3  (K=4096, N=192): 23 µs → 17 µs  (1.3×)
+  g/up stack=2  (K=4096, N=128): 19 µs → 16 µs  (1.2×)
+  single        (K=4096, N=64):  18 µs → 17 µs  (~1.0×)
+
+Adapted from sglang ``python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py``
+(Apache-2.0): https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py.
+Local changes: kept SORTED_BY_ADAPTER + S-tiling from our decode kernel
+(``lora_shrink.py``), replaced fixed configs with ``@triton.autotune`` +
+on-disk cache.
+"""
+
+from __future__ import annotations
+
+import torch
+from tokenspeed_kernel._triton import tl, triton
+from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
+from tokenspeed_kernel.ops.lora.triton.tuning import load_kernel_cache
+
+# Same config space as the decode shrink kernel.
+_PREFILL_SHRINK_CONFIGS = [
+    triton.Config(
+        {"BLOCK_S": s, "BLOCK_N": n, "BLOCK_K": k}, num_warps=w, num_stages=stages
+    )
+    for s in (16, 32)
+    for n in (16, 32, 64)
+    for k in (64, 128, 256)
+    for w in (4, 8)
+    for stages in (2, 3, 4)
+]
+
+
+@triton.autotune(configs=_PREFILL_SHRINK_CONFIGS, key=["N", "K", "NUM_SLICES"])
+@triton.jit
+def _lora_shrink_prefill_kernel(
+    x,
+    weights,
+    output,
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    sorted_token_ids,
+    N: tl.constexpr,  # stack_num * max_rank
+    K: tl.constexpr,  # in_dim
+    NUM_SLICES: tl.constexpr,  # stack_num
+    SORTED_BY_ADAPTER: tl.constexpr,
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    # Constexpr strides — compiler eliminates all stride multiplications.
+    x_stride_0: tl.constexpr = K
+    x_stride_1: tl.constexpr = 1
+    w_stride_0: tl.constexpr = N * K
+    w_stride_1: tl.constexpr = K  # row stride of the (N, K) weight matrix
+    w_stride_2: tl.constexpr = 1
+    output_stride_0: tl.constexpr = N
+    output_stride_1: tl.constexpr = 1
+
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    if w_index < 0:
+        return
+    rank = tl.load(lora_ranks + w_index)
+    if rank == 0:
+        return
+
+    pid = tl.program_id(axis=0)
+    seg_start = tl.load(seg_indptr + batch_id)
+    seg_len = tl.load(seg_lens + batch_id)
+    if seg_len == 0:
+        return
+
+    cur_n = tl.minimum(N, rank * NUM_SLICES)
+
+    num_pid_n = tl.cdiv(cur_n, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    if pid_s * BLOCK_S >= seg_len:
+        return
+
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    s_physical = _resolve_token_positions(
+        sorted_token_ids, seg_start, s_offset, seg_len, SORTED_BY_ADAPTER
+    )
+
+    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    s_mask = s_offset[:, None] < seg_len
+    n_mask = n_offset[None, :] < cur_n
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, K // BLOCK_K):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=s_mask,
+            other=0.0,
+            eviction_policy="evict_first",
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=n_mask,
+            other=0.0,
+            eviction_policy="evict_last",
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_mask = s_mask & n_mask
+    output_ptr = output + (
+        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def lora_shrink_prefill_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info,
+    stack_num: int = 1,
+) -> torch.Tensor:
+    """Prefill-optimised LoRA-A shrink.  Same signature as :func:`lora_shrink_fwd`.
+
+    Args:
+        x:        ``(s, in_dim)`` activations, contiguous.
+        weights:  ``(num_lora, stack_num * max_rank, in_dim)``, contiguous.
+        batch_info: :class:`LoraBatchInfo`.
+        stack_num: 1 for single projection, 3 for fused QKV, 2 for gate-up.
+
+    Returns:
+        ``(s, stack_num * max_rank)`` tensor.
+    """
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert x.dim() == 2
+    assert weights.dim() == 3
+
+    S = x.shape[0]
+    N = weights.shape[-2]  # stack_num * max_rank
+    K = weights.shape[-1]  # in_dim
+    assert x.shape[-1] == K
+
+    max_len = batch_info.max_len
+    sorted_by_adapter = batch_info.permutation is not None
+
+    def grid(meta):
+        return (
+            triton.cdiv(max_len, meta["BLOCK_S"]) * triton.cdiv(N, meta["BLOCK_N"]),
+            batch_info.bs,
+        )
+
+    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
+    _lora_shrink_prefill_kernel[grid](
+        x,
+        weights,
+        output,
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        batch_info.permutation,
+        N=N,
+        K=K,
+        NUM_SLICES=stack_num,
+        SORTED_BY_ADAPTER=sorted_by_adapter,
+    )
+    return output
+
+
+load_kernel_cache(_lora_shrink_prefill_kernel)
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune.py
new file mode 100644
index 000000000..570772e82
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Offline autotune driver for the LoRA Triton kernels.
+
+Builds synthetic ``LoraBatchInfo`` batches for a few representative
+segment shapes, calls each kernel once (triggering ``triton.autotune``
+to benchmark all candidate configs and pick the fastest per ``(N, K)``
+key), and then writes the picked configs to JSON via
+:func:`tokenspeed_kernel.ops.lora.triton.tuning.save_kernel_cache`.
+
+Usage::
+
+    python -m tokenspeed_kernel.ops.lora.triton.tune \\
+        --hidden 4096 --intermediate 12288 \\
+        --q-per-tp 2048 --kv-per-tp 1024 \\
+        --rank 16 --max-rank 64 --tp-size 2
+
+The defaults match Qwen3-8B at attn_tp_size=2.  Shapes only affect which
+``(N, K)`` keys get tuned; the actual launch parameters are independent
+of which model the cache is shipped against.
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from dataclasses import dataclass
+
+import torch
+from tokenspeed_kernel.ops.lora.triton.lora_expand import (
+    _lora_expand_kernel,
+    lora_expand_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
+    _lora_gate_up_expand_kernel,
+    lora_gate_up_expand_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import (
+    _lora_qkv_expand_kernel,
+    lora_qkv_expand_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_shrink import (
+    _lora_shrink_kernel,
+    lora_shrink_fwd,
+)
+from tokenspeed_kernel.ops.lora.triton.tuning import save_kernel_cache
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class _BatchInfo:
+    """Minimal stand-in for ``runtime.lora.lora_manager.LoraBatchInfo``."""
+
+    bs: int
+    max_len: int
+    seg_lens: torch.Tensor
+    seg_indptr: torch.Tensor
+    weight_indices: torch.Tensor
+    lora_ranks: torch.Tensor
+    scalings: torch.Tensor
+    permutation: torch.Tensor | None = None
+
+
+def _make_batch(
+    s_per_seg: int, n_segs: int, rank: int, device: str = "cuda"
+) -> _BatchInfo:
+    seg_lens = torch.full((n_segs,), s_per_seg, dtype=torch.int32, device=device)
+    seg_indptr = torch.tensor(
+        [i * s_per_seg for i in range(n_segs + 1)], dtype=torch.int32, device=device
+    )
+    # weight_indices: route every segment to real adapter slot 0.
+    weight_indices = torch.zeros(n_segs, dtype=torch.int32, device=device)
+    lora_ranks = torch.tensor([rank], dtype=torch.int32, device=device)
+    scalings = torch.tensor([1.0], dtype=torch.float32, device=device)
+    return _BatchInfo(
+        bs=n_segs,
+        max_len=s_per_seg,
+        seg_lens=seg_lens,
+        seg_indptr=seg_indptr,
+        weight_indices=weight_indices,
+        lora_ranks=lora_ranks,
+        scalings=scalings,
+    )
+
+
+def tune_shrink(*, in_dim: int, stack_num: int, rank: int, max_rank: int) -> None:
+    """Drive ``_lora_shrink_kernel`` for one ``(stack_num, in_dim)`` shape.
+
+    Uses a decode-shaped batch (``bs=32, max_len=1``) because that is where
+    LoRA latency dominates the e2e (every decode step pays the kernel cost;
+    prefill is amortized).  Tuning at prefill shapes picks block tiles that
+    waste threads at decode-time.
+    """
+    device = "cuda"
+    dtype = torch.bfloat16
+    n_segs = 32
+    s_per_seg = 1
+    s = n_segs * s_per_seg
+    x = torch.randn((s, in_dim), device=device, dtype=dtype)
+    weights = torch.randn((2, stack_num * max_rank, in_dim), device=device, dtype=dtype)
+    bi = _make_batch(s_per_seg, n_segs, rank=rank, device=device)
+    lora_shrink_fwd(x, weights, bi, stack_num=stack_num)
+    torch.cuda.synchronize()
+    print(
+        f"  shrink in_dim={in_dim} stack={stack_num}  →  best={_lora_shrink_kernel.best_config}"
+    )
+
+
+def tune_expand(*, out_dim: int, max_rank: int, rank: int) -> None:
+    device = "cuda"
+    dtype = torch.bfloat16
+    n_segs = 32
+    s_per_seg = 1
+    s = n_segs * s_per_seg
+    x = torch.randn((s, max_rank), device=device, dtype=dtype)
+    weights = torch.randn((2, out_dim, max_rank), device=device, dtype=dtype)
+    bi = _make_batch(s_per_seg, n_segs, rank=rank, device=device)
+    out = torch.zeros((s, out_dim), device=device, dtype=dtype)
+    lora_expand_fwd(x, weights, bi, base_output=out)
+    torch.cuda.synchronize()
+    print(
+        f"  expand out_dim={out_dim} R={max_rank}  →  best={_lora_expand_kernel.best_config}"
+    )
+
+
+def tune_qkv(*, q_per_tp: int, kv_per_tp: int, max_rank: int, rank: int) -> None:
+    device = "cuda"
+    dtype = torch.bfloat16
+    n_segs = 32
+    s_per_seg = 1
+    s = n_segs * s_per_seg
+    x = torch.randn((s, 3 * max_rank), device=device, dtype=dtype)
+    out_dim = q_per_tp + 2 * kv_per_tp
+    weights = torch.randn((2, out_dim, max_rank), device=device, dtype=dtype)
+    max_qkv = max(q_per_tp, kv_per_tp)
+    output_offset = torch.tensor(
+        [0, q_per_tp, q_per_tp + kv_per_tp, q_per_tp + 2 * kv_per_tp],
+        dtype=torch.int32,
+        device=device,
+    )
+    bi = _make_batch(s_per_seg, n_segs, rank=rank, device=device)
+    out = torch.zeros((s, out_dim), device=device, dtype=dtype)
+    lora_qkv_expand_fwd(x, weights, bi, output_offset, max_qkv, base_output=out)
+    torch.cuda.synchronize()
+    print(
+        f"  qkv_expand max_qkv={max_qkv} R={max_rank}  →  best={_lora_qkv_expand_kernel.best_config}"
+    )
+
+
+def tune_gate_up(*, intermediate_per_tp: int, max_rank: int, rank: int) -> None:
+    device = "cuda"
+    dtype = torch.bfloat16
+    n_segs = 32
+    s_per_seg = 1
+    s = n_segs * s_per_seg
+    x = torch.randn((s, 2 * max_rank), device=device, dtype=dtype)
+    weights = torch.randn(
+        (2, 2 * intermediate_per_tp, max_rank), device=device, dtype=dtype
+    )
+    bi = _make_batch(s_per_seg, n_segs, rank=rank, device=device)
+    out = torch.zeros((s, 2 * intermediate_per_tp), device=device, dtype=dtype)
+    lora_gate_up_expand_fwd(x, weights, bi, intermediate_per_tp, base_output=out)
+    torch.cuda.synchronize()
+    print(
+        f"  gate_up_expand out={intermediate_per_tp} R={max_rank}  →  best={_lora_gate_up_expand_kernel.best_config}"
+    )
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--hidden", type=int, default=4096)
+    p.add_argument(
+        "--intermediate",
+        type=int,
+        default=12288,
+        help="Full (un-sharded) intermediate_size",
+    )
+    p.add_argument("--q-per-tp", type=int, default=2048)
+    p.add_argument("--kv-per-tp", type=int, default=512)
+    p.add_argument("--rank", type=int, default=16)
+    p.add_argument("--max-rank", type=int, default=64)
+    p.add_argument("--tp-size", type=int, default=2)
+    args = p.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    intermediate_per_tp = args.intermediate // args.tp_size
+
+    print("=== Tuning shrink (lora_shrink) ===")
+    # Attention shrink: stack=3 (QKV) on hidden, stack=1 (o) on q_per_tp.
+    tune_shrink(in_dim=args.hidden, stack_num=3, rank=args.rank, max_rank=args.max_rank)
+    tune_shrink(
+        in_dim=args.q_per_tp, stack_num=1, rank=args.rank, max_rank=args.max_rank
+    )
+    # MLP shrink: stack=2 (gate/up) on hidden, stack=1 (down) on intermediate_per_tp.
+    tune_shrink(in_dim=args.hidden, stack_num=2, rank=args.rank, max_rank=args.max_rank)
+    tune_shrink(
+        in_dim=intermediate_per_tp, stack_num=1, rank=args.rank, max_rank=args.max_rank
+    )
+
+    print("\n=== Tuning expand (lora_expand) ===")
+    # o_proj uses lora_expand directly (out_dim = hidden).
+    tune_expand(out_dim=args.hidden, max_rank=args.max_rank, rank=args.rank)
+    # down_proj also uses lora_expand (out_dim = hidden).
+    # Same shape — autotune cache hit on the second call.
+
+    print("\n=== Tuning qkv_expand (lora_qkv_expand) ===")
+    tune_qkv(
+        q_per_tp=args.q_per_tp,
+        kv_per_tp=args.kv_per_tp,
+        max_rank=args.max_rank,
+        rank=args.rank,
+    )
+
+    print("\n=== Tuning gate_up_expand (lora_gate_up_expand) ===")
+    tune_gate_up(
+        intermediate_per_tp=intermediate_per_tp,
+        max_rank=args.max_rank,
+        rank=args.rank,
+    )
+
+    print("\n=== Saving caches ===")
+    for kern in (
+        _lora_shrink_kernel,
+        _lora_expand_kernel,
+        _lora_qkv_expand_kernel,
+        _lora_gate_up_expand_kernel,
+    ):
+        path = save_kernel_cache(kern)
+        print(f"  wrote {path} ({len(kern.cache)} entries)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune_sweep.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune_sweep.py
new file mode 100644
index 000000000..5a1507839
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tune_sweep.py
@@ -0,0 +1,140 @@
+"""Comprehensive autotune sweep for LoRA decode kernels across common shapes.
+
+Covers the (N, K) pairs seen in production for the major model families and
+TP configurations, across max_rank values of 16 / 32 / 64 / 128.  Saves all
+picked configs to the on-disk JSON caches so fresh processes skip the sweep.
+
+Usage::
+
+    python -m tokenspeed_kernel.ops.lora.triton.tune_sweep
+
+Estimated runtime: ~5 min on H100 (all shapes × all kernels).
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+
+import torch
+from tokenspeed_kernel.ops.lora.triton.lora_expand import _lora_expand_kernel
+from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
+    _lora_gate_up_expand_kernel,
+)
+from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import _lora_qkv_expand_kernel
+from tokenspeed_kernel.ops.lora.triton.lora_shrink import _lora_shrink_kernel
+from tokenspeed_kernel.ops.lora.triton.tune import (
+    _BatchInfo,
+    _make_batch,
+    tune_expand,
+    tune_gate_up,
+    tune_qkv,
+    tune_shrink,
+)
+from tokenspeed_kernel.ops.lora.triton.tuning import save_kernel_cache
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+
+@dataclass
+class _ModelTP:
+    name: str
+    hidden: int
+    intermediate_per_tp: int
+    q_per_tp: int
+    kv_per_tp: int
+
+
+# ── Representative (model, TP) configs ──────────────────────────────────────
+# Each entry represents one serving configuration: hidden size, per-rank
+# intermediate, and per-rank Q / KV sizes after tensor parallelism sharding.
+# Source model sizes:
+#   Llama-3-8B:  hidden=4096, intermediate=14336, heads=32/8, head_dim=128
+#   Llama-3-70B: hidden=8192, intermediate=28672, heads=64/8, head_dim=128
+#   Qwen3-8B:    hidden=4096, intermediate=12288, heads=32/8, head_dim=128
+_CONFIGS: list[_ModelTP] = [
+    # ── Llama-3-8B ──────────────────────────────────────────────────────────
+    _ModelTP("llama3-8b  TP=1", 4096, 14336, 4096, 1024),
+    _ModelTP("llama3-8b  TP=2", 4096, 7168, 2048, 512),
+    _ModelTP("llama3-8b  TP=4", 4096, 3584, 1024, 256),
+    # ── Qwen3-8B ────────────────────────────────────────────────────────────
+    _ModelTP("qwen3-8b   TP=1", 4096, 12288, 4096, 1024),
+    _ModelTP("qwen3-8b   TP=2", 4096, 6144, 2048, 512),
+    _ModelTP("qwen3-8b   TP=4", 4096, 3072, 1024, 256),
+    # ── Llama-3-70B ─────────────────────────────────────────────────────────
+    _ModelTP("llama3-70b TP=4", 8192, 7168, 2048, 256),
+    _ModelTP("llama3-70b TP=8", 8192, 3584, 1024, 128),
+]
+
+# Max-rank values to cover — N in the shrink key is stack_num * max_rank.
+_MAX_RANKS = [16, 32, 64, 128]
+
+
+def _sweep_shrink(cfg: _ModelTP, max_rank: int) -> None:
+    rank = max_rank  # tune at full rank so the K-loop is fully exercised
+    # Attention shrink
+    tune_shrink(in_dim=cfg.hidden, stack_num=3, rank=rank, max_rank=max_rank)
+    tune_shrink(in_dim=cfg.q_per_tp, stack_num=1, rank=rank, max_rank=max_rank)
+    # MLP shrink
+    tune_shrink(in_dim=cfg.hidden, stack_num=2, rank=rank, max_rank=max_rank)
+    tune_shrink(
+        in_dim=cfg.intermediate_per_tp, stack_num=1, rank=rank, max_rank=max_rank
+    )
+
+
+def _sweep_expand(cfg: _ModelTP, max_rank: int) -> None:
+    # Clear in-process cache so the autotuner sweeps all configs fresh
+    # rather than reusing entries loaded from the on-disk JSON.
+    for k in _lora_expand_kernel, _lora_qkv_expand_kernel, _lora_gate_up_expand_kernel:
+        k.cache.clear()
+    rank = max_rank
+    # o_proj / down_proj
+    tune_expand(out_dim=cfg.hidden, max_rank=max_rank, rank=rank)
+    # QKV
+    tune_qkv(
+        q_per_tp=cfg.q_per_tp,
+        kv_per_tp=cfg.kv_per_tp,
+        max_rank=max_rank,
+        rank=rank,
+    )
+    # gate/up
+    tune_gate_up(
+        intermediate_per_tp=cfg.intermediate_per_tp,
+        max_rank=max_rank,
+        rank=rank,
+    )
+
+
+def main() -> int:
+    total_shrink = len(_CONFIGS) * len(_MAX_RANKS)
+    total_expand = total_shrink
+    done = 0
+
+    for max_rank in _MAX_RANKS:
+        for cfg in _CONFIGS:
+            done += 1
+            print(f"\n[{done}/{total_shrink}] shrink  {cfg.name}  max_rank={max_rank}")
+            _sweep_shrink(cfg, max_rank)
+
+    done = 0
+    for max_rank in _MAX_RANKS:
+        for cfg in _CONFIGS:
+            done += 1
+            print(f"\n[{done}/{total_expand}] expand  {cfg.name}  max_rank={max_rank}")
+            _sweep_expand(cfg, max_rank)
+
+    print("\n=== Saving caches ===")
+    for kern in (
+        _lora_shrink_kernel,
+        _lora_expand_kernel,
+        _lora_qkv_expand_kernel,
+        _lora_gate_up_expand_kernel,
+    ):
+        path = save_kernel_cache(kern)
+        print(f"  wrote {path}  ({len(kern.cache)} entries)")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tuning.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tuning.py
new file mode 100644
index 000000000..db82764b6
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/tuning.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""On-disk cache for LoRA Triton autotune picks.
+
+Triton's ``@triton.autotune`` caches the best config per ``key`` tuple in
+``Autotuner.cache``, but only for the current process — every fresh Python
+process re-runs the sweep on the first call to each unique shape.  This
+module persists that cache as JSON next to the kernels so the picks
+survive process restarts and ship in the repo.
+
+Layout: ``configs/<gpu_label>/<kernel_name>.json``.  When a kernel runs
+for the first time on a shape that has no saved entry, Triton falls back
+to the candidate-config sweep (slow) and the result can be saved by a
+follow-up call to :func:`save_kernel_cache`.
+
+Config JSON format::
+
+    {
+      "(N, K, 'torch.bfloat16')": {
+        "kwargs": {"BLOCK_S": 16, "BLOCK_N": 64, "BLOCK_K": 64},
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ctas": 1,
+        "maxnreg": null
+      },
+      ...
+    }
+"""
+
+from __future__ import annotations
+
+import ast
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+import torch
+from tokenspeed_kernel._triton import triton
+
+logger = logging.getLogger(__name__)
+
+CONFIG_DIR = Path(__file__).parent / "configs"
+
+
+def _gpu_label() -> str:
+    """Compact identifier for the active GPU — partitions config files."""
+    if not torch.cuda.is_available():
+        return "cpu"
+    name = torch.cuda.get_device_name(0)
+    # Strip vendor prefix and whitespace: "NVIDIA H100 80GB HBM3" → "H100_80GB_HBM3".
+    name = name.replace("NVIDIA ", "").strip()
+    return name.replace(" ", "_")
+
+
+def _config_path(kernel_name: str) -> Path:
+    return CONFIG_DIR / _gpu_label() / f"{kernel_name}.json"
+
+
+def _key_to_str(key: tuple) -> str:
+    # ``repr(tuple)`` round-trips through ``ast.literal_eval`` provided the
+    # tuple only holds primitives and str dtypes — which it does here.
+    return repr(tuple(key))
+
+
+def _str_to_key(s: str) -> tuple:
+    return tuple(ast.literal_eval(s))
+
+
+def _config_to_dict(cfg: triton.Config) -> dict:
+    return {
+        "kwargs": dict(cfg.kwargs),
+        "num_warps": cfg.num_warps,
+        "num_stages": cfg.num_stages,
+        "num_ctas": cfg.num_ctas,
+        "maxnreg": cfg.maxnreg,
+    }
+
+
+def _dict_to_config(d: dict) -> triton.Config:
+    return triton.Config(
+        d["kwargs"],
+        num_warps=d["num_warps"],
+        num_stages=d["num_stages"],
+        num_ctas=d.get("num_ctas", 1),
+        maxnreg=d.get("maxnreg"),
+    )
+
+
+def load_kernel_cache(kernel) -> int:
+    """Populate ``kernel.cache`` from the on-disk JSON for the active GPU.
+
+    ``kernel`` is the ``Autotuner`` wrapper produced by
+    ``@triton.autotune``.  Returns the number of entries loaded (0 when
+    no config file exists for this GPU, which is the normal first-run
+    case).
+    """
+    name = kernel.base_fn.__name__
+    path = _config_path(name)
+    if not path.exists():
+        logger.debug("no autotune cache for %s at %s", name, path)
+        return 0
+    with open(path) as f:
+        raw = json.load(f)
+    loaded = 0
+    for k, v in raw.items():
+        kernel.cache[_str_to_key(k)] = _dict_to_config(v)
+        loaded += 1
+    logger.info("loaded %d autotune picks for %s from %s", loaded, name, path)
+    return loaded
+
+
+def save_kernel_cache(kernel) -> Path:
+    """Dump ``kernel.cache`` to JSON next to the kernel module."""
+    name = kernel.base_fn.__name__
+    path = _config_path(name)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    blob: dict[str, Any] = {}
+    for key, cfg in kernel.cache.items():
+        blob[_key_to_str(key)] = _config_to_dict(cfg)
+    with open(path, "w") as f:
+        json.dump(blob, f, indent=2, sort_keys=True)
+    logger.info("saved %d autotune picks for %s to %s", len(blob), name, path)
+    return path
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/moe_lora/__init__.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/moe_lora/__init__.py
new file mode 100644
index 000000000..63088af12
--- /dev/null
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/moe_lora/__init__.py
@@ -0,0 +1,1085 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Fused Triton kernels for MoE LoRA applied to sorted expert outputs.
+
+Targets the sglang_shared adapter format (shared outer A, per-expert inner B
+for gate/up; per-expert A, shared outer B for down), operating directly on the
+sorted token-expert buffers produced by the MoE dispatcher.
+
+Gate/up expand replaces: all-experts B GEMM (m×R × R×E·I) + candidates.gather +
+_add_route_delta with a single per-sorted-position GEMV kernel.
+
+Down shrink replaces: _route_rows_from_cache + _select_expert_weights + einsum
+with a per-sorted-position GEMV kernel; the caller then runs one shared-B GEMM
+and scatter_add_ to accumulate into the token-ordered down output.
+
+Both kernels tile over the rank dimension in BLOCK_R chunks so that register
+pressure stays bounded regardless of adapter rank (r=16 to r=256).
+"""
+
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+# ── Gate/Up Expand ───────────────────────────────────────────────────────────
+#
+# For each sorted position s:
+#   exp    = safe_ids[flat_j // K, flat_j % K]  where flat_j = sorted_token_ids[s]
+#   delta  = lora_a_m[flat_j // K, :] @ w13_B[exp, offs_i, :].T * scaling
+#   gate_up_output[s, offs_i] += delta
+#
+# Rank dimension is reduced in BLOCK_R tiles to bound register usage.
+# Grid: (cdiv(I2, BLOCK_I), padded)
+
+
+@triton.jit
+def _sorted_gate_up_b_expand_kernel(
+    lora_a_m,  # (m, MAX_R)
+    w13_B,  # (E, I2, MAX_R) — contiguous
+    safe_ids,  # (m, K) int64
+    sorted_token_ids,  # (padded,) int64  — sorted pos → flat pair
+    gate_up_output,  # output — in-place add
+    scaling_ptr,  # float32 scalar on device
+    route_count,  # int32 — m*K
+    K,  # int32
+    I2: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_I: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    SCATTER: tl.constexpr,  # True: write to flat_j (flat-pair output); False: write to pid_s (sorted output)
+):
+    pid_i = tl.program_id(0)
+    pid_s = tl.program_id(1)
+
+    flat_j = tl.load(sorted_token_ids + pid_s)
+    if flat_j < 0:
+        return
+    if flat_j >= route_count:
+        return
+
+    tok = flat_j // K
+    topk_v = flat_j % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+
+    offs_i = pid_i * BLOCK_I + tl.arange(0, BLOCK_I)
+    i_mask = offs_i < I2
+
+    scaling = tl.load(scaling_ptr).to(tl.float32)
+    acc = tl.zeros((BLOCK_I,), dtype=tl.float32)
+
+    for r_start in range(0, MAX_R, BLOCK_R):
+        kr = r_start + tl.arange(0, BLOCK_R)
+        la = tl.load(lora_a_m + tok * MAX_R + kr).to(tl.float32)  # (BLOCK_R,)
+        B_ptr = w13_B + (exp * I2 + offs_i[:, None]) * MAX_R + kr[None, :]
+        B = tl.load(B_ptr, mask=i_mask[:, None], other=0.0).to(
+            tl.float32
+        )  # (BLOCK_I, BLOCK_R)
+        acc += tl.sum(B * la[None, :], axis=1)
+
+    # SCATTER=True: write to flat-pair position flat_j (non-TMA, flat-pair output).
+    # SCATTER=False: write to sorted position pid_s (TMA sorted output).
+    out_row = flat_j if SCATTER else pid_s
+    out_ptr = gate_up_output + out_row * I2 + offs_i
+    old = tl.load(out_ptr, mask=i_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + acc * scaling, mask=i_mask)
+
+
+def _choose_block_r(max_r: int) -> int:
+    """Largest power-of-2 ≤ 32 that divides max_r."""
+    block_r = min(32, max_r)
+    while max_r % block_r != 0:
+        block_r //= 2
+    return max(block_r, 1)
+
+
+def sorted_gate_up_b_expand(
+    lora_a_m: torch.Tensor,  # (m, R) — already computed
+    w13_B: torch.Tensor,  # (E, I2, R) — per-expert B, contiguous
+    safe_ids: torch.Tensor,  # (m, K) int64
+    sorted_token_ids: torch.Tensor,  # (padded,) int64
+    gate_up_output: torch.Tensor,  # (padded, I2) — in-place add
+    scaling: torch.Tensor,  # () or (1,) float32 device tensor
+    route_count: int,  # = m*K
+    K: int,
+    BLOCK_I: int = 64,
+) -> None:
+    """Fused gate/up expand: lora_a_m @ B[expert].T, add directly to sorted output.
+
+    For TMA-sorted dispatch: output is in sorted expert order (SCATTER=False).
+    """
+    padded, I2 = gate_up_output.shape
+    MAX_R = w13_B.shape[2]
+    BLOCK_R = _choose_block_r(MAX_R)
+    assert w13_B.is_contiguous(), "w13_B must be contiguous for fused kernel"
+
+    grid = (triton.cdiv(I2, BLOCK_I), padded)
+    _sorted_gate_up_b_expand_kernel[grid](
+        lora_a_m,
+        w13_B,
+        safe_ids.to(torch.int64),
+        sorted_token_ids.to(torch.int64),
+        gate_up_output,
+        scaling,
+        route_count,
+        K,
+        I2=I2,
+        MAX_R=MAX_R,
+        BLOCK_I=BLOCK_I,
+        BLOCK_R=BLOCK_R,
+        SCATTER=False,
+        num_warps=4,
+        num_stages=3,
+    )
+
+
+# ── Flat Gate/Up Expand (decode path) ────────────────────────────────────────
+#
+# No sorted_token_ids needed — computes tok = pid_s // K inside the kernel.
+# One block per flat-pair position, processes all m*K positions directly.
+# Replaces: all-experts B GEMM + candidates.gather + route_delta (3 → 1 kernel).
+# Active-expert reads: only the ~51 unique experts' B rows, not all 128.
+
+
+@triton.jit
+def _gate_up_b_expand_kernel(
+    lora_a_m,  # (m, MAX_R)
+    w13_B_buffer,  # full buffer: n_slots × E × I2 × MAX_R (contiguous)
+    slot_ptr,  # (1,) int32 — GPU scalar, dynamic at CUDA-graph replay
+    n_slot_stride,  # int — E × I2 × MAX_R (stride between slots)
+    safe_ids,  # (m, K) int64
+    gate_up_output,  # (m*K, I2) — flat-pair order, in-place add
+    scaling_ptr,  # float32 scalar on device
+    K,  # int32 — topk count
+    I2: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_I: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+):
+    pid_i = tl.program_id(0)
+    pid_s = tl.program_id(1)  # flat-pair index [0 .. m*K-1]
+
+    tok = pid_s // K
+    topk_v = pid_s % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+
+    offs_i = pid_i * BLOCK_I + tl.arange(0, BLOCK_I)
+    i_mask = offs_i < I2
+
+    # Load slot index dynamically (changes at CUDA-graph replay without re-capture).
+    slot = tl.load(slot_ptr).to(tl.int32)
+    # Load scaling from buffer at [slot] — avoids a separate scalings[slot_idx] gather.
+    scaling = tl.load(scaling_ptr + slot).to(tl.float32)
+    acc = tl.zeros((BLOCK_I,), dtype=tl.float32)
+
+    for r_start in range(0, MAX_R, BLOCK_R):
+        kr = r_start + tl.arange(0, BLOCK_R)
+        la = tl.load(lora_a_m + tok * MAX_R + kr).to(tl.float32)
+        # Compute B pointer directly into the full buffer using the slot offset,
+        # avoiding a separate gather copy: buffer[slot, exp, offs_i, kr].
+        B_ptr = (
+            w13_B_buffer
+            + slot * n_slot_stride
+            + (exp * I2 + offs_i[:, None]) * MAX_R
+            + kr[None, :]
+        )
+        B = tl.load(B_ptr, mask=i_mask[:, None], other=0.0).to(tl.float32)
+        acc += tl.sum(B * la[None, :], axis=1)
+
+    out_ptr = gate_up_output + pid_s * I2 + offs_i
+    old = tl.load(out_ptr, mask=i_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + acc * scaling, mask=i_mask)
+
+
+def gate_up_b_expand(
+    lora_a_m: torch.Tensor,  # (m, R) — already computed
+    w13_B_buffer: torch.Tensor,  # (n_slots, E, I2, R) — full buffer, contiguous
+    slot_idx: torch.Tensor,  # (1,) int32 — GPU tensor; dynamic at CUDA-graph replay
+    safe_ids: torch.Tensor,  # (m, K) int64 — expert assignments
+    gate_up_output: torch.Tensor,  # (m*K, I2) — flat-pair order, in-place add
+    scalings: torch.Tensor,  # (n_slots,) float32 — full scalings buffer; kernel loads [slot]
+    BLOCK_I: int = 64,
+) -> None:
+    """Flat per-expert GEMV for decode (no TMA, no sorted_token_ids needed).
+
+    Accepts the FULL (n_slots, E, I2, R) buffer, slot_idx, and the full scalings
+    buffer — the kernel loads both w13_B and scalings via the slot offset, eliminating
+    the separate w13_B gather (~38 µs) and scalings gather (~19 µs) per layer.
+
+    One block per flat-pair position; computes tok = pid_s // K directly.
+    Replaces: all-experts B GEMM + candidates.gather + route_delta (3 → 1 kernel).
+    """
+    m_k, I2 = gate_up_output.shape
+    K = safe_ids.shape[1]
+    # Buffer layout: (n_slots, E, I2, MAX_R).
+    _n_slots, E, _I2, MAX_R = w13_B_buffer.shape
+    n_slot_stride = E * I2 * MAX_R  # elements between consecutive slots
+    BLOCK_R = _choose_block_r(MAX_R)
+    assert (
+        w13_B_buffer.is_contiguous()
+    ), "w13_B_buffer must be contiguous for fused kernel"
+
+    grid = (triton.cdiv(I2, BLOCK_I), m_k)
+    _gate_up_b_expand_kernel[grid](
+        lora_a_m,
+        w13_B_buffer,
+        slot_idx.to(torch.int32),
+        n_slot_stride,
+        safe_ids.to(torch.int64),
+        gate_up_output,
+        scalings,
+        K,
+        I2=I2,
+        MAX_R=MAX_R,
+        BLOCK_I=BLOCK_I,
+        BLOCK_R=BLOCK_R,
+        num_warps=4,
+        num_stages=3,
+    )
+
+
+# ── Down Shrink ───────────────────────────────────────────────────────────────
+#
+# For each sorted position s, for each rank tile pid_r:
+#   exp           = safe_ids[flat_j // K, flat_j % K]
+#   lora_a_out[s, pid_r*BLOCK_R : (pid_r+1)*BLOCK_R]
+#     = intermediate[s, :] @ down_A[exp, pid_r*BLOCK_R : ..., :].T
+#
+# Grid: (padded, cdiv(MAX_R, BLOCK_R))
+# Splitting over rank tiles keeps (BLOCK_R × BLOCK_H) loads bounded in size.
+
+
+@triton.jit
+def _sorted_a_down_shrink_kernel(
+    intermediate,  # (padded, INTER)
+    down_A,  # (E, MAX_R, INTER) — per-expert A, contiguous
+    safe_ids,  # (m, K) int64
+    sorted_token_ids,  # (padded,) int64
+    lora_a_out,  # (padded, MAX_R)
+    route_count,  # int32
+    K,  # int32
+    INTER: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_R: tl.constexpr,  # rank output tile; MAX_R divisible by BLOCK_R
+    BLOCK_H: tl.constexpr,  # INTER tile; INTER divisible by BLOCK_H
+):
+    pid_s = tl.program_id(0)
+    pid_r = tl.program_id(1)
+
+    flat_j = tl.load(sorted_token_ids + pid_s)
+    valid = (flat_j >= 0) & (flat_j < route_count)
+
+    kr = pid_r * BLOCK_R + tl.arange(0, BLOCK_R)
+
+    if not valid:
+        tl.store(
+            lora_a_out + pid_s * MAX_R + kr,
+            tl.zeros((BLOCK_R,), dtype=intermediate.dtype.element_ty),
+        )
+        return
+
+    tok = flat_j // K
+    topk_v = flat_j % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+
+    acc = tl.zeros((BLOCK_R,), dtype=tl.float32)
+
+    for h_start in range(0, INTER, BLOCK_H):
+        kh = h_start + tl.arange(0, BLOCK_H)
+        x = tl.load(intermediate + pid_s * INTER + kh).to(tl.float32)  # (BLOCK_H,)
+        A_ptr = down_A + (exp * MAX_R + kr[:, None]) * INTER + kh[None, :]
+        A = tl.load(A_ptr).to(tl.float32)  # (BLOCK_R, BLOCK_H)
+        acc += tl.sum(A * x[None, :], axis=1)
+
+    tl.store(
+        lora_a_out + pid_s * MAX_R + kr,
+        acc.to(intermediate.dtype.element_ty),
+    )
+
+
+def _choose_block_h(inter: int) -> int:
+    """Largest power-of-2 ≤ 128 that divides inter."""
+    block_h = min(128, inter)
+    while inter % block_h != 0:
+        block_h //= 2
+    return max(block_h, 1)
+
+
+def sorted_a_down_shrink(
+    intermediate: torch.Tensor,  # (padded, INTER)
+    down_A: torch.Tensor,  # (E, MAX_R, INTER)
+    safe_ids: torch.Tensor,  # (m, K) int64
+    sorted_token_ids: torch.Tensor,  # (padded,) int64
+    route_count: int,
+    K: int,
+) -> torch.Tensor:
+    """Fused down shrink: intermediate[s] @ down_A[expert].T for each sorted pos."""
+    padded, INTER = intermediate.shape
+    MAX_R = down_A.shape[1]
+    BLOCK_R = _choose_block_r(MAX_R)
+    BLOCK_H = _choose_block_h(INTER)
+    assert down_A.is_contiguous(), "down_A must be contiguous for fused kernel"
+
+    lora_a = torch.empty(
+        (padded, MAX_R), dtype=intermediate.dtype, device=intermediate.device
+    )
+    grid = (padded, MAX_R // BLOCK_R)
+    _sorted_a_down_shrink_kernel[grid](
+        intermediate,
+        down_A,
+        safe_ids.to(torch.int64),
+        sorted_token_ids.to(torch.int64),
+        lora_a,
+        route_count,
+        K,
+        INTER=INTER,
+        MAX_R=MAX_R,
+        BLOCK_R=BLOCK_R,
+        BLOCK_H=BLOCK_H,
+        num_warps=4,
+        num_stages=2,
+    )
+    return lora_a
+
+
+# ── Flat Down Shrink (decode path) ────────────────────────────────────────────
+#
+# No sorted_token_ids needed — computes tok = pid_s // K inside the kernel.
+# One block per (flat-pair, rank-tile), replaces: select_A gather + einsum.
+# Avoids the (m*K, r, INTER) intermediate created by _select_expert_weights.
+# Grid: (m*K, MAX_R // BLOCK_R)
+
+
+@triton.jit
+def _per_expert_a_shrink_kernel(
+    route_input,  # (m*K, INTER)
+    down_A_buffer,  # full buffer: n_slots × E × MAX_R × INTER (contiguous)
+    slot_ptr,  # (1,) int32 — GPU scalar, dynamic at CUDA-graph replay
+    n_slot_stride,  # int — E × MAX_R × INTER (stride between slots)
+    safe_ids,  # (m, K) int64
+    lora_a_out,  # (m*K, MAX_R)
+    K,  # int32
+    INTER: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+):
+    pid_s = tl.program_id(0)  # flat-pair index
+    pid_r = tl.program_id(1)  # rank tile
+
+    tok = pid_s // K
+    topk_v = pid_s % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+
+    # Load slot index dynamically (changes at CUDA-graph replay without re-capture).
+    slot = tl.load(slot_ptr).to(tl.int32)
+
+    kr = pid_r * BLOCK_R + tl.arange(0, BLOCK_R)
+    acc = tl.zeros((BLOCK_R,), dtype=tl.float32)
+
+    for h_start in range(0, INTER, BLOCK_H):
+        kh = h_start + tl.arange(0, BLOCK_H)
+        x = tl.load(route_input + pid_s * INTER + kh).to(tl.float32)
+        # Compute A pointer directly into the full buffer using the slot offset,
+        # avoiding a separate gather copy: buffer[slot, exp, kr, kh].
+        A_ptr = (
+            down_A_buffer
+            + slot * n_slot_stride
+            + (exp * MAX_R + kr[:, None]) * INTER
+            + kh[None, :]
+        )
+        A = tl.load(A_ptr).to(tl.float32)
+        acc += tl.sum(A * x[None, :], axis=1)
+
+    tl.store(lora_a_out + pid_s * MAX_R + kr, acc.to(route_input.dtype.element_ty))
+
+
+def per_expert_a_shrink(
+    route_input: torch.Tensor,  # (m*K, INTER) — flat-pair intermediate
+    down_A_buffer: torch.Tensor,  # (n_slots, E, MAX_R, INTER) — full buffer, contiguous
+    slot_idx: torch.Tensor,  # (1,) int32 — GPU tensor; dynamic at CUDA-graph replay
+    safe_ids: torch.Tensor,  # (m, K) int64
+    out: torch.Tensor | None = None,  # optional pre-allocated (m*K, MAX_R) output
+) -> torch.Tensor:
+    """Flat per-expert shrink for decode: route_input[j] @ down_A_buffer[slot, exp[j]].T.
+
+    Accepts the FULL (n_slots, E, MAX_R, INTER) buffer and a GPU scalar slot_idx,
+    computing the slot offset inside the kernel.  This eliminates the separate
+    gather copy ``down_A = buffer[slot_idx].squeeze(0)`` (saves ~64 µs/layer).
+
+    Replaces _select_expert_weights gather + einsum without any sorted_token_ids.
+    Returns lora_a (m*K, MAX_R) for the subsequent shared-B GEMM or shared_b_down_expand.
+    """
+    m_k, INTER = route_input.shape
+    # Buffer layout: (n_slots, E, MAX_R, INTER).
+    _n_slots, E, MAX_R, _INTER = down_A_buffer.shape
+    n_slot_stride = E * MAX_R * INTER  # elements between consecutive slots
+    BLOCK_R = _choose_block_r(MAX_R)
+    BLOCK_H = _choose_block_h(INTER)
+    assert down_A_buffer.is_contiguous(), "down_A_buffer must be contiguous"
+
+    if out is None:
+        lora_a = torch.empty(
+            (m_k, MAX_R), dtype=route_input.dtype, device=route_input.device
+        )
+    else:
+        lora_a = out
+    grid = (m_k, MAX_R // BLOCK_R)
+    _per_expert_a_shrink_kernel[grid](
+        route_input,
+        down_A_buffer,
+        slot_idx.to(torch.int32),
+        n_slot_stride,
+        safe_ids.to(torch.int64),
+        lora_a,
+        safe_ids.shape[1],
+        INTER=INTER,
+        MAX_R=MAX_R,
+        BLOCK_R=BLOCK_R,
+        BLOCK_H=BLOCK_H,
+        num_warps=4,
+        num_stages=2,
+    )
+    return lora_a
+
+
+# ── Flat Down Expand (decode path) ────────────────────────────────────────────
+#
+# Fused kernel that takes the lora_a output from per_expert_a_shrink and performs
+# the shared-B GEMM + topk scaling + accumulation in a single pass.
+# Avoids: separate down_B gather copy + standalone GEMM + scale + add.
+#
+# For each (token, topk_v) pair and each hidden chunk:
+#   lora_a_row  = lora_a[tok*K + topk_v, :]               — (MAX_R,)
+#   B_row       = down_B_buffer[slot, 0, offs_h, :]        — (BLOCK_H, MAX_R)
+#   delta_h     = lora_a_row @ B_row.T                     — (BLOCK_H,)
+#   out[tok, topk_v, offs_h] += delta_h * topk_weights[tok, topk_v] * scaling
+#
+# Grid: (m*K, cdiv(H, BLOCK_H))
+
+
+@triton.jit
+def _shared_b_down_expand_kernel(
+    lora_a,  # (m*K, MAX_R)
+    down_B_buffer,  # full buffer: n_slots × 1 × H × MAX_R (contiguous)
+    slot_ptr,  # (1,) int32 — GPU scalar, dynamic at CUDA-graph replay
+    n_slot_stride_B,  # int — H × MAX_R (stride between slots; shared-B has dim0=1)
+    topk_weights,  # (m, K) — topk routing weights
+    scaling_ptr,  # float32 scalar on device
+    down_output,  # (m, K, H) — in-place add
+    K,  # int32 — topk count
+    H: tl.constexpr,  # hidden dimension (constexpr for tl.arange)
+    MAX_R: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+):
+    pid_s = tl.program_id(0)  # flat-pair index [0 .. m*K-1]
+    pid_h = tl.program_id(1)  # hidden chunk index
+
+    tok = pid_s // K
+    topk_v = pid_s % K
+
+    offs_h = pid_h * BLOCK_H + tl.arange(0, BLOCK_H)
+    h_mask = offs_h < H
+
+    # Load slot index dynamically (changes at CUDA-graph replay without re-capture).
+    slot = tl.load(slot_ptr).to(tl.int32)
+    # Load scaling from buffer at [slot] — avoids a separate scalings[slot_idx] gather.
+    scaling = tl.load(scaling_ptr + slot).to(tl.float32)
+    weight = tl.load(topk_weights + tok * K + topk_v).to(tl.float32)
+
+    acc = tl.zeros((BLOCK_H,), dtype=tl.float32)
+
+    for r_start in range(0, MAX_R, BLOCK_R):
+        kr = r_start + tl.arange(0, BLOCK_R)
+        # Load lora_a row tile: lora_a[pid_s, kr].
+        la = tl.load(lora_a + pid_s * MAX_R + kr).to(tl.float32)  # (BLOCK_R,)
+        # Load B tile directly from buffer: buffer[slot, 0, offs_h, kr].
+        # n_slot_stride_B = H × MAX_R (shared-B has expert-dim=1 so no expert offset).
+        B_ptr = (
+            down_B_buffer
+            + slot * n_slot_stride_B
+            + offs_h[:, None] * MAX_R
+            + kr[None, :]
+        )
+        B = tl.load(B_ptr, mask=h_mask[:, None], other=0.0).to(
+            tl.float32
+        )  # (BLOCK_H, BLOCK_R)
+        # delta_h += B @ la  (contract over rank dimension)
+        acc += tl.sum(B * la[None, :], axis=1)
+
+    # Scale by topk weight and adapter scaling, then accumulate.
+    out_ptr = down_output + (tok * K + topk_v) * H + offs_h
+    old = tl.load(out_ptr, mask=h_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + acc * weight * scaling, mask=h_mask)
+
+
+def _choose_block_h_expand(h: int) -> int:
+    """Largest power-of-2 ≤ 64 that divides h (or is the largest divisor ≤ 64)."""
+    block_h = min(64, h)
+    while h % block_h != 0:
+        block_h //= 2
+    return max(block_h, 1)
+
+
+def shared_b_down_expand(
+    lora_a: torch.Tensor,  # (m*K, MAX_R) — output of per_expert_a_shrink
+    down_B_buffer: torch.Tensor,  # (n_slots, 1, H, MAX_R) — full buffer, contiguous
+    slot_idx: torch.Tensor,  # (1,) int32 — GPU tensor; dynamic at CUDA-graph replay
+    down_output: torch.Tensor,  # (m, K, H) or (m*K, H) — in-place add
+    topk_weights: torch.Tensor,  # (m, K) routing weights
+    scalings: torch.Tensor,  # (n_slots,) float32 — full scalings buffer; kernel loads [slot]
+    K: int,
+) -> None:
+    """Fused down expand for decode: lora_a @ down_B[slot, 0].T × weight × scaling.
+
+    Accepts the FULL (n_slots, 1, H, MAX_R) buffer, slot_idx, and the full scalings
+    buffer — eliminates the separate down_B gather and scalings gather per layer.
+
+    Performs the shared-B GEMM, topk-weight scaling, and accumulation into
+    down_output in a single fused kernel.
+    """
+    m_k, MAX_R = lora_a.shape
+    # Buffer layout: (n_slots, 1, H, MAX_R).
+    _n_slots, _one, H, _MAX_R = down_B_buffer.shape
+    # Stride between slots: only 1 expert-slot for shared B, so stride = 1 × H × MAX_R.
+    n_slot_stride_B = H * MAX_R
+    BLOCK_H = _choose_block_h_expand(H)
+    BLOCK_R = _choose_block_r(MAX_R)
+    assert (
+        down_B_buffer.is_contiguous()
+    ), "down_B_buffer must be contiguous for fused kernel"
+
+    # Reshape output to (m*K, H) so the kernel can use a flat pid_s index.
+    out_flat = down_output.view(m_k, H)
+
+    grid = (m_k, triton.cdiv(H, BLOCK_H))
+    _shared_b_down_expand_kernel[grid](
+        lora_a,
+        down_B_buffer,
+        slot_idx.to(torch.int32),
+        n_slot_stride_B,
+        topk_weights,
+        scalings,
+        out_flat,
+        K,
+        H=H,
+        MAX_R=MAX_R,
+        BLOCK_H=BLOCK_H,
+        BLOCK_R=BLOCK_R,
+        num_warps=4,
+        num_stages=3,
+    )
+
+
+# ── Flat A GEMM (decode path) ─────────────────────────────────────────────────
+#
+# Computes lora_a_m = hidden @ w13_A[slot, 0, :, :].T for each token,
+# reading directly from the buffer without a prior gather copy.
+# Replaces: w13_A gather (22 µs) + cuBLAS GEMM (25 µs) → ~5-8 µs per layer.
+#
+# Grid: (m, MAX_R // BLOCK_R)  — one block per (token, rank-tile)
+
+
+@triton.jit
+def _shared_a_shrink_kernel(
+    hidden,  # (m, H)
+    w13_A_buffer,  # full buffer: n_slots × 1 × MAX_R × H (contiguous)
+    slot_ptr,  # (1,) int32 — GPU scalar, dynamic at CUDA-graph replay
+    n_slot_stride_A,  # int — MAX_R × H (stride between slots; shared outer has 1 row)
+    lora_a_out,  # (m, MAX_R)
+    H: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+):
+    pid_m = tl.program_id(0)  # token index
+    pid_r = tl.program_id(1)  # rank tile
+
+    slot = tl.load(slot_ptr).to(tl.int32)
+    kr = pid_r * BLOCK_R + tl.arange(0, BLOCK_R)
+    acc = tl.zeros((BLOCK_R,), dtype=tl.float32)
+
+    for h_start in range(0, H, BLOCK_H):
+        kh = h_start + tl.arange(0, BLOCK_H)
+        x = tl.load(hidden + pid_m * H + kh).to(tl.float32)  # (BLOCK_H,)
+        # buffer[slot, 0, kr, kh]: stride = slot * n_slot_stride_A + kr * H + kh
+        A_ptr = w13_A_buffer + slot * n_slot_stride_A + kr[:, None] * H + kh[None, :]
+        A = tl.load(A_ptr).to(tl.float32)  # (BLOCK_R, BLOCK_H)
+        acc += tl.sum(A * x[None, :], axis=1)
+
+    tl.store(lora_a_out + pid_m * MAX_R + kr, acc.to(hidden.dtype.element_ty))
+
+
+def shared_a_shrink(
+    hidden: torch.Tensor,  # (m, H)
+    w13_A_buffer: torch.Tensor,  # (n_slots, 1, MAX_R, H) — full buffer
+    slot_idx: torch.Tensor,  # (1,) int32 GPU tensor
+    BLOCK_H: int = 128,
+) -> torch.Tensor:
+    """Compute lora_a_m = hidden @ w13_A_buffer[slot, 0, :, :].T without gather.
+
+    Replaces: w13_A gather (22 µs) + cuBLAS GEMM (25 µs) = 47 µs per layer
+    With: single Triton kernel (~5-8 µs), saving ~40 µs × 48 = 1.9 ms.
+    """
+    m, H = hidden.shape
+    _n_slots, _one, MAX_R, _H = w13_A_buffer.shape
+    n_slot_stride_A = MAX_R * H  # stride between slots (1 × MAX_R × H)
+    BLOCK_R = _choose_block_r(MAX_R)
+
+    lora_a = torch.empty((m, MAX_R), dtype=hidden.dtype, device=hidden.device)
+    grid = (m, MAX_R // BLOCK_R)
+    _shared_a_shrink_kernel[grid](
+        hidden,
+        w13_A_buffer,
+        slot_idx.to(torch.int32),
+        n_slot_stride_A,
+        lora_a,
+        H=H,
+        MAX_R=MAX_R,
+        BLOCK_R=BLOCK_R,
+        BLOCK_H=BLOCK_H,
+        num_warps=4,
+        num_stages=2,
+    )
+    return lora_a
+
+
+# ── Per-Expert Gate/Up Expand ─────────────────────────────────────────────────
+#
+# Like gate_up_b_expand but reads lora_a_flat[pid_s] (per flat-pair position)
+# instead of lora_a_m[tok] (shared per token).  Required for per_expert adapters
+# where each expert has its own A matrix → lora_a differs per (token, topk_v) pair.
+#
+# Grid: (cdiv(I2, BLOCK_I), m*K)
+
+
+@triton.jit
+def _per_expert_gate_up_b_expand_kernel(
+    lora_a_flat,  # (m*K, MAX_R) — per flat-pair lora_a (from per_expert_a_shrink w/ hidden)
+    w13_B_buffer,  # full buffer: n_slots × E × I2 × MAX_R (contiguous)
+    slot_ptr,  # (1,) int32
+    n_slot_stride,  # E × I2 × MAX_R
+    safe_ids,  # (m, K) int64
+    gate_up_output,  # (m*K, I2) — in-place add
+    scaling_ptr,  # (n_slots,) float32
+    K,
+    I2: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_I: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+):
+    pid_i = tl.program_id(0)
+    pid_s = tl.program_id(1)  # flat-pair index [0 .. m*K-1]
+
+    tok = pid_s // K
+    topk_v = pid_s % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+
+    offs_i = pid_i * BLOCK_I + tl.arange(0, BLOCK_I)
+    i_mask = offs_i < I2
+
+    slot = tl.load(slot_ptr).to(tl.int32)
+    scaling = tl.load(scaling_ptr + slot).to(tl.float32)
+    acc = tl.zeros((BLOCK_I,), dtype=tl.float32)
+
+    for r_start in range(0, MAX_R, BLOCK_R):
+        kr = r_start + tl.arange(0, BLOCK_R)
+        # Per-position lora_a: lora_a_flat[pid_s] instead of lora_a_m[tok]
+        la = tl.load(lora_a_flat + pid_s * MAX_R + kr).to(tl.float32)
+        B_ptr = (
+            w13_B_buffer
+            + slot * n_slot_stride
+            + (exp * I2 + offs_i[:, None]) * MAX_R
+            + kr[None, :]
+        )
+        B = tl.load(B_ptr, mask=i_mask[:, None], other=0.0).to(tl.float32)
+        acc += tl.sum(B * la[None, :], axis=1)
+
+    out_ptr = gate_up_output + pid_s * I2 + offs_i
+    old = tl.load(out_ptr, mask=i_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + acc * scaling, mask=i_mask)
+
+
+def per_expert_gate_up_b_expand(
+    lora_a_flat: torch.Tensor,  # (m*K, MAX_R) — from per_expert_a_shrink(hidden_flat, w13_A_buf, ...)
+    w13_B_buffer: torch.Tensor,  # (n_slots, E, I2, MAX_R) — full buffer
+    slot_idx: torch.Tensor,  # (1,) int32 GPU tensor
+    safe_ids: torch.Tensor,  # (m, K) int64
+    gate_up_output: torch.Tensor,  # (m*K, I2) — in-place add
+    scalings: torch.Tensor,  # (n_slots,) float32
+    BLOCK_I: int = 64,
+) -> None:
+    """Per-expert gate/up expand for decode: lora_a_flat[j] @ w13_B[slot, e_j].T.
+
+    Replaces the gather-then-einsum path for per_expert adapters.  Accepts the FULL
+    (n_slots, E, I2, MAX_R) buffer and reads the expert offset directly using safe_ids,
+    eliminating the two gather copies (w13_B gather + expert-select gather).
+    """
+    m_k, MAX_R = lora_a_flat.shape
+    _n_slots, E, I2, _MAX_R = w13_B_buffer.shape
+    n_slot_stride = E * I2 * MAX_R
+    BLOCK_R = _choose_block_r(MAX_R)
+    K = safe_ids.shape[1]
+    assert w13_B_buffer.is_contiguous(), "w13_B_buffer must be contiguous"
+
+    grid = (triton.cdiv(I2, BLOCK_I), m_k)
+    _per_expert_gate_up_b_expand_kernel[grid](
+        lora_a_flat,
+        w13_B_buffer,
+        slot_idx.to(torch.int32),
+        n_slot_stride,
+        safe_ids.to(torch.int64),
+        gate_up_output,
+        scalings,
+        K,
+        I2=I2,
+        MAX_R=MAX_R,
+        BLOCK_I=BLOCK_I,
+        BLOCK_R=BLOCK_R,
+        num_warps=4,
+        num_stages=2,
+    )
+
+
+# ── Per-Expert Down Expand ────────────────────────────────────────────────────
+#
+# Like shared_b_down_expand but reads per-expert B: down_B_buffer[slot, e_j, offs_h, :].
+# Required for per_expert adapters where down_B is per-expert (not shared).
+# Eliminates the two gather copies (down_B buffer copy + expert select gather).
+#
+# Grid: (m*K, cdiv(H, BLOCK_H))
+
+
+@triton.jit
+def _per_expert_b_down_expand_kernel(
+    lora_a,  # (m*K, MAX_R)
+    down_B_buffer,  # full buffer: n_slots × E × H × MAX_R (contiguous)
+    slot_ptr,  # (1,) int32
+    n_slot_stride_B,  # E × H × MAX_R
+    safe_ids,  # (m, K) int64
+    topk_weights,  # (m, K)
+    scaling_ptr,  # (n_slots,) float32
+    down_output,  # (m, K, H) — in-place add
+    K,
+    H: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+):
+    pid_s = tl.program_id(0)
+    pid_h = tl.program_id(1)
+
+    tok = pid_s // K
+    topk_v = pid_s % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+
+    offs_h = pid_h * BLOCK_H + tl.arange(0, BLOCK_H)
+    h_mask = offs_h < H
+
+    slot = tl.load(slot_ptr).to(tl.int32)
+    scaling = tl.load(scaling_ptr + slot).to(tl.float32)
+    weight = tl.load(topk_weights + tok * K + topk_v).to(tl.float32)
+
+    acc = tl.zeros((BLOCK_H,), dtype=tl.float32)
+
+    for r_start in range(0, MAX_R, BLOCK_R):
+        kr = r_start + tl.arange(0, BLOCK_R)
+        la = tl.load(lora_a + pid_s * MAX_R + kr).to(tl.float32)
+        # Per-expert B: buffer[slot, exp, offs_h, kr]
+        B_ptr = (
+            down_B_buffer
+            + slot * n_slot_stride_B
+            + (exp * H + offs_h[:, None]) * MAX_R
+            + kr[None, :]
+        )
+        B = tl.load(B_ptr, mask=h_mask[:, None], other=0.0).to(tl.float32)
+        acc += tl.sum(B * la[None, :], axis=1)
+
+    out_ptr = down_output + (tok * K + topk_v) * H + offs_h
+    old = tl.load(out_ptr, mask=h_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + acc * weight * scaling, mask=h_mask)
+
+
+def per_expert_b_down_expand(
+    lora_a: torch.Tensor,  # (m*K, MAX_R) — from per_expert_a_shrink
+    down_B_buffer: torch.Tensor,  # (n_slots, E, H, MAX_R) — full buffer
+    slot_idx: torch.Tensor,  # (1,) int32 GPU tensor
+    safe_ids: torch.Tensor,  # (m, K) int64
+    down_output: torch.Tensor,  # (m, K, H) or (m*K, H) — in-place add
+    topk_weights: torch.Tensor,  # (m, K)
+    scalings: torch.Tensor,  # (n_slots,) float32
+    K: int,
+) -> None:
+    """Per-expert down expand for decode: lora_a[j] @ down_B[slot, e_j].T × weight.
+
+    Eliminates the two gather copies (down_B buffer copy + expert select gather)
+    for per_expert adapters where down_B is per-expert (not shared).
+    """
+    m_k, MAX_R = lora_a.shape
+    _n_slots, E, H, _MAX_R = down_B_buffer.shape
+    n_slot_stride_B = E * H * MAX_R
+    BLOCK_H = _choose_block_h_expand(H)
+    BLOCK_R = _choose_block_r(MAX_R)
+    assert down_B_buffer.is_contiguous(), "down_B_buffer must be contiguous"
+
+    out_flat = down_output.view(m_k, H)
+    grid = (m_k, triton.cdiv(H, BLOCK_H))
+    _per_expert_b_down_expand_kernel[grid](
+        lora_a,
+        down_B_buffer,
+        slot_idx.to(torch.int32),
+        n_slot_stride_B,
+        safe_ids.to(torch.int64),
+        topk_weights,
+        scalings,
+        out_flat,
+        K,
+        H=H,
+        MAX_R=MAX_R,
+        BLOCK_H=BLOCK_H,
+        BLOCK_R=BLOCK_R,
+        num_warps=4,
+        num_stages=2,
+    )
+
+
+# ── Fused A+B Gate/Up (eliminates shared_a_shrink + gate_up_b_expand) ──────
+#
+# Combines hidden @ w13_A + lora_a @ w13_B in one kernel, removing a separate
+# shared_a_shrink launch.  Lora_a is computed per flat-pair block (redundant for
+# k>1 per token) but w13_A fits in L1 so cache hits make this negligible.
+# Grid: (cdiv(I2, BLOCK_I), m*K)
+
+
+@triton.jit
+def _fused_shared_a_b_gate_up_kernel(
+    hidden,  # (m, H)
+    w13_A_buffer,  # (n_slots, 1, MAX_R, H) — contiguous
+    w13_B_buffer,  # (n_slots, E, I2, MAX_R) — contiguous
+    safe_ids,  # (m, K) int64
+    gate_up_output,  # (m*K, I2) — in-place add
+    scalings,  # (n_slots,) float32
+    slot_ptr,  # (1,) int32
+    K,
+    n_A_stride,  # = MAX_R * H
+    n_B_stride,  # = E * I2 * MAX_R
+    H: tl.constexpr,
+    I2: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    BLOCK_I: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+):
+    pid_i = tl.program_id(0)  # I2 chunk
+    pid_s = tl.program_id(1)  # flat-pair index
+
+    slot = tl.load(slot_ptr).to(tl.int32)
+    tok = pid_s // K
+    topk_v = pid_s % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+    scaling = tl.load(scalings + slot).to(tl.float32)
+
+    offs_i = pid_i * BLOCK_I + tl.arange(0, BLOCK_I)
+    i_mask = offs_i < I2
+    acc = tl.zeros((BLOCK_I,), dtype=tl.float32)
+
+    # Outer loop over BLOCK_R chunks of rank: compute lora_a[r:r+BLOCK_R] then expand.
+    # This avoids storing the full lora_a vector when BLOCK_R < MAX_R.
+    for r_start in range(0, MAX_R, BLOCK_R):
+        kr = r_start + tl.arange(0, BLOCK_R)
+
+        # Phase 1 (for this rank chunk): la = hidden[tok] @ w13_A[slot, 0, kr, :].T
+        la = tl.zeros((BLOCK_R,), dtype=tl.float32)
+        for h_start in range(0, H, BLOCK_H):
+            kh = h_start + tl.arange(0, BLOCK_H)
+            x = tl.load(hidden + tok * H + kh).to(tl.float32)
+            A_ptr = w13_A_buffer + slot * n_A_stride + kr[:, None] * H + kh[None, :]
+            A = tl.load(A_ptr).to(tl.float32)
+            la += tl.sum(A * x[None, :], axis=1)
+
+        # Phase 2 (for this rank chunk): acc += la @ w13_B[slot, exp, offs_i, kr].T
+        B_ptr = (
+            w13_B_buffer
+            + slot * n_B_stride
+            + (exp * I2 + offs_i[:, None]) * MAX_R
+            + kr[None, :]
+        )
+        B = tl.load(B_ptr, mask=i_mask[:, None], other=0.0).to(tl.float32)
+        acc += tl.sum(B * la[None, :], axis=1)
+
+    out_ptr = gate_up_output + pid_s * I2 + offs_i
+    old = tl.load(out_ptr, mask=i_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + acc * scaling, mask=i_mask)
+
+
+def fused_shared_a_b_gate_up_expand(
+    hidden: torch.Tensor,  # (m, H)
+    w13_A_buffer: torch.Tensor,  # (n_slots, 1, MAX_R, H)
+    w13_B_buffer: torch.Tensor,  # (n_slots, E, I2, MAX_R)
+    safe_ids: torch.Tensor,  # (m, K) int64
+    gate_up_output: torch.Tensor,  # (m*K, I2) — in-place add
+    scalings: torch.Tensor,  # (n_slots,) float32
+    slot_idx: torch.Tensor,  # (1,) int32
+    BLOCK_I: int = 64,
+    BLOCK_H: int = 128,
+) -> None:
+    """Fused A+B gate/up: eliminates the separate shared_a_shrink kernel launch."""
+    m_k, I2 = gate_up_output.shape
+    m, H = hidden.shape
+    K = safe_ids.shape[1]
+    _ns, _one, MAX_R, _H = w13_A_buffer.shape
+    _ns2, E, _I2, _MAX_R = w13_B_buffer.shape
+    n_A_stride = MAX_R * H
+    n_B_stride = E * I2 * MAX_R
+    BLOCK_R = _choose_block_r(MAX_R)
+    assert w13_A_buffer.is_contiguous() and w13_B_buffer.is_contiguous()
+
+    grid = (triton.cdiv(I2, BLOCK_I), m_k)
+    _fused_shared_a_b_gate_up_kernel[grid](
+        hidden,
+        w13_A_buffer,
+        w13_B_buffer,
+        safe_ids.to(torch.int64),
+        gate_up_output,
+        scalings,
+        slot_idx.to(torch.int32),
+        K,
+        n_A_stride,
+        n_B_stride,
+        H=H,
+        I2=I2,
+        MAX_R=MAX_R,
+        BLOCK_H=BLOCK_H,
+        BLOCK_I=BLOCK_I,
+        BLOCK_R=BLOCK_R,
+        num_warps=4,
+        num_stages=2,
+    )
+
+
+# ── Fused Shrink+Expand Down (eliminates per_expert_a_shrink + shared_b_down_expand) ─
+#
+# Combines ri @ down_A + lora_a @ down_B in one kernel per (flat-pair, H-chunk).
+# Grid: (m*K, cdiv(H, BLOCK_H))
+
+
+@triton.jit
+def _fused_a_b_down_expand_kernel(
+    route_input,  # (m*K, INTER)
+    down_A_buffer,  # (n_slots, E, MAX_R, INTER) — contiguous
+    down_B_buffer,  # (n_slots, 1, H, MAX_R) — contiguous
+    safe_ids,  # (m, K) int64
+    topk_weights,  # (m, K)
+    scalings,  # (n_slots,) float32
+    slot_ptr,  # (1,) int32
+    down_output,  # (m*K, H) — in-place add
+    K,
+    n_A_stride,  # = E * MAX_R * INTER
+    n_B_stride,  # = H * MAX_R
+    INTER: tl.constexpr,
+    H: tl.constexpr,
+    MAX_R: tl.constexpr,
+    BLOCK_H_S: tl.constexpr,  # shrink tile over INTER
+    BLOCK_H_E: tl.constexpr,  # expand tile over H
+):
+    pid_s = tl.program_id(0)  # flat-pair index
+    pid_h = tl.program_id(1)  # H chunk
+
+    slot = tl.load(slot_ptr).to(tl.int32)
+    tok = pid_s // K
+    topk_v = pid_s % K
+    exp = tl.load(safe_ids + tok * K + topk_v).to(tl.int32)
+    weight = tl.load(topk_weights + tok * K + topk_v).to(tl.float32)
+    scaling = tl.load(scalings + slot).to(tl.float32)
+
+    offs_h = pid_h * BLOCK_H_E + tl.arange(0, BLOCK_H_E)
+    h_mask = offs_h < H
+    kr = tl.arange(0, MAX_R)
+
+    # Phase 1: lora_a = ri[pid_s] @ down_A[slot, exp, :, :].T
+    lora_a = tl.zeros((MAX_R,), dtype=tl.float32)
+    for h_start in range(0, INTER, BLOCK_H_S):
+        kh = h_start + tl.arange(0, BLOCK_H_S)
+        x = tl.load(route_input + pid_s * INTER + kh).to(tl.float32)
+        A_ptr = (
+            down_A_buffer
+            + slot * n_A_stride
+            + (exp * MAX_R + kr[:, None]) * INTER
+            + kh[None, :]
+        )
+        A = tl.load(A_ptr).to(tl.float32)
+        lora_a += tl.sum(A * x[None, :], axis=1)
+
+    # Phase 2: delta = lora_a @ down_B[slot, 0, offs_h, :].T * weight * scaling
+    B_ptr = down_B_buffer + slot * n_B_stride + offs_h[:, None] * MAX_R + kr[None, :]
+    B = tl.load(B_ptr, mask=h_mask[:, None], other=0.0).to(tl.float32)
+    delta = tl.sum(B * lora_a[None, :], axis=1) * weight * scaling
+
+    out_ptr = down_output + pid_s * H + offs_h
+    old = tl.load(out_ptr, mask=h_mask, other=0.0).to(tl.float32)
+    tl.store(out_ptr, old + delta, mask=h_mask)
+
+
+def fused_a_b_down_expand(
+    route_input: torch.Tensor,  # (m*K, INTER)
+    down_A_buffer: torch.Tensor,  # (n_slots, E, MAX_R, INTER)
+    down_B_buffer: torch.Tensor,  # (n_slots, 1, H, MAX_R)
+    safe_ids: torch.Tensor,  # (m, K) int64
+    topk_weights: torch.Tensor,  # (m, K)
+    scalings: torch.Tensor,  # (n_slots,) float32
+    slot_idx: torch.Tensor,  # (1,) int32
+    down_output: torch.Tensor,  # (m*K, H) or (m, K, H) — in-place add
+    BLOCK_H_E: int = 64,
+) -> None:
+    """Fused shrink+expand down: eliminates per_expert_a_shrink + shared_b_down_expand launches."""
+    m_k, INTER = route_input.shape
+    _ns, E, MAX_R, _INTER = down_A_buffer.shape
+    _ns2, _one, H, _MAX_R = down_B_buffer.shape
+    K = safe_ids.shape[1]
+    n_A_stride = E * MAX_R * INTER
+    n_B_stride = H * MAX_R
+    BLOCK_H_S = _choose_block_h(INTER)
+    assert down_A_buffer.is_contiguous() and down_B_buffer.is_contiguous()
+
+    out_flat = down_output.view(m_k, H)
+    grid = (m_k, triton.cdiv(H, BLOCK_H_E))
+    _fused_a_b_down_expand_kernel[grid](
+        route_input,
+        down_A_buffer,
+        down_B_buffer,
+        safe_ids.to(torch.int64),
+        topk_weights,
+        scalings,
+        slot_idx.to(torch.int32),
+        out_flat,
+        K,
+        n_A_stride,
+        n_B_stride,
+        INTER=INTER,
+        H=H,
+        MAX_R=MAX_R,
+        BLOCK_H_S=BLOCK_H_S,
+        BLOCK_H_E=BLOCK_H_E,
+        num_warps=4,
+        num_stages=2,
+    )
diff --git a/tokenspeed-kernel/test/ops/test_lora_triton.py b/tokenspeed-kernel/test/ops/test_lora_triton.py
new file mode 100644
index 000000000..67bd234a3
--- /dev/null
+++ b/tokenspeed-kernel/test/ops/test_lora_triton.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import pytest
+import torch
+
+
+@dataclass
+class BatchInfo:
+    bs: int
+    max_len: int
+    seg_lens: torch.Tensor
+    seg_indptr: torch.Tensor
+    weight_indices: torch.Tensor
+    lora_ranks: torch.Tensor
+    scalings: torch.Tensor
+    permutation: torch.Tensor | None = None
+
+
+def _decode_batch(batch_size: int, rank: int, device: str) -> BatchInfo:
+    return BatchInfo(
+        bs=batch_size,
+        max_len=1,
+        seg_lens=torch.ones((batch_size,), dtype=torch.int32, device=device),
+        seg_indptr=torch.arange(batch_size + 1, dtype=torch.int32, device=device),
+        weight_indices=torch.ones((batch_size,), dtype=torch.int32, device=device),
+        lora_ranks=torch.tensor([0, rank], dtype=torch.int32, device=device),
+        scalings=torch.ones((2,), dtype=torch.float32, device=device),
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
+def test_lora_expand_decode_rank_smaller_than_block_k_matches_reference():
+    from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
+
+    device = "cuda"
+    dtype = torch.bfloat16
+    batch_size = 4
+    rank = 8
+    out_dim = 64
+    torch.manual_seed(7)
+    batch_info = _decode_batch(batch_size, rank, device)
+    x = torch.randn((batch_size, rank), dtype=dtype, device=device)
+    weights = torch.randn((2, out_dim, rank), dtype=dtype, device=device)
+    base = torch.randn((batch_size, out_dim), dtype=dtype, device=device)
+
+    out = lora_expand_fwd(x, weights, batch_info, base_output=base.clone())
+    ref = base.float() + x.float() @ weights[1].float().T
+    torch.testing.assert_close(out.float(), ref, rtol=2e-2, atol=2e-1)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
+def test_lora_gate_up_decode_rank_smaller_than_block_k_matches_reference():
+    from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
+        lora_gate_up_expand_fwd,
+    )
+
+    device = "cuda"
+    dtype = torch.bfloat16
+    batch_size = 4
+    rank = 8
+    out_dim = 64
+    torch.manual_seed(8)
+    batch_info = _decode_batch(batch_size, rank, device)
+    x = torch.randn((batch_size, 2 * rank), dtype=dtype, device=device)
+    weights = torch.randn((2, 2 * out_dim, rank), dtype=dtype, device=device)
+    base = torch.randn((batch_size, 2 * out_dim), dtype=dtype, device=device)
+
+    out = lora_gate_up_expand_fwd(
+        x,
+        weights,
+        batch_info,
+        out_dim,
+        base_output=base.clone(),
+    )
+    ref = base.float()
+    ref[:, :out_dim] += x[:, :rank].float() @ weights[1, :out_dim].float().T
+    ref[:, out_dim:] += (
+        x[:, rank : 2 * rank].float() @ weights[1, out_dim : 2 * out_dim].float().T
+    )
+    torch.testing.assert_close(out.float(), ref, rtol=2e-2, atol=2e-1)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
+def test_lora_qkv_decode_rank_smaller_than_block_k_matches_reference():
+    from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import lora_qkv_expand_fwd
+
+    device = "cuda"
+    dtype = torch.bfloat16
+    batch_size = 4
+    rank = 8
+    q_dim = 64
+    kv_dim = 32
+    torch.manual_seed(9)
+    batch_info = _decode_batch(batch_size, rank, device)
+    x = torch.randn((batch_size, 3 * rank), dtype=dtype, device=device)
+    weights = torch.randn((2, q_dim + 2 * kv_dim, rank), dtype=dtype, device=device)
+    base = torch.randn((batch_size, q_dim + 2 * kv_dim), dtype=dtype, device=device)
+    offsets = torch.tensor(
+        [0, q_dim, q_dim + kv_dim, q_dim + 2 * kv_dim],
+        dtype=torch.int32,
+        device=device,
+    )
+
+    out = lora_qkv_expand_fwd(
+        x,
+        weights,
+        batch_info,
+        offsets,
+        q_dim,
+        base_output=base.clone(),
+    )
+    ref = base.float()
+    ref[:, :q_dim] += x[:, :rank].float() @ weights[1, :q_dim].float().T
+    ref[:, q_dim : q_dim + kv_dim] += (
+        x[:, rank : 2 * rank].float() @ weights[1, q_dim : q_dim + kv_dim].float().T
+    )
+    ref[:, q_dim + kv_dim :] += (
+        x[:, 2 * rank : 3 * rank].float() @ weights[1, q_dim + kv_dim :].float().T
+    )
+    torch.testing.assert_close(out.float(), ref, rtol=2e-2, atol=2e-1)
diff --git a/tokenspeed-scheduler/CMakeLists.txt b/tokenspeed-scheduler/CMakeLists.txt
index b0770e477..578729e23 100644
--- a/tokenspeed-scheduler/CMakeLists.txt
+++ b/tokenspeed-scheduler/CMakeLists.txt
@@ -123,6 +123,7 @@ if(TOKENSPEED_SCHEDULER_BUILD_TESTS)
         tests/cpp/test_mamba_eviction.cpp
         tests/cpp/test_mamba_cache.cpp
         tests/cpp/test_mamba_integration.cpp
+        tests/cpp/test_lora_prefix_cache.cpp
         tests/cpp/test_kv_cache_events.cpp
         tests/cpp/test_eviction_lru.cpp
         tests/cpp/test_host_node_ref_lifetime.cpp
diff --git a/tokenspeed-scheduler/bindings/python_module.cpp b/tokenspeed-scheduler/bindings/python_module.cpp
index eaa825b29..e40480b28 100644
--- a/tokenspeed-scheduler/bindings/python_module.cpp
+++ b/tokenspeed-scheduler/bindings/python_module.cpp
@@ -23,7 +23,6 @@
 #include <nanobind/stl/string.h>
 #include <nanobind/stl/map.h>
 #include <nanobind/stl/tuple.h>
-#include <nanobind/stl/unordered_map.h>
 #include <nanobind/stl/variant.h>
 #include <nanobind/stl/vector.h>
 
@@ -151,10 +150,6 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .value("FullHistory", tokenspeed::PagedCacheGroupConfig::Retention::FullHistory)
         .value("SlidingWindow", tokenspeed::PagedCacheGroupConfig::Retention::SlidingWindow);
 
-    nb::enum_<tokenspeed::PagedCacheGroupFamily>(m, "PagedCacheGroupFamily")
-        .value("History", tokenspeed::PagedCacheGroupFamily::History)
-        .value("State", tokenspeed::PagedCacheGroupFamily::State);
-
     nb::class_<tokenspeed::PagedCacheGroupConfig>(m, "PagedCacheGroupConfig")
         .def(nb::init<>())
         .def(
@@ -162,22 +157,19 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
             [](tokenspeed::PagedCacheGroupConfig* self, std::string group_id, std::int32_t rows_per_page,
                std::int32_t entry_stride_tokens, std::int32_t total_pages,
                tokenspeed::PagedCacheGroupConfig::Retention retention,
-               std::optional<std::int32_t> sliding_window_tokens, tokenspeed::PagedCacheGroupFamily family) {
-                new (self) tokenspeed::PagedCacheGroupConfig{
-                    std::move(group_id),   rows_per_page, entry_stride_tokens, total_pages, retention,
-                    sliding_window_tokens, family};
+               std::optional<std::int32_t> sliding_window_tokens) {
+                new (self) tokenspeed::PagedCacheGroupConfig{std::move(group_id), rows_per_page, entry_stride_tokens,
+                                                             total_pages,         retention,     sliding_window_tokens};
             },
             nb::arg("group_id"), nb::arg("rows_per_page"), nb::arg("entry_stride_tokens"), nb::arg("total_pages"),
             nb::arg("retention") = tokenspeed::PagedCacheGroupConfig::Retention::FullHistory,
-            nb::arg("sliding_window_tokens") = std::nullopt,
-            nb::arg("family") = tokenspeed::PagedCacheGroupFamily::History)
+            nb::arg("sliding_window_tokens") = std::nullopt)
         .def_rw("group_id", &tokenspeed::PagedCacheGroupConfig::group_id)
         .def_rw("rows_per_page", &tokenspeed::PagedCacheGroupConfig::rows_per_page)
         .def_rw("entry_stride_tokens", &tokenspeed::PagedCacheGroupConfig::entry_stride_tokens)
         .def_rw("total_pages", &tokenspeed::PagedCacheGroupConfig::total_pages)
         .def_rw("retention", &tokenspeed::PagedCacheGroupConfig::retention)
         .def_rw("sliding_window_tokens", &tokenspeed::PagedCacheGroupConfig::sliding_window_tokens)
-        .def_rw("family", &tokenspeed::PagedCacheGroupConfig::family)
         .def("raw_tokens_per_page", &tokenspeed::PagedCacheGroupConfig::RawTokensPerPage)
         .def("validate", &tokenspeed::PagedCacheGroupConfig::Validate);
 
@@ -200,8 +192,6 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .def("page_ids", &tokenspeed::PagedCacheGroupTable::PageIds, nb::rv_policy::reference_internal)
         .def("size", &tokenspeed::PagedCacheGroupTable::Size)
         .def("active_pages_count", &tokenspeed::PagedCacheGroupTable::ActivePagesCount)
-        .def("owned_pages_count", &tokenspeed::PagedCacheGroupTable::OwnedPagesCount)
-        .def("borrowed_pages_count", &tokenspeed::PagedCacheGroupTable::BorrowedPagesCount)
         .def("released_pages_count", &tokenspeed::PagedCacheGroupTable::ReleasedPagesCount)
         .def("base_logical_page", &tokenspeed::PagedCacheGroupTable::BaseLogicalPage)
         .def("raw_token_cursor", &tokenspeed::PagedCacheGroupTable::RawTokenCursor)
@@ -211,12 +201,6 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .def("is_sliding", &tokenspeed::PagedCacheGroupTable::IsSliding)
         .def("sliding_window_tokens", &tokenspeed::PagedCacheGroupTable::SlidingWindowTokens);
 
-    // Python declares the required group ids only. Scheduler derives LCM and
-    // sliding-window metadata from the matching PagedCacheGroupConfig entries.
-    nb::class_<tokenspeed::PrefixCacheAdjunctSpec>(m, "PrefixCacheAdjunctSpec")
-        .def(nb::init<>())
-        .def_rw("required_groups", &tokenspeed::PrefixCacheAdjunctSpec::required_groups);
-
     scheduler_config.def(nb::init<>())
         .def_rw("page_size", &tokenspeed::SchedulerConfig::page_size)
         .def_rw("max_scheduled_tokens", &tokenspeed::SchedulerConfig::max_scheduled_tokens)
@@ -230,7 +214,6 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
             "num_host_pages", [](const tokenspeed::SchedulerConfig& c) { return c.host_allocator.total_pages; },
             [](tokenspeed::SchedulerConfig& c, std::int32_t v) { c.host_allocator.total_pages = v; })
         .def_rw("paged_cache_groups", &tokenspeed::SchedulerConfig::paged_cache_groups)
-        .def_rw("prefix_cache_adjunct", &tokenspeed::SchedulerConfig::prefix_cache_adjunct)
         .def_rw("disable_l2_cache", &tokenspeed::SchedulerConfig::disable_l2_cache)
         .def_rw("enable_l3_storage", &tokenspeed::SchedulerConfig::enable_l3_storage)
         .def_rw("prefetch_threshold", &tokenspeed::SchedulerConfig::prefetch_threshold)
@@ -248,7 +231,8 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .def_rw("request_id", &tokenspeed::RequestSpec::request_id)
         .def_rw("tokens", &tokenspeed::RequestSpec::tokens)
         .def_rw("rolling_hashes", &tokenspeed::RequestSpec::rolling_hashes)
-        .def_rw("storage_hit_pages", &tokenspeed::RequestSpec::storage_hit_pages);
+        .def_rw("storage_hit_pages", &tokenspeed::RequestSpec::storage_hit_pages)
+        .def_rw("lora_id", &tokenspeed::RequestSpec::lora_id);
 
     nb::module_ forward_event = m.def_submodule("ForwardEvent");
     nb::class_<tokenspeed::forward::ExtendResult>(forward_event, "ExtendResult")
@@ -429,6 +413,7 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .def("get_request_token_size", &tokenspeed::Scheduler::GetRequestTokenSize, nb::arg("id"))
         .def("calc_rolling_hash", &tokenspeed::Scheduler::CalcRollingHash, nb::arg("input_tokens"),
              nb::arg("apply_match") = false)
+        .def("evict_lora_namespace", &tokenspeed::Scheduler::EvictLoraNamespace, nb::arg("lora_id"))
         .def("paged_cache_group_ids", &tokenspeed::Scheduler::PagedCacheGroupIds)
         .def("paged_cache_group_total_pages", &tokenspeed::Scheduler::PagedCacheGroupTotalPages, nb::arg("group_id"))
         .def("paged_cache_group_available_pages", &tokenspeed::Scheduler::PagedCacheGroupAvailablePages,
diff --git a/tokenspeed-scheduler/csrc/fsm/forward_events.cpp b/tokenspeed-scheduler/csrc/fsm/forward_events.cpp
index 83dd0354d..20953b69d 100644
--- a/tokenspeed-scheduler/csrc/fsm/forward_events.cpp
+++ b/tokenspeed-scheduler/csrc/fsm/forward_events.cpp
@@ -106,7 +106,7 @@ void InsertHybridCache(HybridPrefixCache* hybrid_cache,
                        const std::vector<std::span<const std::int32_t>>& full_paged_tokens,
                        std::unique_ptr<DeviceNodeRef>& device_node_ref, LocalKVAllocator* local_kv_allocator,
                        LocalMambaAllocator* local_mamba_allocator, std::int32_t chunk_begin, std::int32_t chunk_size,
-                       std::int32_t page_size) {
+                       std::int32_t page_size, std::int32_t lora_id = kLoraNone) {
     if (hybrid_cache == nullptr) return;
 
     std::vector<std::int32_t> prefix_pages = DevicePagesFromRoot(device_node_ref->Node());
@@ -120,8 +120,9 @@ void InsertHybridCache(HybridPrefixCache* hybrid_cache,
     }
 
     OwnedPages pages_to_insert = local_kv_allocator->TakeFirst(new_page_count);
-    auto insert_result = hybrid_cache->GetKVPrefixCache().Insert<ResourceType::Device>(full_paged_tokens, prefix_pages,
-                                                                                       std::move(pages_to_insert));
+    auto insert_result = hybrid_cache->GetKVPrefixCache().Insert<ResourceType::Device>(
+        full_paged_tokens, prefix_pages, std::move(pages_to_insert),
+        /*page_hashs=*/{}, /*start_node=*/nullptr, lora_id);
 
     if (local_mamba_allocator != nullptr && local_mamba_allocator->HasCheckpoint()) {
         if (ShouldPublishMambaCheckpoint(hybrid_cache, chunk_begin, chunk_size, page_size)) {
@@ -214,7 +215,8 @@ std::variant<PrefillDone, Prefilling> SchedulePrefillEvent::operator()(Prefillin
         paged_tokens.resize(end_of_window_pages);
     }
     InsertHybridCache(hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
-                      local_mamba_allocator.get(), state.window.begin, state.window.size, state.GetPageSize());
+                      local_mamba_allocator.get(), state.window.begin, state.window.size, state.GetPageSize(),
+                      lora_id_);
     // Allocate KV pages for the new chunk
     local_kv_allocator->Acquire(tokens_this_round_);
 
@@ -264,7 +266,8 @@ Decoding ScheduleDecodeEvent::operator()(PrefillDone&& state) {
         paged_tokens.resize(end_of_window_pages);
     }
     InsertHybridCache(hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
-                      local_mamba_allocator.get(), state.window.begin, state.window.size, state.GetPageSize());
+                      local_mamba_allocator.get(), state.window.begin, state.window.size, state.GetPageSize(),
+                      lora_id_);
     // Allocate fresh checkpoint for decode-phase mamba state tracking
     if (hybrid_prefix_cache_ != nullptr && local_mamba_allocator != nullptr) {
         if (!local_mamba_allocator->AllocateCheckpoint()) {
@@ -358,12 +361,12 @@ std::variant<Draining, Finished> FinishEvent::apply(ForwardStateT&& state) {
         OwnedPages alloc_pages = local_allocator->TakeFirst(alloc_count);
 
         kv_prefix_cache_->Insert<ResourceType::Device>(full_paged_tokens, prefix_pages, std::move(alloc_pages),
-                                                       page_hashes_);
+                                                       page_hashes_, /*start_node=*/nullptr, lora_id_);
 
         // Mamba: insert the latest checkpoint snapshot at the terminal node.
         if (hybrid_prefix_cache_ != nullptr && local_mamba_allocator != nullptr &&
             (local_mamba_allocator->HasCheckpoint() || local_mamba_allocator->HasWorking())) {
-            MatchResult post_match = kv_prefix_cache_->Match(full_paged_tokens);
+            MatchResult post_match = kv_prefix_cache_->Match(full_paged_tokens, lora_id_);
             TreeNode* terminal = post_match.device.last_node;
             if (terminal != nullptr && !terminal->HasMamba()) {
                 if (local_mamba_allocator->HasCheckpoint()) {
@@ -376,7 +379,7 @@ std::variant<Draining, Finished> FinishEvent::apply(ForwardStateT&& state) {
     }
     // local_mamba_allocator dropped here — destructor frees remaining slots
 
-    MatchResult match = kv_prefix_cache_->Match(full_paged_tokens);
+    MatchResult match = kv_prefix_cache_->Match(full_paged_tokens, lora_id_);
     if (!disable_l2_cache_ && (match.device.DepthInPage() > match.host.DepthInPage())) {
         std::vector<TreeNode*> write_diff = match.NodesWithout<ResourceType::Host>();
         std::int32_t host_pages_num = 0;
diff --git a/tokenspeed-scheduler/csrc/fsm/forward_events.h b/tokenspeed-scheduler/csrc/fsm/forward_events.h
index 0f42b86b6..1e70b98bb 100644
--- a/tokenspeed-scheduler/csrc/fsm/forward_events.h
+++ b/tokenspeed-scheduler/csrc/fsm/forward_events.h
@@ -35,6 +35,7 @@
 #include "fsm/base_event.h"
 #include "fsm/forward_states.h"
 #include "resource/types.h"
+#include "resource/kv_prefix_cache/kv_prefix_cache.h"
 #include "resource/hybrid_prefix_cache/hybrid_prefix_cache.h"
 #include "resource/allocator/mamba_chunk_allocator.h"
 #include "resource/allocator/local_mamba_allocator.h"
@@ -106,10 +107,11 @@ struct SchedulePrefillFirstChunkEvent : InvalidTransitionHandler<SchedulePrefill
 struct SchedulePrefillEvent : InvalidTransitionHandler<SchedulePrefillEvent> {
     using InvalidTransitionHandler<SchedulePrefillEvent>::operator();
     SchedulePrefillEvent(std::int32_t tokens_this_round, std::int32_t reserve_num_tokens_in_next_schedule_event,
-                         HybridPrefixCache* hybrid_prefix_cache = nullptr)
+                         HybridPrefixCache* hybrid_prefix_cache = nullptr, std::int32_t lora_id = kLoraNone)
         : tokens_this_round_(tokens_this_round),
           reserve_num_tokens_in_next_schedule_event_(reserve_num_tokens_in_next_schedule_event),
-          hybrid_prefix_cache_(hybrid_prefix_cache) {}
+          hybrid_prefix_cache_(hybrid_prefix_cache),
+          lora_id_(lora_id) {}
 
     // Returns PrefillDone (last chunk) or Prefilling (more chunks remain).
     std::variant<PrefillDone, Prefilling> operator()(Prefilling&& state);
@@ -118,13 +120,15 @@ struct SchedulePrefillEvent : InvalidTransitionHandler<SchedulePrefillEvent> {
     std::int32_t tokens_this_round_{};
     std::int32_t reserve_num_tokens_in_next_schedule_event_{};
     HybridPrefixCache* hybrid_prefix_cache_{};
+    std::int32_t lora_id_{kLoraNone};
 };
 
 struct ScheduleDecodeEvent : InvalidTransitionHandler<ScheduleDecodeEvent> {
     using InvalidTransitionHandler<ScheduleDecodeEvent>::operator();
 
-    ScheduleDecodeEvent(std::int32_t decode_input_tokens, HybridPrefixCache* hybrid_prefix_cache = nullptr)
-        : decode_input_tokens_(decode_input_tokens), hybrid_prefix_cache_(hybrid_prefix_cache) {}
+    ScheduleDecodeEvent(std::int32_t decode_input_tokens, HybridPrefixCache* hybrid_prefix_cache = nullptr,
+                        std::int32_t lora_id = kLoraNone)
+        : decode_input_tokens_(decode_input_tokens), hybrid_prefix_cache_(hybrid_prefix_cache), lora_id_(lora_id) {}
 
     Decoding operator()(PrefillDone&& state);
     Decoding operator()(Decoding&& state);
@@ -132,6 +136,7 @@ struct ScheduleDecodeEvent : InvalidTransitionHandler<ScheduleDecodeEvent> {
 private:
     std::int32_t decode_input_tokens_;
     HybridPrefixCache* hybrid_prefix_cache_{};
+    std::int32_t lora_id_{kLoraNone};
 };
 
 struct ScheduleDecodeFromRetractedEvent : InvalidTransitionHandler<ScheduleDecodeFromRetractedEvent> {
@@ -174,12 +179,13 @@ struct FinishEvent : InvalidTransitionHandler<FinishEvent> {
     using InvalidTransitionHandler<FinishEvent>::operator();
     explicit FinishEvent(KVPrefixCache* kv_prefix_cache, PageAllocator* host_allocator,
                          std::vector<std::string> page_hashes = {}, bool disable_l2_cache = false,
-                         HybridPrefixCache* hybrid_prefix_cache = nullptr)
+                         HybridPrefixCache* hybrid_prefix_cache = nullptr, std::int32_t lora_id = kLoraNone)
         : kv_prefix_cache_(kv_prefix_cache),
           host_allocator_(host_allocator),
           page_hashes_(std::move(page_hashes)),
           disable_l2_cache_(disable_l2_cache),
-          hybrid_prefix_cache_(hybrid_prefix_cache) {}
+          hybrid_prefix_cache_(hybrid_prefix_cache),
+          lora_id_(lora_id) {}
 
     // Returns Draining (needs device→host writeback) or Finished.
     std::variant<Draining, Finished> operator()(Decoding&& state);
@@ -197,6 +203,7 @@ struct FinishEvent : InvalidTransitionHandler<FinishEvent> {
     PageAllocator* host_allocator_;
     bool disable_l2_cache_;
     HybridPrefixCache* hybrid_prefix_cache_{};
+    std::int32_t lora_id_{kLoraNone};
 
     template <typename ForwardStateT>
     std::variant<Draining, Finished> apply(ForwardStateT&& state);
diff --git a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp
index 361067454..5db5bec11 100644
--- a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp
+++ b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.cpp
@@ -48,16 +48,16 @@ HybridPrefixCache::HybridPrefixCache(KVPrefixCache& kv_prefix_cache, MambaChunkA
       mamba_eviction_manager_{mamba_allocator},
       mamba_cache_chunk_size_{mamba_cache_chunk_size} {}
 
-MatchResult HybridPrefixCache::Match(const token_vec_t& token_ids, MatchIntent intent) {
-    auto match = kv_prefix_cache_.Match(token_ids, intent);
+MatchResult HybridPrefixCache::Match(const token_vec_t& token_ids, std::int32_t lora_id, MatchIntent intent) {
+    auto match = kv_prefix_cache_.Match(token_ids, lora_id, intent);
     augmentMatch(match);
     augmentMatchPagedCache(match);
     return match;
 }
 
 MatchResult HybridPrefixCache::Match(const std::vector<std::span<const std::int32_t>>& token_pages,
-                                     MatchIntent intent) {
-    auto match = kv_prefix_cache_.Match(token_pages, intent);
+                                     std::int32_t lora_id, MatchIntent intent) {
+    auto match = kv_prefix_cache_.Match(token_pages, lora_id, intent);
     augmentMatch(match);
     augmentMatchPagedCache(match);
     return match;
@@ -231,15 +231,11 @@ std::vector<TransferPair> HybridPrefixCache::PrepareMambaDeviceLoadBack(const st
 }
 
 bool HybridPrefixCache::EnsureMambaCapacityByEvict(std::int32_t num_slots, TreeNode* protected_node) {
-    if (mamba_allocator_ == nullptr) return num_slots <= 0;
     return mamba_eviction_manager_.EnsureCapacity(num_slots, protected_node);
 }
 
 void HybridPrefixCache::InsertMamba(TreeNode* terminal_node, std::unique_ptr<MambaSlot> slot) {
     if (terminal_node == nullptr || slot == nullptr) return;
-    if (mamba_allocator_ == nullptr) {
-        throw std::logic_error("HybridPrefixCache::InsertMamba: mamba adjunct not enabled");
-    }
     const std::int32_t page_size = kv_prefix_cache_.PageSize();
     if (page_size <= 0 || terminal_node->DepthInTokens() % static_cast<std::size_t>(page_size) != 0) {
         throw std::logic_error("HybridPrefixCache::InsertMamba: terminal node is not block-aligned");
@@ -250,10 +246,6 @@ void HybridPrefixCache::InsertMamba(TreeNode* terminal_node, std::unique_ptr<Mam
 
 bool HybridPrefixCache::AttachPagedCacheSnapshotToNode(TreeNode* node, std::unique_ptr<PagedCacheSnapshot> snapshot) {
     if (node == nullptr || snapshot == nullptr) return false;
-    // Compute completeness from what is present. The policy-driven "snapshot
-    // must be full" invariant is enforced upstream by CommitChunk, which only
-    // attaches full snapshots; direct callers (tests, future restore paths)
-    // may attach history-only or state-only snapshots without policy gating.
     snapshot->complete_families.clear();
     bool history_complete = !paged_cache_history_groups_.empty();
     for (const auto& gid : paged_cache_history_groups_) {
@@ -295,9 +287,6 @@ void HybridPrefixCache::OnKVEvict(TreeNode* node) {
             mamba_eviction_manager_.UpdateLeaf(node->Parent());
         }
     }
-    // Passive paged-cache detach on KV LRU drop: returns OwnedPages via RAII;
-    // the chain scan sees the gap because `HasPagedCacheSnapshot()` is false.
-    // Route through DetachPagedCacheSnapshotFromNode to keep membership set in sync.
     if (node->HasPagedCacheSnapshot()) {
         DetachPagedCacheSnapshotFromNode(node);
     }
@@ -387,7 +376,6 @@ void HybridPrefixCache::OnKVDeviceDemote(TreeNode* node) {
 }
 
 std::int32_t HybridPrefixCache::AvailableSlots() const {
-    if (mamba_allocator_ == nullptr) return 0;
     return mamba_allocator_->AvailableSlots();
 }
 
diff --git a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.h b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.h
index a519427ce..96ee4960c 100644
--- a/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.h
+++ b/tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.h
@@ -50,8 +50,9 @@ class HybridPrefixCache {
     HybridPrefixCache(KVPrefixCache& prefix_cache, MambaChunkAllocator* allocator, std::int32_t mamba_cache_chunk_size,
                       MambaHostAllocator* mamba_host_allocator = nullptr);
 
-    MatchResult Match(const token_vec_t& token_ids, MatchIntent intent = MatchIntent::PrefixReuse);
-    MatchResult Match(const std::vector<std::span<const std::int32_t>>& token_pages,
+    MatchResult Match(const token_vec_t& token_ids, std::int32_t lora_id = kLoraNone,
+                      MatchIntent intent = MatchIntent::PrefixReuse);
+    MatchResult Match(const std::vector<std::span<const std::int32_t>>& token_pages, std::int32_t lora_id = kLoraNone,
                       MatchIntent intent = MatchIntent::PrefixReuse);
 
     bool EnsureMambaCapacityByEvict(std::int32_t num_slots, TreeNode* protected_node = nullptr);
diff --git a/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/eviction.h b/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/eviction.h
index 7726fb1e7..b7c135e20 100644
--- a/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/eviction.h
+++ b/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/eviction.h
@@ -173,6 +173,34 @@ std::vector<TreeNode*> ResourceManager<RType>::Evict(std::int32_t num_pages) {
     return evicted_nodes;
 }
 
+template <ResourceType RType>
+void ResourceManager<RType>::EvictSubtree(const std::vector<TreeNode*>& nodes) {
+    for (TreeNode* node : nodes) {
+        bool has_resource;
+        if constexpr (RType == ResourceType::Device) {
+            has_resource = node->OnDevice();
+        } else {
+            has_resource = node->OnHost();
+        }
+        if (!has_resource) continue;
+
+        const auto& res = GetResource<RType>(node);
+        if (!res.IsEvictable()) continue;  // skip locked nodes; freed when request finishes
+
+        auto it = node_time_.find(node);
+        if (it != node_time_.end()) {
+            lru_leaves_.erase({it->second, node});
+            node_time_.erase(it);
+            GetResource<RType>(node).ClearEvictableNotifier();
+        }
+        auto resource_ptr = node->DetachResource<RType>();
+        if (eviction_callback_) {
+            eviction_callback_(node);
+        }
+        // OwnedPages RAII: pages returned to allocator on scope exit.
+    }
+}
+
 template <ResourceType RType>
 std::vector<TreeNode*> ResourceManager<RType>::EnsureCapacity(std::int32_t required_num_pages) {
     if (required_num_pages <= 0) {
diff --git a/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.cpp b/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.cpp
index 6272e0fd8..0667c070b 100644
--- a/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.cpp
+++ b/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.cpp
@@ -125,6 +125,30 @@ KVPrefixCache::KVPrefixCache(PageAllocator* device_allocator, PageAllocator* hos
       enable_l3_storage_(enable_l3_storage),
       disable_prefix_cache_(disable_prefix_cache) {}
 
+TreeNode* KVPrefixCache::getOrCreateLoraRoot(std::int32_t lora_id) {
+    auto& slot = lora_virtual_roots_[lora_id];
+    // Re-create if null or if the node was pruned from the tree (parent == nullptr
+    // while not the real root means it was removed by PruneEmptyByNode).
+    if (slot != nullptr && slot->Parent() != nullptr) {
+        return slot;
+    }
+    // Sentinel page: [-lora_id, 0, ..., 0].  Negative token IDs never appear in
+    // real vocabularies (which are always non-negative), so there is no collision.
+    const std::int32_t page_size = tree_.PageSize();
+    token_vec_t sentinel(page_size, 0);
+    sentinel[0] = -lora_id;
+    auto node = std::make_unique<TreeNode>(sentinel, std::chrono::steady_clock::now());
+    TreeNode* raw = node.get();
+    // Attach an empty DeviceResource so OnDevice() returns true.
+    // This prevents PruneEmptyByNode from removing the virtual root even when
+    // all adapter sequences have been evicted.
+    raw->AttachResource<ResourceType::Device>(std::make_unique<NodeResource<ResourceType::Device>>(OwnedPages{}));
+    token_vec_t key(sentinel.begin(), sentinel.begin() + page_size);
+    tree_.Root()->AddChild(key, std::move(node));
+    slot = raw;
+    return raw;
+}
+
 void KVPrefixCache::SetKvEventSink(KvEventSink sink) {
     kv_event_sink_ = std::move(sink);
     if (!kv_event_sink_) {
@@ -160,7 +184,7 @@ void KVPrefixCache::recordDeviceBlockRemoved(TreeNode* node) {
     }
 }
 
-MatchResult KVPrefixCache::Match(const token_vec_t& token_ids, MatchIntent intent) {
+MatchResult KVPrefixCache::Match(const token_vec_t& token_ids, std::int32_t lora_id, MatchIntent intent) {
     if (disable_prefix_cache_ && intent == MatchIntent::PrefixReuse) {
         const std::int32_t page_size = tree_.PageSize();
         if (token_ids.size() % page_size != 0) {
@@ -176,15 +200,23 @@ MatchResult KVPrefixCache::Match(const token_vec_t& token_ids, MatchIntent inten
                                  std::to_string(token_ids.size()) + "; page_size=" + std::to_string(page_size));
     }
 
-    WalkResult walk_result = tree_.WalkDownUtilMismatch(token_ids, access_time);
+    TreeNode* start_node = resolveStartNode(lora_id);
+    WalkResult walk_result = tree_.WalkDownUtilMismatch(token_ids, access_time, start_node);
     MatchResult& match = walk_result.match;
     match.device.page_size = page_size;
     match.host.page_size = page_size;
+    if (lora_id != kLoraNone) {
+        // The virtual namespace root contributes 1 sentinel page to absolute tree
+        // depth.  Subtract it so callers see the number of real matched token pages.
+        match.device.namespace_depth_offset = 1;
+        match.host.namespace_depth_offset = 1;
+    }
     return match;
 }
 
-MatchResult KVPrefixCache::Match(const std::vector<std::span<const std::int32_t>>& token_pages, MatchIntent intent) {
-    return Match(FlattenPages(token_pages, 0, token_pages.size()), intent);
+MatchResult KVPrefixCache::Match(const std::vector<std::span<const std::int32_t>>& token_pages, std::int32_t lora_id,
+                                 MatchIntent intent) {
+    return Match(FlattenPages(token_pages, 0, token_pages.size()), lora_id, intent);
 }
 
 MatchResult KVPrefixCache::RootMatch() const {
@@ -199,7 +231,7 @@ MatchResult KVPrefixCache::RootMatch() const {
 template <ResourceType RType>
 InsertResult KVPrefixCache::Insert(const token_vec_t& token_ids, const std::vector<std::int32_t>& prefix_pages,
                                    OwnedPages allocator_pages, const std::vector<std::string>& page_hashs,
-                                   TreeNode* start_node) {
+                                   TreeNode* start_node, std::int32_t lora_id) {
     const std::int32_t page_size = tree_.PageSize();
     auto insert_result = InsertResult{
         .last_node = tree_.Root(),
@@ -219,8 +251,12 @@ InsertResult KVPrefixCache::Insert(const token_vec_t& token_ids, const std::vect
     const auto& alloc_ids = allocator_pages.Ids();
     page_ids.insert(page_ids.end(), alloc_ids.begin(), alloc_ids.end());
 
-    WalkResult walk_result =
-        tree_.WalkDownUtilMismatch(token_slice{token_ids.data(), total_pages * page_size}, access_time, start_node);
+    // When start_node is nullptr (no prior match), resolve the LoRA namespace root.
+    // When start_node is provided (continuation from a prior match), the caller
+    // already points into the correct namespace subtree.
+    TreeNode* effective_start = (start_node != nullptr) ? start_node : resolveStartNode(lora_id);
+    WalkResult walk_result = tree_.WalkDownUtilMismatch(token_slice{token_ids.data(), total_pages * page_size},
+                                                        access_time, effective_start);
 
     token_slice mistmatched_tokens = walk_result.remaining_tokens;
     TreeNode* current = walk_result.terminal;
@@ -317,9 +353,10 @@ InsertResult KVPrefixCache::Insert(const token_vec_t& token_ids, const std::vect
 template <ResourceType RType>
 InsertResult KVPrefixCache::Insert(const std::vector<std::span<const std::int32_t>>& token_pages,
                                    const std::vector<std::int32_t>& prefix_pages, OwnedPages allocator_pages,
-                                   const std::vector<std::string>& page_hashs, TreeNode* start_node) {
+                                   const std::vector<std::string>& page_hashs, TreeNode* start_node,
+                                   std::int32_t lora_id) {
     return Insert<RType>(FlattenPages(token_pages, 0, token_pages.size()), prefix_pages, std::move(allocator_pages),
-                         page_hashs, start_node);
+                         page_hashs, start_node, lora_id);
 }
 
 template <ResourceType RType>
@@ -389,24 +426,52 @@ cache_op_id KVPrefixCache::AllocateCacheOpId() {
     return next_op_id_++;
 }
 
-template InsertResult KVPrefixCache::Insert<ResourceType::Device>(const token_vec_t& token_ids,
-                                                                  const std::vector<std::int32_t>& prefix_pages,
-                                                                  OwnedPages allocator_pages,
-                                                                  const std::vector<std::string>& page_hashs,
-                                                                  TreeNode* start_node);
+void KVPrefixCache::EvictLoraNamespace(std::int32_t lora_id) {
+    auto it = lora_virtual_roots_.find(lora_id);
+    if (it == lora_virtual_roots_.end() || it->second == nullptr) {
+        return;
+    }
+    TreeNode* vroot = it->second;
+
+    // Collect all descendant nodes via DFS (excluding the virtual root itself,
+    // which holds no real KV pages).
+    std::vector<TreeNode*> descendants;
+    std::function<void(TreeNode*)> collect = [&](TreeNode* node) {
+        for (auto& [key, child] : node->Children()) {
+            if (!child) continue;
+            descendants.push_back(child.get());
+            collect(child.get());
+        }
+    };
+    collect(vroot);
+
+    // Evict device and host pages. OwnedPages RAII returns them to the allocator.
+    device_.EvictSubtree(descendants);
+    host_.EvictSubtree(descendants);
+
+    // Remove the virtual root from the tree. The unique_ptr cascade destroys the
+    // entire subtree (including any mamba slots attached to those nodes).
+    token_vec_t sentinel(tree_.PageSize(), 0);
+    sentinel[0] = -lora_id;
+    tree_.Root()->RemoveChild(sentinel);
 
-template InsertResult KVPrefixCache::Insert<ResourceType::Host>(const token_vec_t& token_ids,
-                                                                const std::vector<std::int32_t>& prefix_pages,
-                                                                OwnedPages allocator_pages,
-                                                                const std::vector<std::string>& page_hashs,
-                                                                TreeNode* start_node);
+    lora_virtual_roots_.erase(it);
+}
 
+template InsertResult KVPrefixCache::Insert<ResourceType::Device>(const token_vec_t&, const std::vector<std::int32_t>&,
+                                                                  OwnedPages, const std::vector<std::string>&,
+                                                                  TreeNode*, std::int32_t);
+template InsertResult KVPrefixCache::Insert<ResourceType::Host>(const token_vec_t&, const std::vector<std::int32_t>&,
+                                                                OwnedPages, const std::vector<std::string>&, TreeNode*,
+                                                                std::int32_t);
 template InsertResult KVPrefixCache::Insert<ResourceType::Device>(const std::vector<std::span<const std::int32_t>>&,
                                                                   const std::vector<std::int32_t>&, OwnedPages,
-                                                                  const std::vector<std::string>&, TreeNode*);
+                                                                  const std::vector<std::string>&, TreeNode*,
+                                                                  std::int32_t);
 template InsertResult KVPrefixCache::Insert<ResourceType::Host>(const std::vector<std::span<const std::int32_t>>&,
                                                                 const std::vector<std::int32_t>&, OwnedPages,
-                                                                const std::vector<std::string>&, TreeNode*);
+                                                                const std::vector<std::string>&, TreeNode*,
+                                                                std::int32_t);
 
 template bool KVPrefixCache::EnsureCapacityByEvict<ResourceType::Device>(std::int32_t required_num_pages);
 template bool KVPrefixCache::EnsureCapacityByEvict<ResourceType::Host>(std::int32_t required_num_pages);
diff --git a/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.h b/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.h
index 1027d5e42..5f24138c4 100644
--- a/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.h
+++ b/tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.h
@@ -28,6 +28,10 @@
 #include <unordered_set>
 #include <vector>
 
+// kLoraNone is the lora_id value meaning "base model, no adapter".
+// Adapter IDs are positive integers assigned by LoraRegistry.
+static constexpr std::int32_t kLoraNone = 0;
+
 #include "resource/radix_tree/radix_tree.h"
 #include "resource/radix_tree/tree_resource.h"
 #include "resource/types.h"
@@ -47,19 +51,29 @@ class KVPrefixCache {
                   bool disable_prefix_cache = false);
 
     void SetKvEventSink(KvEventSink sink);
-    MatchResult Match(const token_vec_t& token_ids, MatchIntent intent = MatchIntent::PrefixReuse);
-    MatchResult Match(const std::vector<std::span<const std::int32_t>>& token_pages,
+
+    // lora_id = kLoraNone (0) → base model, uses the shared radix tree root.
+    // lora_id > 0          → adapter namespace; a per-adapter virtual root is
+    //                        created on demand so same-adapter requests share the
+    //                        prefix cache while cross-adapter requests never collide.
+    // intent: PrefixReuse honours disable_prefix_cache_ (returns empty match);
+    //         StateRecovery always walks the tree (used to recover state for
+    //         retracted requests even when prefix caching is disabled).
+    MatchResult Match(const token_vec_t& token_ids, std::int32_t lora_id = kLoraNone,
+                      MatchIntent intent = MatchIntent::PrefixReuse);
+    MatchResult Match(const std::vector<std::span<const std::int32_t>>& token_pages, std::int32_t lora_id = kLoraNone,
                       MatchIntent intent = MatchIntent::PrefixReuse);
 
     template <ResourceType RType>
     InsertResult Insert(const token_vec_t& token_ids, const std::vector<std::int32_t>& prefix_pages,
                         OwnedPages allocator_pages = {}, const std::vector<std::string>& page_hashs = {},
-                        TreeNode* start_node = nullptr);
+                        TreeNode* start_node = nullptr, std::int32_t lora_id = kLoraNone);
 
     template <ResourceType RType>
     InsertResult Insert(const std::vector<std::span<const std::int32_t>>& token_pages,
                         const std::vector<std::int32_t>& prefix_pages, OwnedPages allocator_pages = {},
-                        const std::vector<std::string>& page_hashs = {}, TreeNode* start_node = nullptr);
+                        const std::vector<std::string>& page_hashs = {}, TreeNode* start_node = nullptr,
+                        std::int32_t lora_id = kLoraNone);
 
     cache_op_id AllocateCacheOpId();
 
@@ -88,6 +102,13 @@ class KVPrefixCache {
     RadixTree& GetRadixTree() { return tree_; }
     const RadixTree& GetRadixTree() const { return tree_; }
 
+    // Evict all KV pages cached under the given adapter's namespace and remove
+    // the virtual root from the tree. Call this when an adapter is unloaded so
+    // its pages are freed immediately rather than waiting for LRU pressure.
+    // Locked pages (in-flight requests) are skipped and freed when those
+    // requests finish.
+    void EvictLoraNamespace(std::int32_t lora_id);
+
 private:
     MatchResult RootMatch() const;
 
@@ -106,11 +127,25 @@ class KVPrefixCache {
         }
     }
 
+    // Returns (or creates) the virtual root node for the given LoRA adapter.
+    // The virtual root is a child of the real root keyed by a sentinel page
+    // [-lora_id, 0, ..., 0] that is outside any real vocabulary range.
+    // An empty DeviceResource is attached so PruneEmptyByNode never removes it.
+    TreeNode* getOrCreateLoraRoot(std::int32_t lora_id);
+
+    // Resolve the start_node for Match/Insert: nullptr for base model,
+    // per-adapter virtual root for LoRA.
+    TreeNode* resolveStartNode(std::int32_t lora_id) {
+        return (lora_id == kLoraNone) ? nullptr : getOrCreateLoraRoot(lora_id);
+    }
+
     RadixTree tree_;
     DeviceManager device_;
     HostManager host_;
     cache_op_id next_op_id_{1};
     bool enable_l3_storage_{false};
+    // Per-adapter virtual root nodes; keyed by lora_id (> 0).
+    std::unordered_map<std::int32_t, TreeNode*> lora_virtual_roots_;
     KvEventSink kv_event_sink_{};
     std::unordered_set<std::uint64_t> published_device_blocks_;
     bool disable_prefix_cache_{false};
diff --git a/tokenspeed-scheduler/csrc/resource/radix_tree/tree_resource.h b/tokenspeed-scheduler/csrc/resource/radix_tree/tree_resource.h
index f6658f47c..9e4ba1981 100644
--- a/tokenspeed-scheduler/csrc/resource/radix_tree/tree_resource.h
+++ b/tokenspeed-scheduler/csrc/resource/radix_tree/tree_resource.h
@@ -109,6 +109,9 @@ class ResourceManager {
     void UpdateLeaves(TreeNode* node);
     std::vector<TreeNode*> Evict(std::int32_t num_pages);
     std::vector<TreeNode*> EnsureCapacity(std::int32_t required_num_pages);
+    // Evict all pages held by the given nodes (e.g. a LoRA namespace subtree).
+    // Locked nodes are skipped — their pages are freed when the request finishes.
+    void EvictSubtree(const std::vector<TreeNode*>& nodes);
 
     // Called by NodeResource::Unlock() when ref_count transitions 1→0.
     void OnNodeEvictable(TreeNode* node) { updateLeaf(node); }
diff --git a/tokenspeed-scheduler/csrc/resource/types.cpp b/tokenspeed-scheduler/csrc/resource/types.cpp
index 17f046386..45fa350bd 100644
--- a/tokenspeed-scheduler/csrc/resource/types.cpp
+++ b/tokenspeed-scheduler/csrc/resource/types.cpp
@@ -25,11 +25,11 @@
 namespace tokenspeed {
 
 std::int32_t MatchResult::Device::DepthInPage() const {
-    return last_node->DepthInPage(page_size);
+    return last_node->DepthInPage(page_size) - namespace_depth_offset;
 }
 
 std::int32_t MatchResult::Host::DepthInPage() const {
-    return last_node->DepthInPage(page_size);
+    return last_node->DepthInPage(page_size) - namespace_depth_offset;
 }
 
 template <ResourceType RType>
diff --git a/tokenspeed-scheduler/csrc/resource/types.h b/tokenspeed-scheduler/csrc/resource/types.h
index 4d53e5c0b..7ed404087 100644
--- a/tokenspeed-scheduler/csrc/resource/types.h
+++ b/tokenspeed-scheduler/csrc/resource/types.h
@@ -55,12 +55,17 @@ struct MatchResult {
     struct Device {
         TreeNode* last_node;
         std::int32_t page_size{0};
+        // Number of virtual namespace-root pages to subtract from the absolute
+        // tree depth to get the number of real matched token pages.
+        // 0 for base-model requests; 1 for LoRA adapter requests.
+        std::int32_t namespace_depth_offset{0};
         std::int32_t DepthInPage() const;
     } device;
 
     struct Host {
         TreeNode* last_node;
         std::int32_t page_size{0};
+        std::int32_t namespace_depth_offset{0};
         std::int32_t DepthInPage() const;
     } host;
 
diff --git a/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp b/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
index a8ce8f900..6e644c907 100644
--- a/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
+++ b/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
@@ -81,8 +81,9 @@ std::optional<fsm::SchedulePrefillFirstChunkEvent> Scheduler::schedulePrefillFir
     Request* request, std::int32_t remaining, std::int32_t decode_input_tokens, bool disable_l2_cache,
     std::map<std::string, std::int32_t>& simulated_free) {
     if (req_pool_allocator_.AvailableSlots() == 0) return {};
-    MatchResult match_result = hybrid_prefix_cache_ ? hybrid_prefix_cache_->Match(request->GetFullPagedTokens(true))
-                                                    : kv_prefix_cache_.Match(request->GetFullPagedTokens(true));
+    MatchResult match_result = hybrid_prefix_cache_
+                                   ? hybrid_prefix_cache_->Match(request->GetFullPagedTokens(true), request->LoraId())
+                                   : kv_prefix_cache_.Match(request->GetFullPagedTokens(true), request->LoraId());
     std::int32_t loadback_tokens = 0;
     std::int32_t unscheduled = 0;
     std::vector<TreeNode*> loadback_diff;
@@ -227,8 +228,9 @@ std::optional<fsm::ScheduleDecodeFromRetractedEvent> Scheduler::scheduleDecodeFr
 
     MatchResult match_result =
         hybrid_prefix_cache_
-            ? hybrid_prefix_cache_->Match(request->GetFullPagedTokens(true), MatchIntent::StateRecovery)
-            : kv_prefix_cache_.Match(request->GetFullPagedTokens(true), MatchIntent::StateRecovery);
+            ? hybrid_prefix_cache_->Match(request->GetFullPagedTokens(true), request->LoraId(),
+                                          MatchIntent::StateRecovery)
+            : kv_prefix_cache_.Match(request->GetFullPagedTokens(true), request->LoraId(), MatchIntent::StateRecovery);
     std::vector<TreeNode*> loadback_diff = match_result.NodesWithout<ResourceType::Device>();
     std::vector<TreeNode*> mamba_loadback_nodes;
     TreeNode* mamba_recovery_node = nullptr;
@@ -321,7 +323,7 @@ std::optional<fsm::ScheduleRetractEvent> Scheduler::scheduleRetract(Request* req
 
     kv_prefix_cache_.Insert<ResourceType::Device>(full_paged_tokens, prefix_pages, std::move(alloc_pages));
 
-    MatchResult match_result = kv_prefix_cache_.Match(full_paged_tokens, MatchIntent::StateRecovery);
+    MatchResult match_result = kv_prefix_cache_.Match(full_paged_tokens, request->LoraId(), MatchIntent::StateRecovery);
 
     std::unique_ptr<HostNodeRef> temp_lock = std::make_unique<HostNodeRef>(match_result.host.last_node);
     const std::int32_t device_matched3 = match_result.device.DepthInPage();
diff --git a/tokenspeed-scheduler/csrc/scheduler/outside_event_handler.cpp b/tokenspeed-scheduler/csrc/scheduler/outside_event_handler.cpp
index 2279df74a..9c5f31928 100644
--- a/tokenspeed-scheduler/csrc/scheduler/outside_event_handler.cpp
+++ b/tokenspeed-scheduler/csrc/scheduler/outside_event_handler.cpp
@@ -91,9 +91,9 @@ void Scheduler::handleEvent(const pd::FailedEvent& event) {}
 
 void Scheduler::handleEvent(const pd::SucceededEvent& event) {
     std::vector<std::string> page_hashes;
-    requests_.at(event.request_id)
-        ->Apply(fsm::FinishEvent{&kv_prefix_cache_, &host_allocator_, std::move(page_hashes), config_.disable_l2_cache,
-                                 hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr});
+    auto& req = requests_.at(event.request_id);
+    req->Apply(fsm::FinishEvent{&kv_prefix_cache_, &host_allocator_, std::move(page_hashes), config_.disable_l2_cache,
+                                hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr, req->LoraId()});
 }
 
 void Scheduler::handleEvent(const pd::RemotePrefillDoneEvent& event) {
@@ -115,7 +115,8 @@ void Scheduler::handleEvent(const forward::Finish& event) {
             }
         }
         req->Apply(fsm::FinishEvent{&kv_prefix_cache_, &host_allocator_, std::move(page_hashes),
-                                    config_.disable_l2_cache, hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr});
+                                    config_.disable_l2_cache, hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr,
+                                    req->LoraId()});
     }
 }
 
diff --git a/tokenspeed-scheduler/csrc/scheduler/request.cpp b/tokenspeed-scheduler/csrc/scheduler/request.cpp
index 6aaa3c55a..46d5ab1b1 100644
--- a/tokenspeed-scheduler/csrc/scheduler/request.cpp
+++ b/tokenspeed-scheduler/csrc/scheduler/request.cpp
@@ -29,6 +29,7 @@ namespace tokenspeed {
 
 Request::Request(const RequestSpec& spec, std::int32_t page_size, Role role)
     : id_{spec.request_id},
+      lora_id_{spec.lora_id},
       token_container_{spec.tokens},
       page_size_{page_size},
       state_{role == Role::kFused ? fsm::State{fsm::Submitted{&token_container_, page_size}}
diff --git a/tokenspeed-scheduler/csrc/scheduler/request.h b/tokenspeed-scheduler/csrc/scheduler/request.h
index 89b770c68..56bdf2efd 100644
--- a/tokenspeed-scheduler/csrc/scheduler/request.h
+++ b/tokenspeed-scheduler/csrc/scheduler/request.h
@@ -53,6 +53,7 @@ class Request {
     Request(const RequestSpec& spec, std::int32_t page_size, Role role);
 
     std::string Id() const { return id_; }
+    std::int32_t LoraId() const { return lora_id_; }
 
     // Keep Apply the only non-const function in Request
     // The wrapper lambda converts any concrete state type returned by event's operator()
@@ -273,6 +274,7 @@ class Request {
 
 private:
     std::string id_;
+    std::int32_t lora_id_{0};
     TokenContainer token_container_;
     std::int32_t page_size_;
     fsm::State state_;
diff --git a/tokenspeed-scheduler/csrc/scheduler/request_spec.h b/tokenspeed-scheduler/csrc/scheduler/request_spec.h
index eaf85ebda..07a9e28ee 100644
--- a/tokenspeed-scheduler/csrc/scheduler/request_spec.h
+++ b/tokenspeed-scheduler/csrc/scheduler/request_spec.h
@@ -32,6 +32,10 @@ struct RequestSpec {
     std::vector<std::int32_t> tokens;
     std::vector<std::string> rolling_hashes;
     std::int32_t storage_hit_pages{0};
+    // 0 = base model (no adapter).  >0 = LoRA adapter integer ID from
+    // LoraRegistry.  The prefix cache is namespaced per lora_id so adapters
+    // never share KV pages with different LoRA weights.
+    std::int32_t lora_id{0};
 };
 
 struct PrefillInfo {
diff --git a/tokenspeed-scheduler/csrc/scheduler/scheduler.cpp b/tokenspeed-scheduler/csrc/scheduler/scheduler.cpp
index 5df53231d..ef79684ab 100644
--- a/tokenspeed-scheduler/csrc/scheduler/scheduler.cpp
+++ b/tokenspeed-scheduler/csrc/scheduler/scheduler.cpp
@@ -132,6 +132,10 @@ Scheduler::Scheduler(SchedulerConfig config)
     }
 }
 
+void Scheduler::EvictLoraNamespace(std::int32_t lora_id) {
+    kv_prefix_cache_.EvictLoraNamespace(lora_id);
+}
+
 std::vector<KvCacheEvent> Scheduler::DrainKvEvents() {
     std::vector<KvCacheEvent> events;
     events.swap(kv_events_);
diff --git a/tokenspeed-scheduler/csrc/scheduler/scheduler.h b/tokenspeed-scheduler/csrc/scheduler/scheduler.h
index c36c3a413..84fb36d6c 100644
--- a/tokenspeed-scheduler/csrc/scheduler/scheduler.h
+++ b/tokenspeed-scheduler/csrc/scheduler/scheduler.h
@@ -60,6 +60,9 @@ class Scheduler {
 
     void Advance(const ExecutionEvent& event);
     std::vector<KvCacheEvent> DrainKvEvents();
+    // Evict all KV pages cached under the given LoRA adapter's namespace and
+    // remove its virtual root from the prefix tree. Call on adapter unload.
+    void EvictLoraNamespace(std::int32_t lora_id);
 
     std::size_t WaitingSize() const;
     std::size_t DecodingSize() const;
diff --git a/tokenspeed-scheduler/csrc/scheduler/types.h b/tokenspeed-scheduler/csrc/scheduler/types.h
index 2a8310891..892fd79ef 100644
--- a/tokenspeed-scheduler/csrc/scheduler/types.h
+++ b/tokenspeed-scheduler/csrc/scheduler/types.h
@@ -42,8 +42,6 @@ enum class DisaggregationMode {
     kPrefill,
     kDecode,
 };
-// `PagedCacheGroupFamily` and `StateRestorePolicy` are defined in
-// resource/allocator/paged_cache_group.h (transitively included above).
 
 template <ResourceType>
 class NodeRef;
@@ -84,7 +82,6 @@ struct SchedulerConfig {
     } device_allocator;
 
     std::vector<PagedCacheGroupConfig> paged_cache_groups{};
-
     // Unset means paged-cache groups are transport-only.
     std::optional<PrefixCacheAdjunctSpec> prefix_cache_adjunct{};
 
diff --git a/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py b/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
index f2070be85..dc87ada88 100644
--- a/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
+++ b/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
@@ -27,10 +27,8 @@
     ExecutionPlan,
     PagedCacheGroupAllocator,
     PagedCacheGroupConfig,
-    PagedCacheGroupFamily,
     PagedCacheGroupTable,
     PagedCacheRetention,
-    PrefixCacheAdjunctSpec,
     RequestSpec,
     Scheduler,
     SchedulerConfig,
@@ -73,9 +71,7 @@ def _flat_forward_op_repr(self):
     "PagedCacheRetention",
     "PagedCacheGroupConfig",
     "PagedCacheGroupAllocator",
-    "PagedCacheGroupFamily",
     "PagedCacheGroupTable",
-    "PrefixCacheAdjunctSpec",
     # Execution plan & operations
     "ExecutionPlan",
     "Forward",
diff --git a/tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp b/tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp
new file mode 100644
index 000000000..f531ab244
--- /dev/null
+++ b/tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp
@@ -0,0 +1,182 @@
+// Copyright (c) 2026 LightSeek Foundation
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "unit_test_helper.h"
+#include "resource/allocator/page_allocator.h"
+#include "resource/kv_prefix_cache/kv_prefix_cache.h"
+#include "resource/radix_tree/tree_node.h"
+#include "resource/types.h"
+
+namespace tokenspeed::test {
+
+class LoraPrefixCacheTest : public ::testing::Test {
+protected:
+    static constexpr int32_t kPageSize = 4;
+    static constexpr int32_t kTotalPages = 128;
+
+    void SetUp() override {
+        device_alloc_ = std::make_unique<PageAllocator>(kPageSize, kTotalPages);
+        cache_ = std::make_unique<KVPrefixCache>(device_alloc_.get(), /*host=*/nullptr);
+    }
+
+    // Insert N pages for a given token sequence under a given lora_id.
+    InsertResult DoInsert(int32_t num_pages, token_t start_token, int32_t lora_id) {
+        auto tokens = MakeAlignedTokens(num_pages, kPageSize, start_token);
+        auto pages = device_alloc_->Allocate(num_pages);
+        return cache_->Insert<ResourceType::Device>(tokens, /*prefix_pages=*/{}, std::move(pages),
+                                                    /*page_hashs=*/{}, /*start_node=*/nullptr, lora_id);
+    }
+
+    // Return the matched device depth (in pages) for a given sequence + lora_id.
+    int32_t MatchDepth(int32_t num_pages, token_t start_token, int32_t lora_id) {
+        auto tokens = MakeAlignedTokens(num_pages, kPageSize, start_token);
+        return cache_->Match(tokens, lora_id).device.DepthInPage();
+    }
+
+    std::unique_ptr<PageAllocator> device_alloc_;
+    std::unique_ptr<KVPrefixCache> cache_;
+};
+
+// ---------------------------------------------------------------------------
+// Same adapter reuses prefix cache (intra-adapter sharing)
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, SameAdapterReusesPrefixCache) {
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
+    // A second request with the same adapter and same tokens should hit the cache.
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 2);
+}
+
+// ---------------------------------------------------------------------------
+// Different adapters do not share cache entries (cross-adapter isolation)
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, DifferentAdaptersDontShareCache) {
+    // Insert tokens [1..8] under adapter 1.
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
+    // Adapter 2 has no entry for the same tokens — expect 0 hit.
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/2), 0);
+}
+
+// ---------------------------------------------------------------------------
+// Base model (lora_id=0) is independent of any adapter namespace
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, BaseModelIndependentOfAdapters) {
+    // Insert under adapter 1 and the base model with the same tokens.
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/kLoraNone);
+
+    // Each namespace sees only its own entries.
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 2);
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/kLoraNone), 2);
+
+    // Adapter 2 still gets nothing for these tokens.
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/2), 0);
+}
+
+// ---------------------------------------------------------------------------
+// Multiple adapters each cache independently
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, MultipleAdaptersCacheIndependently) {
+    // Insert different sequences for three different adapters.
+    DoInsert(1, /*start_token=*/100, /*lora_id=*/1);
+    DoInsert(1, /*start_token=*/200, /*lora_id=*/2);
+    DoInsert(1, /*start_token=*/300, /*lora_id=*/3);
+
+    EXPECT_EQ(MatchDepth(1, 100, /*lora_id=*/1), 1);
+    EXPECT_EQ(MatchDepth(1, 200, /*lora_id=*/2), 1);
+    EXPECT_EQ(MatchDepth(1, 300, /*lora_id=*/3), 1);
+
+    // Cross-adapter: each adapter sees 0 for the others' tokens.
+    EXPECT_EQ(MatchDepth(1, 200, /*lora_id=*/1), 0);
+    EXPECT_EQ(MatchDepth(1, 100, /*lora_id=*/2), 0);
+}
+
+// ---------------------------------------------------------------------------
+// InsertResult.last_node stays within the adapter namespace
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, InsertLastNodeIsInAdapterNamespace) {
+    auto result1 = DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
+    auto result2 = DoInsert(2, /*start_token=*/1, /*lora_id=*/2);
+    // last_nodes should be distinct (different subtrees).
+    EXPECT_NE(result1.last_node, result2.last_node);
+    EXPECT_NE(result1.last_node, nullptr);
+    EXPECT_NE(result2.last_node, nullptr);
+}
+
+// ---------------------------------------------------------------------------
+// Eviction only evicts within the same namespace
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, EvictionDoesNotCrossNamespaces) {
+    const int32_t initial = device_alloc_->AvailablePages();
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/2);
+    ASSERT_EQ(device_alloc_->AvailablePages(), initial - 4);
+
+    // Evict everything.
+    cache_->EnsureCapacityByEvict<ResourceType::Device>(initial);
+    EXPECT_EQ(device_alloc_->AvailablePages(), initial);
+
+    // Both namespaces should now have empty caches.
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 0);
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/2), 0);
+}
+
+// ---------------------------------------------------------------------------
+// EvictLoraNamespace: pages freed immediately on adapter unload
+// ---------------------------------------------------------------------------
+
+TEST_F(LoraPrefixCacheTest, EvictLoraNamespaceFreesPagesImmediately) {
+    const int32_t initial = device_alloc_->AvailablePages();
+
+    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
+    DoInsert(3, /*start_token=*/50, /*lora_id=*/2);
+    ASSERT_EQ(device_alloc_->AvailablePages(), initial - 5);
+
+    // Evict adapter 1's namespace only.
+    cache_->EvictLoraNamespace(1);
+    EXPECT_EQ(device_alloc_->AvailablePages(), initial - 3);
+
+    // Adapter 1's cache is gone; adapter 2's is untouched.
+    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 0);
+    EXPECT_EQ(MatchDepth(3, 50, /*lora_id=*/2), 3);
+
+    // Evict adapter 2; all pages returned.
+    cache_->EvictLoraNamespace(2);
+    EXPECT_EQ(device_alloc_->AvailablePages(), initial);
+}
+
+TEST_F(LoraPrefixCacheTest, EvictLoraNamespaceIdempotent) {
+    DoInsert(1, /*start_token=*/1, /*lora_id=*/5);
+    cache_->EvictLoraNamespace(5);
+    // Second call on a removed namespace must not crash.
+    EXPECT_NO_THROW(cache_->EvictLoraNamespace(5));
+    // Call on a namespace that was never created must not crash.
+    EXPECT_NO_THROW(cache_->EvictLoraNamespace(99));
+}
+
+}  // namespace tokenspeed::test

From 7f0e675b92e63f355f17c2153b7fbc9d494964e4 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:19:54 +0000
Subject: [PATCH 02/19] =?UTF-8?q?chore:=20remove=20dco.yml=20=E2=80=94=20n?=
 =?UTF-8?q?o=20longer=20needed=20after=20squash?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Single commit with valid Signed-off-by makes the remediation config
unnecessary.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .github/dco.yml | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 .github/dco.yml

diff --git a/.github/dco.yml b/.github/dco.yml
deleted file mode 100644
index 7993b95cc..000000000
--- a/.github/dco.yml
+++ /dev/null
@@ -1,2 +0,0 @@
-allowRemediationCommits:
-  individual: true

From 98adfca79384bdb8a3653befde1b49a5b73804d3 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:22:25 +0000
Subject: [PATCH 03/19] chore: move benchmark files to qywu/lora-dev branch

Removes all benchmark scripts and result files from the PR branch.
They remain on qywu/lora-dev for development use.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 0520_results.md                           |  71 --
 0521_moe_lora_results.md                  |  53 --
 0522_results.md                           | 129 ----
 bench_chunked_sgmv.py                     | 817 ----------------------
 bench_kernel_opt.py                       | 141 ----
 bench_vs_vllm.py                          | 294 --------
 benchmark/bench_fused_moe_lora_e2e.py     | 120 ----
 benchmark/bench_fused_moe_lora_kernels.py | 381 ----------
 benchmark/bench_lm_head_lora_decode.py    | 281 --------
 benchmark/bench_moe_lora_decode.py        | 380 ----------
 benchmark/bench_moe_lora_retry.py         | 372 ----------
 benchmark/bench_triton_expand_kernel.py   | 192 -----
 benchmark/nsys_decode_target.py           | 126 ----
 benchmark/profile_decode.py               | 179 -----
 benchmark/profile_lm_head_lora.py         | 130 ----
 benchmark/test_lora_batch.py              | 126 ----
 benchmark/test_lora_dynamic.py            | 150 ----
 benchmark/test_lora_e2e.py                | 165 -----
 benchmark/test_lora_eviction_latency.py   | 156 -----
 profile_expand.py                         | 274 --------
 20 files changed, 4537 deletions(-)
 delete mode 100644 0520_results.md
 delete mode 100644 0521_moe_lora_results.md
 delete mode 100644 0522_results.md
 delete mode 100644 bench_chunked_sgmv.py
 delete mode 100644 bench_kernel_opt.py
 delete mode 100644 bench_vs_vllm.py
 delete mode 100644 benchmark/bench_fused_moe_lora_e2e.py
 delete mode 100644 benchmark/bench_fused_moe_lora_kernels.py
 delete mode 100644 benchmark/bench_lm_head_lora_decode.py
 delete mode 100644 benchmark/bench_moe_lora_decode.py
 delete mode 100644 benchmark/bench_moe_lora_retry.py
 delete mode 100644 benchmark/bench_triton_expand_kernel.py
 delete mode 100644 benchmark/nsys_decode_target.py
 delete mode 100644 benchmark/profile_decode.py
 delete mode 100644 benchmark/profile_lm_head_lora.py
 delete mode 100644 benchmark/test_lora_batch.py
 delete mode 100644 benchmark/test_lora_dynamic.py
 delete mode 100644 benchmark/test_lora_e2e.py
 delete mode 100644 benchmark/test_lora_eviction_latency.py
 delete mode 100644 profile_expand.py

diff --git a/0520_results.md b/0520_results.md
deleted file mode 100644
index c793064ea..000000000
--- a/0520_results.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# LoRA Decode Benchmark — 2026-05-20
-
-**Model:** `Qwen/Qwen3-8B` · **bs=8** · **output\_tokens=200** · 5 bench iters · rank=16 · n\_slots=8 · H100 80GB
-**Adapters:** `togethercomputer/Qwen3-8B-LoRA-Password-Adapters`
-**n\_active:** distinct LoRA adapters in the batch (0 = enable\_lora but all requests use base model)
-
----
-
-## TP1 — All Adapter Types
-
-| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
-|---|---:|---:|---:|
-| baseline (no LoRA) · eager | 40.1 | 53.7 | 429.5 |
-| baseline (no LoRA) · cudagraph | 27.7 | 141.4 | 1131.0 |
-| **attn** · eager · n\_active=0 | 40.6 | 52.9 | 423.2 |
-| **attn** · eager · n\_active=1 | 55.5 | 36.7 | 293.8 |
-| **attn** · eager · n\_active=8 | 56.2 | 35.9 | 287.2 |
-| **attn** · cudagraph · n\_active=0 | 27.2 | 134.7 | 1077.6 |
-| **attn** · cudagraph · n\_active=1 | 35.9 | 133.8 | 1070.2 |
-| **attn** · cudagraph · n\_active=8 | 35.4 | 133.6 | 1068.8 |
-| **mlp** · eager · n\_active=0 | 38.8 | 54.1 | 433.0 |
-| **mlp** · eager · n\_active=1 | 55.2 | 37.1 | 296.7 |
-| **mlp** · eager · n\_active=8 | 55.5 | 36.2 | 289.6 |
-| **mlp** · cudagraph · n\_active=0 | 28.2 | 134.5 | 1075.5 |
-| **mlp** · cudagraph · n\_active=1 | 36.9 | 133.4 | 1066.5 |
-| **mlp** · cudagraph · n\_active=8 | 37.0 | 133.3 | 1066.3 |
-| **lm\_head** · eager · n\_active=0 | 39.4 | 53.5 | 428.2 |
-| **lm\_head** · eager · n\_active=1 | 40.1 | 51.8 | 414.4 |
-| **lm\_head** · eager · n\_active=8 | 40.3 | 51.5 | 411.9 |
-| **lm\_head** · cudagraph · n\_active=0 | 28.1 | 133.9 | 1071.0 |
-| **lm\_head** · cudagraph · n\_active=1 | 28.8 | 134.3 | 1074.2 |
-| **lm\_head** · cudagraph · n\_active=8 | 28.7 | 134.0 | 1071.9 |
-
----
-
-## TP1 vs TP2 — lm\_head LoRA
-
-| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
-|---|---:|---:|---:|
-| baseline tp1 · eager | 40.1 | 53.9 | 430.9 |
-| baseline tp1 · cudagraph | 28.2 | 141.3 | 1130.4 |
-| baseline tp2 · eager | 97.0 | 47.9 | 382.9 |
-| baseline tp2 · cudagraph | 29.1 | 206.6 | **1651.9** |
-| lm\_head tp1 · cudagraph · n\_active=0 | 28.0 | 134.5 | 1075.7 |
-| lm\_head tp1 · cudagraph · n\_active=1 | 28.8 | 134.3 | 1074.1 |
-| lm\_head tp1 · cudagraph · n\_active=8 | 28.9 | 134.0 | 1071.9 |
-| lm\_head tp2 · cudagraph · n\_active=0 | 29.6 | 194.8 | 1557.7 |
-| lm\_head tp2 · cudagraph · n\_active=1 | 29.7 | 194.6 | 1556.0 |
-| lm\_head tp2 · cudagraph · n\_active=8 | 28.8 | 194.3 | 1553.4 |
-
----
-
-## Summary
-
-| | eager tput | cudagraph tput | LoRA overhead (cudagraph) | TTFT (cudagraph) |
-|---|---:|---:|---:|---:|
-| baseline tp1 | 429.5 | 1131.0 | — | 27–28 ms |
-| attn LoRA tp1 | ~290 (−32%) | ~1069 (−5%) | −5% | 35–36 ms (+8 ms) |
-| mlp LoRA tp1 | ~293 (−32%) | ~1066 (−6%) | −6% | 37 ms (+9 ms) |
-| lm\_head LoRA tp1 | ~413 (−4%) | ~1073 (−5%) | −5% | 29 ms (+1 ms) |
-| baseline tp2 | 382.9 | 1651.9 | — | 29 ms |
-| lm\_head LoRA tp2 | — | ~1555 (−6%) | −6% | 29–30 ms |
-
-**TP2 vs TP1 cudagraph speedup:** 1.46× (NCCL all-reduce prevents ideal 2×)
-
-### Key findings
-
-- **Eager mode**: attn/mlp LoRA costs ~32% throughput (Triton segmented-GEMM runs 36× per step, once per layer); lm\_head LoRA costs only ~4% (single matmul applied once)
-- **Cudagraph**: all adapter types converge to ~5–6% overhead vs baseline — graph capture amortises per-layer Python launch cost
-- **TTFT**: attn/mlp add ~8–9 ms even with cudagraph (LoRA kernels baked into the prefill graph across 36 layers); lm\_head adds <2 ms
-- **n\_active 1→8**: negligible throughput difference under cudagraph (within 0.3%); in eager, ~2–3% degradation going from 1 to 8 distinct adapters
diff --git a/0521_moe_lora_results.md b/0521_moe_lora_results.md
deleted file mode 100644
index c9b230887..000000000
--- a/0521_moe_lora_results.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# MoE LoRA Decode Benchmark — 2026-05-22
-
-**Model:** `Qwen/Qwen3-30B-A3B-Instruct-2507` · **bs=8** · **output_tokens=200** · 5 bench iters · rank=16 · max_loras=2 · H100 80GB
-
-**n_active:** distinct LoRA adapters in batch (0 = enable_lora, all base model)
-
-> MoE LoRA buffers ~1.96 GB/slot; max_loras=2 on 80 GB H100 with 30B model. gpu_util=0.86 for cudagraph+LoRA.
-
-## TP1 Eager
-
-| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
-|---|---:|---:|---:|
-| baseline tp1 eager | 99.5 | 28.5 | 228.1 |
-| baseline triton tp1 eager | 169.9 | 22.9 | 183.2 |
-| per_expert tp1 eager n_active=0 | ERR | ERR | ERR |
-| per_expert tp1 eager n_active=1 | ERR | ERR | ERR |
-| per_expert tp1 eager n_active=2 | ERR | ERR | ERR |
-| sglang_shared tp1 eager n_active=0 | ERR | ERR | ERR |
-| sglang_shared tp1 eager n_active=1 | ERR | ERR | ERR |
-| sglang_shared tp1 eager n_active=2 | ERR | ERR | ERR |
-
-## TP1 CUDA Graph
-
-| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
-|---|---:|---:|---:|
-| baseline tp1 cudagraph | ERR | ERR | ERR |
-| baseline triton tp1 cudagraph | ERR | ERR | ERR |
-| per_expert tp1 cudagraph n_active=0 | ERR | ERR | ERR |
-| per_expert tp1 cudagraph n_active=1 | ERR | ERR | ERR |
-| per_expert tp1 cudagraph n_active=2 | ERR | ERR | ERR |
-| sglang_shared tp1 cudagraph n_active=0 | ERR | ERR | ERR |
-| sglang_shared tp1 cudagraph n_active=1 | ERR | ERR | ERR |
-| sglang_shared tp1 cudagraph n_active=2 | ERR | ERR | ERR |
-
-## TP2 Eager
-
-| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
-|---|---:|---:|---:|
-| baseline tp2 eager | ERR | ERR | ERR |
-| baseline triton tp2 eager | ERR | ERR | ERR |
-| per_expert tp2 eager n_active=0 | ERR | ERR | ERR |
-| per_expert tp2 eager n_active=1 | ERR | ERR | ERR |
-| per_expert tp2 eager n_active=8 | ERR | ERR | ERR |
-
-## TP2 CUDA Graph
-
-| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |
-|---|---:|---:|---:|
-| baseline tp2 cudagraph | ERR | ERR | ERR |
-| baseline triton tp2 cudagraph | ERR | ERR | ERR |
-| per_expert tp2 cudagraph n_active=0 | ERR | ERR | ERR |
-| per_expert tp2 cudagraph n_active=1 | ERR | ERR | ERR |
-| per_expert tp2 cudagraph n_active=8 | ERR | ERR | ERR |
diff --git a/0522_results.md b/0522_results.md
deleted file mode 100644
index 89da2ba50..000000000
--- a/0522_results.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# MoE LoRA Optimization Results — 2026-05-22 (updated 2026-05-23)
-
-**Model:** `Qwen/Qwen3-30B-A3B-Instruct-2507` · **bs=8** · **output\_tokens=200** · H100 80GB
-**LoRA:** rank=16 · max\_loras=2 · TP=2 · CUDA graph mode
-**Adapter format:** sglang\_shared (shared outer A, per-expert B for gate/up; per-expert A, shared B for down)
-
----
-
-## Final Results (with fused Triton kernels)
-
-| Configuration | tput (tok/s) | step (ms) | overhead |
-|---|---:|---:|---:|
-| **baseline** (no LoRA, triton) | **1394** | **5.74** | — |
-| **n\_active=0** (LoRA loaded, inactive) | 1398 | 5.75 | **+0.01ms ✓** |
-| **n\_active=1** (fused kernels) | **1107** | **7.22** | **+1.48ms** |
-
-n\_active=0 matches baseline — loading an adapter costs nothing in decode.
-n\_active=1 overhead: **1.48ms** = 26% of baseline step time.
-
----
-
-## Decode Throughput Progress
-
-Starting from 809 tok/s (no Triton fused kernels, plain PyTorch LoRA):
-
-| Optimization | tput | step | overhead | Δ overhead |
-|---|---:|---:|---:|---:|
-| Baseline (no fused kernels) | 809 | 9.89ms | 4.12ms | — |
-| + flat gate/up kernel | 818 | 9.78ms | 3.99ms | −130μs |
-| + flat down shrink kernel | 827 | 9.68ms | 3.93ms | −60μs |
-| + buffer+slot (no gather copies) | 927 | 8.63ms | 2.90ms | −1.03ms |
-| + flat\_a\_gemm + scalings buffer | **1107** | **7.22ms** | **1.50ms** | **−1.40ms** |
-
-**Total: +36.8% tput, −63.6% LoRA overhead (4.12ms → 1.50ms)**
-
----
-
-## Fused Triton Kernels
-
-All kernels live in `tokenspeed-kernel/python/tokenspeed_kernel/ops/moe_lora/__init__.py`.
-Integration is in `python/tokenspeed/runtime/lora/moe_lora.py`.
-
-### 1. `compact_gate_up_expand` — flat per-expert GEMV (decode gate/up)
-
-Replaces the all-experts GEMM + candidates.gather + route\_delta chain (3 separate ops):
-```python
-# Old (3 ops, reads all 128 experts' B data = 12.6 MB):
-candidates = (lora_a_m @ w13_B.permute(2,0,1).reshape(r, E*I2)).view(m, E, I2)
-delta = candidates.gather(1, safe_ids.unsqueeze(-1).expand(...))
-_add_route_delta(gate_up_output, delta, ...)
-
-# New (1 op, reads only active experts' B = ~5 MB, −60% bandwidth):
-compact_gate_up_expand(lora_a_m, w13_B_buffer, slot_idx, safe_ids, gate_up_output, scalings)
-```
-
-Grid: `(I2//BLOCK_I, m*k)` — one block per flat-pair position. Computes `tok = pid_s // K`
-directly inside the kernel. CUDA-graph compatible: reads `w13_B_buffer[slot]` and
-`scalings[slot]` from device tensors without separate gather copies.
-
-**Microbenchmark:** 20μs vs 69μs (3.4×) for the gate/up B expand step.
-
-### 2. `flat_a_gemm` — A GEMM from buffer
-
-Computes `lora_a_m = hidden @ w13_A_buffer[slot, 0, :, :].T` directly from the weight
-buffer, eliminating:
-- `w13_A = w13_A_buffers[layer][slot_idx].squeeze(0)` — 22μs gather copy
-- `hidden @ w13_A[0].T` — 25μs cuBLAS GEMM (inefficient for m=8)
-
-Grid: `(m, R//BLOCK_R)` — one block per token. With m=8 and R=32 fitting in L1 cache
-across the 8 blocks, the kernel runs in ~5-8μs total.
-
-**Savings:** 47μs/layer × 48 = **2.26ms** isolated.
-
-### 3. `flat_down_shrink` — per-expert shrink from buffer
-
-Replaces `_select_expert_weights(down_A, safe_ids) + einsum("mki,mkri->mkr", ...)`:
-- Avoids the `(m*k, r, INTER)` = 1.5 MB intermediate tensor
-- Reads `down_A_buffer[slot, exp, :, :]` directly for each flat pair
-
-**Microbenchmark:** 23μs vs 54μs (2.4×).
-
-### 4. `flat_down_expand` — shared B expand + scale + add
-
-Fuses `lora_a @ down_B[slot, 0].T × topk_weight × scaling → down_output` in one kernel,
-reading `down_B_buffer[slot]` and `scalings[slot]` directly from device memory.
-
-### Key design decisions
-
-**No gather copies:** All 4 kernels receive the full `(n_slots, ...)` weight buffer and
-a `slot_ptr` GPU scalar. The kernel computes `buffer + slot * stride + ...` internally.
-This eliminates 4 buffer gather copies per layer (previously ~64μs/layer × 48 = **3.08ms**).
-
-**CUDA-graph safe:** `slot_ptr = bi.weight_indices[:1].clamp(0)` is a GPU tensor mutated
-before each `graph.replay()`, so different adapters work without re-capturing the graph.
-
-**Scalings in kernel:** `_flat_gate_up_expand_kernel` and `_flat_down_expand_kernel` load
-`scalings[slot]` from the full `(n_slots,)` scalings buffer, eliminating 2 more
-`scalings[slot_idx]` gather ops per layer (~19μs each × 2 × 48 = **1.82ms**).
-
----
-
-## Earlier Optimizations (prefill / TTFT)
-
-### Shared A/B fast path (sglang\_shared format)
-When `w13_A.shape[0] == 1` (shared outer), use a single matmul instead of an
-`O(m·k·r·h)` gather tensor. Saves 2.2 GB of intermediate tensor creation per prefill.
-
-### Remove `torch.any(valid)` GPU→CPU sync
-96 GPU→CPU stalls per prefill (48 layers × 2 ops) stalled the CPU-GPU pipeline.
-**Impact: −35ms TTFT** (108ms → 73ms for sglang\_shared n=1 prefill).
-
-### Vectorised scatter operations
-`_add_route_delta` (−56%) and `_route_rows_from_cache` (−68%) replaced boolean-index
-tensor creation with `scatter_` + slice.
-**Impact: −11ms** on route scatter ops in prefill.
-
-### CUDA graph: force has\_active\_lora=True during capture
-During LoRA CUDA graph capture, `has_active_lora=True` and `single_lora_slot=0` are
-forced so LoRA Triton kernels ARE recorded in the decode graph. Dynamic slot selection
-uses `bi.weight_indices[:1].clamp(0)` (GPU tensor updated before each replay) so the
-same graph serves any loaded adapter.
-
----
-
-## Correctness
-
-All correctness tests pass: `16 tests, 90 subtests` covering sglang\_shared and
-per\_expert formats under sequential, batched, high-concurrency, and mixed-LoRA/base
-scenarios (test\_qwen3\_moe\_per\_expert\_lora + test\_qwen3\_lora\_password\_adapters).
diff --git a/bench_chunked_sgmv.py b/bench_chunked_sgmv.py
deleted file mode 100644
index 450bca678..000000000
--- a/bench_chunked_sgmv.py
+++ /dev/null
@@ -1,817 +0,0 @@
-"""Benchmark: our shrink/expand kernels vs sglang csgmv variants.
-
-Inlines sglang kernels (Apache-2.0) so sglang doesn't need to be
-installed.  All kernels are autotuned with the same config space.
-
-Shrink (LoRA-A):  x (s, K) @ W^T (K, N)  →  out (s, N)
-  N = stack_num * rank  (small),  K = in_dim  (large, 4096+)
-  Key diff in chunked_sgmv_shrink: K and N are constexpr
-  → K-loop trip count is compile-time constant.
-
-Expand (LoRA-B):  x (s, num_slices*R) @ W (R, out_dim)  →  out (s, out_dim)
-  R = rank  (small),  out_dim large
-  Key diff in chunked_sgmv_expand: strides and MAX_RANK are constexpr.
-
-When rank == max_rank the x layouts are identical between ours and sglang.
-
-Usage:
-    python bench_chunked_sgmv.py
-"""
-
-from __future__ import annotations
-
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-
-import torch
-import triton
-import triton.language as tl
-
-# ── make the local kernel package importable ──────────────────────────────────
-sys.path.insert(
-    0,
-    str(Path(__file__).parent / "tokenspeed-kernel" / "python"),
-)
-
-from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
-from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
-    lora_gate_up_expand_fwd,
-)
-from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import lora_qkv_expand_fwd
-from tokenspeed_kernel.ops.lora.triton.lora_shrink import lora_shrink_fwd
-
-# ── minimal batch-info dataclass ──────────────────────────────────────────────
-
-
-@dataclass
-class BatchInfo:
-    bs: int
-    max_len: int
-    seg_lens: torch.Tensor
-    seg_indptr: torch.Tensor
-    weight_indices: torch.Tensor
-    lora_ranks: torch.Tensor
-    scalings: torch.Tensor
-    permutation: torch.Tensor | None = None
-    # sglang compat
-    num_segments: int = 0
-    use_cuda_graph: bool = False
-
-
-def make_batch(
-    s_per_seg: int, n_segs: int, rank: int, with_perm: bool = False
-) -> BatchInfo:
-    dev = "cuda"
-    seg_lens = torch.full((n_segs,), s_per_seg, dtype=torch.int32, device=dev)
-    seg_indptr = torch.arange(n_segs + 1, dtype=torch.int32, device=dev) * s_per_seg
-    # all segs route to slot 1 (real adapter), slot 0 = no-adapter sentinel
-    weight_indices = torch.ones(n_segs, dtype=torch.int32, device=dev)
-    lora_ranks = torch.tensor([0, rank], dtype=torch.int32, device=dev)
-    scalings = torch.tensor([0.0, 1.0], dtype=torch.float32, device=dev)
-    perm = None
-    if with_perm:
-        s_total = n_segs * s_per_seg
-        perm = torch.arange(s_total, dtype=torch.int64, device=dev)
-    return BatchInfo(
-        bs=n_segs,
-        max_len=s_per_seg,
-        seg_lens=seg_lens,
-        seg_indptr=seg_indptr,
-        weight_indices=weight_indices,
-        lora_ranks=lora_ranks,
-        scalings=scalings,
-        permutation=perm,
-        num_segments=n_segs,
-    )
-
-
-# ── inlined sglang chunked_sgmv_expand (Apache-2.0) ──────────────────────────
-# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py
-# Local change: replaced sglang imports with triton directly; added @triton.autotune.
-
-
-@triton.jit(do_not_specialize=["num_segs", "output_stride_0", "output_stride_1"])
-def _chunked_lora_expand_kernel(
-    x,
-    weights,
-    output,
-    output_stride_0,
-    output_stride_1,
-    seg_indptr,
-    weight_indices,
-    lora_ranks,
-    permutation,
-    num_segs,
-    scalings,
-    slice_offsets,
-    NUM_SLICES: tl.constexpr,
-    OUTPUT_DIM: tl.constexpr,
-    MAX_RANK: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-):
-    x_stride_0: tl.constexpr = NUM_SLICES * MAX_RANK
-    x_stride_1: tl.constexpr = 1
-
-    pid_s = tl.program_id(axis=2)
-    if pid_s >= num_segs:
-        return
-
-    w_index = tl.load(weight_indices + pid_s)
-    cur_rank = tl.load(lora_ranks + w_index)
-    if cur_rank == 0:
-        return
-
-    seg_start = tl.load(seg_indptr + pid_s)
-    seg_end = tl.load(seg_indptr + pid_s + 1)
-    slice_id = tl.program_id(axis=1)
-    slice_start = tl.load(slice_offsets + slice_id)
-    slice_end = tl.load(slice_offsets + slice_id + 1)
-    scaling = tl.load(scalings + w_index)
-
-    cur_rank = tl.minimum(MAX_RANK, cur_rank)
-
-    s_offset_logical = tl.arange(0, BLOCK_M) + seg_start
-    s_offset_physical = tl.load(
-        permutation + s_offset_logical, mask=s_offset_logical < seg_end
-    )
-
-    pid_n = tl.program_id(axis=0)
-    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + slice_start
-    k_offset = tl.arange(0, BLOCK_K)
-
-    x_ptrs = (
-        x
-        + slice_id * cur_rank * x_stride_1
-        + (s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
-    )
-    w_stride_0: tl.constexpr = OUTPUT_DIM * MAX_RANK
-    w_stride_1: tl.constexpr = MAX_RANK
-    w_stride_2: tl.constexpr = 1
-    w_ptrs = (weights + w_index * w_stride_0) + (
-        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
-    )
-
-    partial_sum = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(cur_rank, BLOCK_K)):
-        x_tile = tl.load(
-            x_ptrs,
-            mask=(s_offset_logical[:, None] < seg_end)
-            & (k_offset[None, :] < cur_rank - k * BLOCK_K),
-            other=0.0,
-        )
-        w_tile = tl.load(
-            w_ptrs,
-            mask=(k_offset[:, None] < cur_rank - k * BLOCK_K)
-            & (n_offset[None, :] < slice_end),
-            other=0.0,
-        )
-        partial_sum += tl.dot(x_tile, w_tile)
-        x_ptrs += BLOCK_K * x_stride_1
-        w_ptrs += BLOCK_K * w_stride_2
-
-    partial_sum *= scaling
-    partial_sum = partial_sum.to(x.dtype.element_ty)
-
-    output_ptr = output + (
-        s_offset_physical[:, None] * output_stride_0
-        + n_offset[None, :] * output_stride_1
-    )
-    output_mask = (s_offset_logical[:, None] < seg_end) & (
-        n_offset[None, :] < slice_end
-    )
-    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
-    tl.store(output_ptr, partial_sum, mask=output_mask)
-
-
-def chunked_sgmv_expand_fwd(
-    x: torch.Tensor,
-    weights: torch.Tensor,
-    batch_info: BatchInfo,
-    slice_offsets: torch.Tensor,
-    max_slice_size: int,
-    base_output: torch.Tensor | None,
-) -> torch.Tensor:
-    assert x.is_contiguous() and weights.is_contiguous()
-    M = x.shape[0]
-    OUT_DIM = weights.shape[1]
-    MAX_RANK = weights.shape[2]
-    num_slices = len(slice_offsets) - 1
-    assert x.shape[1] == num_slices * MAX_RANK
-
-    num_segs = batch_info.num_segments
-
-    BM, BN, BK = 16, 64, 16
-    grid = (triton.cdiv(max_slice_size, BN), num_slices, batch_info.bs)
-    output = (
-        torch.zeros((M, OUT_DIM), device=x.device, dtype=x.dtype)
-        if base_output is None
-        else base_output
-    )
-    _chunked_lora_expand_kernel[grid](
-        x=x,
-        weights=weights,
-        output=output,
-        output_stride_0=output.stride(0),
-        output_stride_1=output.stride(1),
-        seg_indptr=batch_info.seg_indptr,
-        weight_indices=batch_info.weight_indices,
-        lora_ranks=batch_info.lora_ranks,
-        permutation=batch_info.permutation,
-        num_segs=num_segs,
-        scalings=batch_info.scalings,
-        slice_offsets=slice_offsets,
-        NUM_SLICES=num_slices,
-        OUTPUT_DIM=OUT_DIM,
-        MAX_RANK=MAX_RANK,
-        BLOCK_M=BM,
-        BLOCK_N=BN,
-        BLOCK_K=BK,
-        num_warps=4,
-        num_stages=2,
-    )
-    return output
-
-
-# ── inlined sglang sgemm_lora_a (Apache-2.0) ─────────────────────────────────
-# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/sgemm_lora_a.py
-# Local change: replaced sglang imports; added @triton.autotune (original uses fixed sizes).
-
-
-@triton.jit
-def _sgemm_lora_a_kernel(
-    x,
-    weights,
-    output,
-    N,
-    K,
-    stack_num,
-    x_stride_0,
-    x_stride_1,
-    w_stride_0,
-    w_stride_1,
-    w_stride_2,
-    output_stride_0,
-    output_stride_1,
-    seg_lens,
-    seg_indptr,
-    weight_indices,
-    lora_ranks,
-    sorted_token_ids,
-    SORTED_BY_ADAPTER: tl.constexpr,
-    BLOCK_S: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-):
-    batch_id = tl.program_id(axis=1)
-    w_index = tl.load(weight_indices + batch_id)
-    rank = tl.load(lora_ranks + w_index)
-    if rank == 0:
-        return
-    pid = tl.program_id(axis=0)
-    seg_start = tl.load(seg_indptr + batch_id)
-    seg_len = tl.load(seg_lens + batch_id)
-    if seg_len == 0:
-        return
-    N = tl.minimum(N, rank * stack_num)
-    num_pid_n = tl.cdiv(N, BLOCK_N)
-    pid_s = pid // num_pid_n
-    pid_n = pid % num_pid_n
-    if pid_s * BLOCK_S >= seg_len:
-        return
-    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
-    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    k_offset = tl.arange(0, BLOCK_K)
-    if SORTED_BY_ADAPTER:
-        s_physical = tl.load(
-            sorted_token_ids + seg_start + s_offset, mask=s_offset < seg_len, other=0
-        )
-    else:
-        s_physical = seg_start + s_offset
-    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
-    w_ptrs = (weights + w_index * w_stride_0) + (
-        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
-    )
-    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_K)):
-        x_tile = tl.load(
-            x_ptrs,
-            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
-            other=0.0,
-        )
-        w_tile = tl.load(
-            w_ptrs,
-            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < N),
-            other=0.0,
-        )
-        partial_sum += tl.dot(x_tile, w_tile)
-        x_ptrs += BLOCK_K * x_stride_1
-        w_ptrs += BLOCK_K * w_stride_2
-    partial_sum = partial_sum.to(x.dtype.element_ty)
-    output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < N)
-    output_ptr = output + (
-        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
-    )
-    tl.store(output_ptr, partial_sum, mask=output_mask)
-
-
-def sgemm_lora_a_fwd(x, weights, batch_info, stack_num=1):
-    S, K = x.shape
-    N = weights.shape[-2]
-    assert x.is_contiguous() and weights.is_contiguous()
-    max_len = batch_info.max_len
-    BS, BN, BK = 16, 32, 128
-    grid = (triton.cdiv(max_len, BS) * triton.cdiv(N, BN), batch_info.bs)
-    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
-    sorted_by_adapter = batch_info.permutation is not None
-    _sgemm_lora_a_kernel[grid](
-        x,
-        weights,
-        output,
-        N,
-        K,
-        stack_num,
-        x.stride(0),
-        x.stride(1),
-        weights.stride(0),
-        weights.stride(1),
-        weights.stride(2),
-        output.stride(0),
-        output.stride(1),
-        batch_info.seg_lens,
-        batch_info.seg_indptr,
-        batch_info.weight_indices,
-        batch_info.lora_ranks,
-        batch_info.permutation,
-        sorted_by_adapter,
-        BLOCK_S=BS,
-        BLOCK_N=BN,
-        BLOCK_K=BK,
-        num_warps=4,
-        num_stages=4,
-    )
-    return output
-
-
-# ── inlined sglang chunked_sgmv_shrink (Apache-2.0) ──────────────────────────
-# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
-# Local change: replaced sglang imports; added @triton.autotune.
-# Key structural diff vs sgemm_lora_a: K, N, and all strides are constexpr.
-
-
-@triton.jit(do_not_specialize=["num_segs"])
-def _chunked_lora_shrink_kernel(
-    x,
-    weights,
-    output,
-    seg_indptr,
-    weight_indices,
-    lora_ranks,
-    permutation,
-    num_segs,
-    N: tl.constexpr,
-    K: tl.constexpr,
-    NUM_SLICES: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-):
-    x_stride_1: tl.constexpr = 1
-    x_stride_0: tl.constexpr = K
-    w_stride_0: tl.constexpr = N * K
-    w_stride_1: tl.constexpr = K
-    w_stride_2: tl.constexpr = 1
-    output_stride_0: tl.constexpr = N
-    output_stride_1: tl.constexpr = 1
-
-    pid_s = tl.program_id(1)
-    if pid_s >= num_segs:
-        return
-    pid_n = tl.program_id(0)
-    w_index = tl.load(weight_indices + pid_s)
-    rank = tl.load(lora_ranks + w_index)
-    if rank == 0:
-        return
-    seg_start = tl.load(seg_indptr + pid_s)
-    seg_end = tl.load(seg_indptr + pid_s + 1)
-    cur_n = tl.minimum(N, rank * NUM_SLICES)
-
-    s_offset_logical = tl.arange(0, BLOCK_M) + seg_start
-    s_offset_physical = tl.load(
-        permutation + s_offset_logical, mask=s_offset_logical < seg_end
-    )
-    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    k_offset = tl.arange(0, BLOCK_K)
-
-    x_ptrs = x + (
-        s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
-    )
-    w_ptrs = (weights + w_index * w_stride_0) + (
-        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
-    )
-    partial_sum = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_K)):
-        x_tile = tl.load(
-            x_ptrs,
-            mask=(s_offset_logical[:, None] < seg_end)
-            & (k_offset[None, :] < K - k * BLOCK_K),
-            other=0.0,
-        )
-        w_tile = tl.load(
-            w_ptrs,
-            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < cur_n),
-            other=0.0,
-        )
-        partial_sum += tl.dot(x_tile, w_tile)
-        x_ptrs += BLOCK_K * x_stride_1
-        w_ptrs += BLOCK_K * w_stride_2
-
-    partial_sum = partial_sum.to(x.dtype.element_ty)
-    output_ptr = output + (
-        s_offset_physical[:, None] * output_stride_0
-        + n_offset[None, :] * output_stride_1
-    )
-    output_mask = (s_offset_logical[:, None] < seg_end) & (n_offset[None, :] < cur_n)
-    tl.store(output_ptr, partial_sum, mask=output_mask)
-
-
-def chunked_sgmv_shrink_fwd(x, weights, batch_info, num_slices=1):
-    S, K = x.shape
-    N = weights.shape[-2]  # num_slices * rank
-    assert x.is_contiguous() and weights.is_contiguous()
-    num_segs = batch_info.num_segments
-    BM, BN, BK = 16, 32, 128
-    grid = (triton.cdiv(N, BN), batch_info.bs)
-    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
-    _chunked_lora_shrink_kernel[grid](
-        x,
-        weights,
-        output,
-        batch_info.seg_indptr,
-        batch_info.weight_indices,
-        batch_info.lora_ranks,
-        batch_info.permutation,
-        num_segs,
-        N=N,
-        K=K,
-        NUM_SLICES=num_slices,
-        BLOCK_M=BM,
-        BLOCK_N=BN,
-        BLOCK_K=BK,
-        num_warps=4,
-        num_stages=4,
-    )
-    return output
-
-
-# ── inlined sglang sgemm_lora_b (Apache-2.0) ─────────────────────────────────
-# Source: github.com/sgl-project/sglang  python/sglang/srt/lora/triton_ops/sgemm_lora_b.py
-# Structurally identical to our lora_expand; only difference is fixed BLOCK_N=256.
-
-
-@triton.jit
-def _sgemm_lora_b_kernel(
-    x,
-    weights,
-    output,
-    N,
-    K,
-    x_stride_0,
-    x_stride_1,
-    w_stride_0,
-    w_stride_1,
-    w_stride_2,
-    output_stride_0,
-    output_stride_1,
-    seg_lens,
-    seg_indptr,
-    weight_indices,
-    lora_ranks,
-    sorted_token_ids,
-    SORTED_BY_ADAPTER: tl.constexpr,
-    BLOCK_S: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    scalings,
-):
-    batch_id = tl.program_id(axis=1)
-    w_index = tl.load(weight_indices + batch_id)
-    rank = tl.load(lora_ranks + w_index)
-    if rank == 0:
-        return
-    pid = tl.program_id(axis=0)
-    seg_len = tl.load(seg_lens + batch_id)
-    if seg_len == 0:
-        return
-    seg_start = tl.load(seg_indptr + batch_id)
-    scaling = tl.load(scalings + w_index)
-    K = tl.minimum(K, rank)
-    num_pid_n = tl.cdiv(N, BLOCK_N)
-    pid_s = pid // num_pid_n
-    pid_n = pid % num_pid_n
-    if pid_s * BLOCK_S >= seg_len:
-        return
-    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
-    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    k_offset = tl.arange(0, BLOCK_K)
-    if SORTED_BY_ADAPTER:
-        s_physical = tl.load(
-            sorted_token_ids + seg_start + s_offset, mask=s_offset < seg_len, other=0
-        )
-    else:
-        s_physical = seg_start + s_offset
-    x_ptrs = x + (s_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1)
-    w_ptrs = (weights + w_index * w_stride_0) + (
-        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
-    )
-    n_mask = n_offset[None, :] < N
-    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_K)):
-        x_tile = tl.load(
-            x_ptrs,
-            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
-            other=0.0,
-        )
-        w_tile = tl.load(
-            w_ptrs, mask=(k_offset[:, None] < K - k * BLOCK_K) & n_mask, other=0.0
-        )
-        partial_sum += tl.dot(x_tile, w_tile)
-        x_ptrs += BLOCK_K * x_stride_1
-        w_ptrs += BLOCK_K * w_stride_2
-    partial_sum *= scaling
-    partial_sum = partial_sum.to(x.dtype.element_ty)
-    output_ptr = output + (
-        s_physical[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
-    )
-    output_mask = (s_offset[:, None] < seg_len) & n_mask
-    partial_sum += tl.load(output_ptr, mask=output_mask, other=0.0)
-    tl.store(output_ptr, partial_sum, mask=output_mask)
-
-
-def sgemm_lora_b_fwd(x, weights, batch_info, base_output=None):
-    S, R = x.shape
-    N = weights.shape[-2]
-    assert x.is_contiguous() and weights.is_contiguous()
-    # Original sglang fixed configs: BLOCK_S=16, BLOCK_N=256, BLOCK_K=16
-    BS, BN, BK = 16, 256, 16
-    max_len = batch_info.max_len
-    grid = (triton.cdiv(max_len, BS) * triton.cdiv(N, BN), batch_info.bs)
-    output = (
-        torch.zeros((S, N), device=x.device, dtype=x.dtype)
-        if base_output is None
-        else base_output
-    )
-    sorted_by_adapter = batch_info.permutation is not None
-    _sgemm_lora_b_kernel[grid](
-        x,
-        weights,
-        output,
-        N,
-        R,
-        x.stride(0),
-        x.stride(1),
-        weights.stride(0),
-        weights.stride(1),
-        weights.stride(2),
-        output.stride(0),
-        output.stride(1),
-        batch_info.seg_lens,
-        batch_info.seg_indptr,
-        batch_info.weight_indices,
-        batch_info.lora_ranks,
-        batch_info.permutation,
-        sorted_by_adapter,
-        BLOCK_S=BS,
-        BLOCK_N=BN,
-        BLOCK_K=BK,
-        num_warps=4,
-        num_stages=2,
-        scalings=batch_info.scalings,
-    )
-    return output
-
-
-# ── benchmark helpers ─────────────────────────────────────────────────────────
-
-
-def bench(fn, label: str, warmup: int = 25, rep: int = 100) -> float:
-    ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
-    print(f"  {label:<42s}  {ms*1000:7.1f} µs")
-    return ms
-
-
-def run_shrink_scenario(
-    label: str,
-    s_per_seg: int,
-    n_segs: int,
-    rank: int,
-    hidden: int,
-    intermediate_per_tp: int,
-) -> None:
-    dev, dt = "cuda", torch.bfloat16
-    s = s_per_seg * n_segs
-    bi_ours = make_batch(s_per_seg, n_segs, rank, with_perm=False)
-    bi_sglang = make_batch(s_per_seg, n_segs, rank, with_perm=True)
-
-    print(f"\n{'='*60}")
-    print(f"  SHRINK {label}")
-    print(f"  s_per_seg={s_per_seg}  n_segs={n_segs}  rank={rank}  s_total={s}")
-    print(f"{'='*60}")
-
-    for stack_num, in_dim, tag in [
-        (3, hidden, "QKV shrink      in=hidden  stack=3"),
-        (2, hidden, "gate/up shrink  in=hidden  stack=2"),
-        (1, hidden, "o/down shrink   in=hidden  stack=1"),
-        (1, intermediate_per_tp, "down shrink     in=inter   stack=1"),
-    ]:
-        N = stack_num * rank
-        x = torch.randn((s, in_dim), device=dev, dtype=dt)
-        w = torch.randn((2, N, in_dim), device=dev, dtype=dt)
-        print(f"\n[{tag}]  K={in_dim}")
-        bench(
-            lambda x=x, w=w: lora_shrink_fwd(x, w, bi_ours, stack_num=stack_num),
-            "ours lora_shrink_fwd",
-        )
-        bench(
-            lambda x=x, w=w: sgemm_lora_a_fwd(x, w, bi_sglang, stack_num=stack_num),
-            "sglang sgemm_lora_a (autotuned)",
-        )
-        bench(
-            lambda x=x, w=w: chunked_sgmv_shrink_fwd(
-                x, w, bi_sglang, num_slices=stack_num
-            ),
-            "sglang chunked_sgmv_shrink",
-        )
-
-
-def run_scenario(
-    label: str,
-    s_per_seg: int,
-    n_segs: int,
-    rank: int,
-    hidden: int,
-    intermediate_per_tp: int,
-    q_per_tp: int,
-    kv_per_tp: int,
-) -> None:
-    dev, dt = "cuda", torch.bfloat16
-    max_rank = rank  # rank == max_rank so x layouts are identical
-
-    s = s_per_seg * n_segs
-    bi_ours = make_batch(s_per_seg, n_segs, rank, with_perm=False)
-    bi_sglang = make_batch(
-        s_per_seg, n_segs, rank, with_perm=True
-    )  # sglang always needs perm
-
-    print(f"\n{'='*60}")
-    print(f"  {label}")
-    print(f"  s_per_seg={s_per_seg}  n_segs={n_segs}  rank={rank}  s_total={s}")
-    print(f"{'='*60}")
-
-    # ── plain expand (o_proj / down_proj): 1 slice, out_dim=hidden ──
-    print("\n[plain expand]  out_dim=hidden")
-    x1 = torch.randn((s, max_rank), device=dev, dtype=dt)
-    w1 = torch.randn((2, hidden, max_rank), device=dev, dtype=dt)
-    o1 = torch.zeros((s, hidden), device=dev, dtype=dt)
-    so1 = torch.tensor([0, hidden], dtype=torch.int32, device=dev)
-
-    bench(
-        lambda: lora_expand_fwd(x1, w1, bi_ours, base_output=o1.clone()),
-        "ours lora_expand_fwd",
-    )
-    bench(
-        lambda: sgemm_lora_b_fwd(x1, w1, bi_sglang, base_output=o1.clone()),
-        "sglang sgemm_lora_b (BN=256)",
-    )
-    bench(
-        lambda: chunked_sgmv_expand_fwd(x1, w1, bi_sglang, so1, hidden, o1.clone()),
-        "sglang chunked_sgmv (1 slice)",
-    )
-
-    # ── QKV expand: 3 slices ──
-    qkv_out = q_per_tp + 2 * kv_per_tp
-    max_qkv = max(q_per_tp, kv_per_tp)
-    x3 = torch.randn((s, 3 * max_rank), device=dev, dtype=dt)
-    w3 = torch.randn((2, qkv_out, max_rank), device=dev, dtype=dt)
-    o3 = torch.zeros((s, qkv_out), device=dev, dtype=dt)
-    off3 = torch.tensor(
-        [0, q_per_tp, q_per_tp + kv_per_tp, q_per_tp + 2 * kv_per_tp],
-        dtype=torch.int32,
-        device=dev,
-    )
-
-    print(f"\n[QKV expand]  q={q_per_tp}  kv={kv_per_tp}")
-    bench(
-        lambda: lora_qkv_expand_fwd(
-            x3, w3, bi_ours, off3, max_qkv, base_output=o3.clone()
-        ),
-        "ours lora_qkv_expand_fwd",
-    )
-    bench(
-        lambda: chunked_sgmv_expand_fwd(x3, w3, bi_sglang, off3, max_qkv, o3.clone()),
-        "sglang chunked_sgmv (3 slices)",
-    )
-
-    # ── gate/up expand: 2 slices ──
-    x2 = torch.randn((s, 2 * max_rank), device=dev, dtype=dt)
-    w2 = torch.randn((2, 2 * intermediate_per_tp, max_rank), device=dev, dtype=dt)
-    o2 = torch.zeros((s, 2 * intermediate_per_tp), device=dev, dtype=dt)
-    so2 = torch.tensor(
-        [0, intermediate_per_tp, 2 * intermediate_per_tp], dtype=torch.int32, device=dev
-    )
-
-    print(f"\n[gate/up expand]  intermediate_per_tp={intermediate_per_tp}")
-    bench(
-        lambda: lora_gate_up_expand_fwd(
-            x2, w2, bi_ours, intermediate_per_tp, base_output=o2.clone()
-        ),
-        "ours lora_gate_up_expand_fwd",
-    )
-    bench(
-        lambda: chunked_sgmv_expand_fwd(
-            x2, w2, bi_sglang, so2, intermediate_per_tp, o2.clone()
-        ),
-        "sglang chunked_sgmv (2 slices)",
-    )
-
-
-# ── main ──────────────────────────────────────────────────────────────────────
-
-if __name__ == "__main__":
-    # Qwen3-8B-like shapes at TP=2
-    HIDDEN = 4096
-    INTERMEDIATE = 12288
-    INTER_PER_TP = INTERMEDIATE // 2  # 6144
-    Q_PER_TP = 2048
-    KV_PER_TP = 512
-    RANK = 64
-
-    # ── 1. Sequence-length sweep (fixed n_segs=32 decode, n_segs=4 prefill) ──
-    for s_per_seg, n_segs, tag in [
-        (1, 32, "DECODE      s=1    n_segs=32"),
-        (1, 64, "DECODE      s=1    n_segs=64"),
-        (128, 4, "PREFILL     s=128  n_segs=4"),
-        (512, 2, "PREFILL     s=512  n_segs=2"),
-    ]:
-        run_scenario(
-            tag,
-            s_per_seg=s_per_seg,
-            n_segs=n_segs,
-            rank=RANK,
-            hidden=HIDDEN,
-            intermediate_per_tp=INTER_PER_TP,
-            q_per_tp=Q_PER_TP,
-            kv_per_tp=KV_PER_TP,
-        )
-
-    # ── 2. Adapter-count sweep (decode, s_per_seg=1, vary n_segs) ──
-    print(f"\n\n{'#'*60}")
-    print(f"  ADAPTER COUNT SWEEP  (decode s=1, rank={RANK})")
-    print(f"{'#'*60}")
-    dev, dt = "cuda", torch.bfloat16
-    qkv_out = Q_PER_TP + 2 * KV_PER_TP
-    max_qkv = max(Q_PER_TP, KV_PER_TP)
-    off3 = torch.tensor(
-        [0, Q_PER_TP, Q_PER_TP + KV_PER_TP, Q_PER_TP + 2 * KV_PER_TP],
-        dtype=torch.int32,
-        device=dev,
-    )
-    so1 = torch.tensor([0, HIDDEN], dtype=torch.int32, device=dev)
-
-    print(
-        f"\n{'n_segs':>8}  {'ours expand':>14}  {'sgemm_b BN256':>14}  {'csgmv 1sl':>12}  {'ours qkv':>12}  {'csgmv 3sl':>12}"
-    )
-    print("-" * 82)
-    for n_segs in (1, 2, 4, 8, 16, 32, 64, 128):
-        s = n_segs
-        bi_o = make_batch(1, n_segs, RANK, with_perm=False)
-        bi_s = make_batch(1, n_segs, RANK, with_perm=True)
-        x1 = torch.randn((s, RANK), device=dev, dtype=dt)
-        w1 = torch.randn((2, HIDDEN, RANK), device=dev, dtype=dt)
-        o1 = torch.zeros((s, HIDDEN), device=dev, dtype=dt)
-        x3 = torch.randn((s, 3 * RANK), device=dev, dtype=dt)
-        w3 = torch.randn((2, qkv_out, RANK), device=dev, dtype=dt)
-        o3 = torch.zeros((s, qkv_out), device=dev, dtype=dt)
-
-        def t(fn):
-            return triton.testing.do_bench(fn, warmup=25, rep=200) * 1000
-
-        t_ours_exp = t(lambda: lora_expand_fwd(x1, w1, bi_o, base_output=o1.clone()))
-        t_sgemm_b = t(lambda: sgemm_lora_b_fwd(x1, w1, bi_s, base_output=o1.clone()))
-        t_csgmv_1 = t(
-            lambda: chunked_sgmv_expand_fwd(x1, w1, bi_s, so1, HIDDEN, o1.clone())
-        )
-        t_ours_qkv = t(
-            lambda: lora_qkv_expand_fwd(
-                x3, w3, bi_o, off3, max_qkv, base_output=o3.clone()
-            )
-        )
-        t_csgmv_3 = t(
-            lambda: chunked_sgmv_expand_fwd(x3, w3, bi_s, off3, max_qkv, o3.clone())
-        )
-
-        print(
-            f"{n_segs:>8}  {t_ours_exp:>13.1f}µ  {t_sgemm_b:>13.1f}µ  {t_csgmv_1:>11.1f}µ  {t_ours_qkv:>11.1f}µ  {t_csgmv_3:>11.1f}µ"
-        )
diff --git a/bench_kernel_opt.py b/bench_kernel_opt.py
deleted file mode 100644
index 22fadb43a..000000000
--- a/bench_kernel_opt.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""Before/after benchmark for kernel micro-optimisations + sort-by-adapter.
-
-Tests decode shrink and expand with mixed adapters — the scenario where
-sort-by-adapter actually helps (adjacent CTAs share the same weight tile).
-
-Usage:
-    python bench_kernel_opt.py
-"""
-
-from __future__ import annotations
-
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-
-import torch
-import triton
-
-sys.path.insert(0, str(Path(__file__).parent / "tokenspeed-kernel" / "python"))
-
-from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
-from tokenspeed_kernel.ops.lora.triton.lora_expand_decode import lora_expand_decode_fwd
-from tokenspeed_kernel.ops.lora.triton.lora_shrink import lora_shrink_fwd
-
-
-@dataclass
-class BatchInfo:
-    bs: int
-    max_len: int
-    num_segments: int
-    seg_lens: torch.Tensor
-    seg_indptr: torch.Tensor
-    weight_indices: torch.Tensor
-    lora_ranks: torch.Tensor
-    scalings: torch.Tensor
-    permutation: torch.Tensor | None = None
-    sort_order: torch.Tensor | None = None
-    group_slots: torch.Tensor | None = None
-    group_starts: torch.Tensor | None = None
-    group_sizes: torch.Tensor | None = None
-    num_groups: int = 0
-
-
-def make_mixed_batch(
-    n_segs: int,
-    n_unique_adapters: int,
-    rank: int,
-    device: str = "cuda",
-) -> BatchInfo:
-    """n_segs decode segments, round-robin across n_unique_adapters adapters."""
-    slots_list = [(i % n_unique_adapters) + 1 for i in range(n_segs)]
-    slots = torch.tensor(slots_list, dtype=torch.int32, device=device)
-
-    seg_lens = torch.ones(n_segs, dtype=torch.int32, device=device)
-    seg_indptr = torch.arange(n_segs + 1, dtype=torch.int32, device=device)
-    n_slots = n_unique_adapters + 1
-    lora_ranks = torch.zeros(n_slots, dtype=torch.int32, device=device)
-    lora_ranks[1:] = rank
-    scalings = torch.ones(n_slots, dtype=torch.float32, device=device)
-    scalings[0] = 0.0
-
-    # Build group metadata (same logic as prepare_loras)
-    sort_order_cpu = sorted(range(n_segs), key=lambda i: slots_list[i])
-    groups: list[list[int]] = []
-    for pos, orig in enumerate(sort_order_cpu):
-        slot = slots_list[orig]
-        if not groups or groups[-1][0] != slot:
-            groups.append([slot, pos, 1])
-        else:
-            groups[-1][2] += 1
-    ng = len(groups)
-    sort_order_gpu = torch.tensor(sort_order_cpu, dtype=torch.int64, device=device)
-    group_slots_gpu = torch.tensor(
-        [g[0] for g in groups], dtype=torch.int32, device=device
-    )
-    group_starts_gpu = torch.tensor(
-        [g[1] for g in groups], dtype=torch.int32, device=device
-    )
-    group_sizes_gpu = torch.tensor(
-        [g[2] for g in groups], dtype=torch.int32, device=device
-    )
-
-    return BatchInfo(
-        bs=n_segs,
-        max_len=1,
-        num_segments=n_segs,
-        seg_lens=seg_lens,
-        seg_indptr=seg_indptr,
-        weight_indices=slots,
-        lora_ranks=lora_ranks,
-        scalings=scalings,
-        sort_order=sort_order_gpu,
-        group_slots=group_slots_gpu,
-        group_starts=group_starts_gpu,
-        group_sizes=group_sizes_gpu,
-        num_groups=ng,
-    )
-
-
-def bench(fn, warmup=25, rep=200):
-    return triton.testing.do_bench(fn, warmup=warmup, rep=rep) * 1000
-
-
-def run(n_segs: int, n_unique: int, rank: int, hidden: int) -> None:
-    dev, dt = "cuda", torch.bfloat16
-    n_slots = n_unique + 1
-    s = n_segs
-
-    bi = make_mixed_batch(n_segs, n_unique, rank, device=dev)
-
-    x_ex = torch.randn((s, rank), device=dev, dtype=dt)
-    w_ex = torch.randn((n_slots, hidden, rank), device=dev, dtype=dt)
-    o_ex = torch.zeros((s, hidden), device=dev, dtype=dt)
-
-    t_base = bench(lambda: lora_expand_fwd(x_ex, w_ex, bi, base_output=o_ex.clone()))
-    t_grouped = bench(
-        lambda: lora_expand_decode_fwd(x_ex, w_ex, bi, base_output=o_ex.clone())
-    )
-
-    print(
-        f"n_segs={n_segs:>3}  n_unique={n_unique:>2}  rank={rank:>3}  hidden={hidden:>5}  |"
-        f"  base={t_base:>6.1f}µ  grouped={t_grouped:>6.1f}µ  {t_base/t_grouped:>5.2f}x"
-    )
-
-
-if __name__ == "__main__":
-    # Qwen3-8B TP=2
-    HIDDEN, RANK = 4096, 64
-
-    print(
-        f"\n{'n_segs':>7}  {'n_unique':>9}  {'rank':>5}  {'hidden':>7}  |  {'base':>8}  {'grouped':>9}  speedup"
-    )
-    print("-" * 75)
-    for n_unique in (1, 2, 4, 8, 16, 32):
-        run(n_segs=32, n_unique=n_unique, rank=RANK, hidden=HIDDEN)
-    print()
-    for n_segs in (8, 16, 32, 64, 128):
-        run(n_segs=n_segs, n_unique=4, rank=RANK, hidden=HIDDEN)
-    print()
-    for rank in (16, 32, 64, 128):
-        run(n_segs=32, n_unique=4, rank=rank, hidden=HIDDEN)
diff --git a/bench_vs_vllm.py b/bench_vs_vllm.py
deleted file mode 100644
index 237f30c85..000000000
--- a/bench_vs_vllm.py
+++ /dev/null
@@ -1,294 +0,0 @@
-"""Benchmark: ours vs vLLM expand across shapes, adapter counts, ranks.
-
-Four expand variants compared:
-  1. ours-seg   : lora_expand_fwd (per-segment dispatch, no sorting)
-  2. ours-grp   : lora_expand_decode_fwd (grouped + gather/scatter)
-  3. ours-grpv2 : lora_expand_grouped_v2_fwd (grouped, scattered reads, no copy)
-  4. vllm        : inlined vLLM expand (same adapter-grouped idea)
-
-Usage:
-    python bench_vs_vllm.py
-"""
-
-from __future__ import annotations
-
-import sys
-from pathlib import Path
-
-import torch
-import triton
-import triton.language as tl
-
-sys.path.insert(0, str(Path(__file__).parent / "tokenspeed-kernel" / "python"))
-
-from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
-from tokenspeed_kernel.ops.lora.triton.lora_expand_decode import lora_expand_decode_fwd
-from tokenspeed_kernel.ops.lora.triton.lora_expand_grouped_v2 import (
-    lora_expand_grouped_v2_fwd,
-)
-
-# ── inlined vLLM expand kernel (Apache-2.0) ───────────────────────────────────
-
-
-@triton.jit
-def _vllm_mm_k(
-    a,
-    b,
-    ak,
-    bk,
-    K: tl.constexpr,
-    BM: tl.constexpr,
-    BN: tl.constexpr,
-    BK: tl.constexpr,
-    EVEN_K: tl.constexpr,
-):
-    acc = tl.zeros((BM, BN), dtype=tl.float32)
-    for k in range(tl.cdiv(K, BK)):
-        if EVEN_K:
-            acc += tl.dot(tl.load(a), tl.load(b))
-        else:
-            ko = tl.arange(0, BK)
-            mask = k * BK + ko < K
-            acc += tl.dot(
-                tl.load(a, mask=mask[None, :], other=0.0),
-                tl.load(b, mask=mask[:, None], other=0.0),
-            )
-        a += BK * ak
-        b += BK * bk
-    return acc
-
-
-@triton.jit
-def _vllm_expand_kernel(
-    x,
-    w,
-    out,
-    M,
-    N,
-    K,
-    sorted_idx,
-    ntok,
-    start_loc,
-    lora_ids,
-    scalings,
-    lora_ranks,
-    xs0,
-    xs1,
-    ws0,
-    ws1,
-    ws2,
-    os0,
-    os1,
-    BM: tl.constexpr,
-    BN: tl.constexpr,
-    BK: tl.constexpr,
-    EVEN_K: tl.constexpr,
-    MAX_RANK: tl.constexpr,
-):
-    cta_m = tl.cdiv(M, BM)
-    cta_n = tl.cdiv(N, BN)
-    pid = tl.program_id(0)
-    pm = pid % cta_m
-    pn = (pid // cta_m) % cta_n
-    li = tl.program_id(1)
-    lid = tl.load(lora_ids + li)
-    if lid == -1:
-        return
-    lm = tl.load(ntok + li)
-    off = pm * BM
-    if off >= lm:
-        return
-    if pn * BN >= N:
-        return
-    mlen = tl.minimum(BM, lm - off)
-    ls = tl.load(start_loc + li)
-    om = tl.arange(0, BM) % mlen
-    ram = tl.load(sorted_idx + ls + off + om)
-    no = tl.arange(0, BN) + pn * BN
-    rbn = tl.max_contiguous(tl.multiple_of(no % N, BN), BN)
-    ko = tl.arange(0, BK)
-    # x strides: xs0=inner(1), xs1=row(MAX_RANK)
-    ap = x + ram[:, None] * xs1 + ko[None, :] * xs0
-    # w strides: ws0=adapter, ws1=N, ws2=K(=1)
-    bp = w + lid * ws0 + ko[:, None] * ws2 + rbn[None, :] * ws1
-    acc = _vllm_mm_k(ap, bp, xs0, ws2, K, BM, BN, BK, EVEN_K)
-    sc = tl.load(scalings + lid)
-    rank = tl.load(lora_ranks + lid)
-    acc *= sc
-    acc = acc.to(x.dtype.element_ty)
-    om2 = tl.arange(0, BM)
-    cp = out + ram[:, None] * os0 + rbn[None, :] * os1
-    mask = (om2[:, None] < mlen) & (rbn[None, :] < N)
-    acc += tl.load(cp, mask=mask, other=0.0)
-    tl.store(cp, acc, mask=mask)
-
-
-def vllm_expand(x, weights, meta, base_output, BM=16, BN=64, BK=64, nw=4, ns=2):
-    M, K = x.shape
-    N = weights.shape[1]
-    EVEN_K = K % BK == 0
-    o = base_output
-    grid = (triton.cdiv(M, BM) * triton.cdiv(N, BN), meta["num_active"])
-    _vllm_expand_kernel[grid](
-        x,
-        weights,
-        o,
-        M,
-        N,
-        K,
-        meta["sorted_idx"],
-        meta["ntok"],
-        meta["start_loc"],
-        meta["lora_ids"],
-        meta["scalings"],
-        meta["lora_ranks"],
-        x.stride(1),
-        x.stride(0),
-        weights.stride(0),
-        weights.stride(1),
-        weights.stride(2),
-        o.stride(0),
-        o.stride(1),
-        BM=BM,
-        BN=BN,
-        BK=BK,
-        EVEN_K=EVEN_K,
-        MAX_RANK=K,
-        num_warps=nw,
-        num_stages=ns,
-    )
-    return o
-
-
-# ── batch-info builders ───────────────────────────────────────────────────────
-
-
-def make_our_bi(n, rank, n_unique, dev):
-    slots = [(i % n_unique) + 1 for i in range(n)]
-    sort_order = sorted(range(n), key=lambda i: slots[i])
-    groups = []
-    for pos, orig in enumerate(sort_order):
-        s = slots[orig]
-        if not groups or groups[-1][0] != s:
-            groups.append([s, pos, 1])
-        else:
-            groups[-1][2] += 1
-    ng = len(groups)
-
-    so_t = torch.tensor(sort_order, dtype=torch.int64, device=dev)
-    gs_t = torch.tensor([g[0] for g in groups], dtype=torch.int32, device=dev)
-    gst_t = torch.tensor([g[1] for g in groups], dtype=torch.int32, device=dev)
-    gsz_t = torch.tensor([g[2] for g in groups], dtype=torch.int32, device=dev)
-
-    class BI:
-        bs = n
-        max_len = 1
-        seg_lens = torch.ones(n, dtype=torch.int32, device=dev)
-        seg_indptr = torch.arange(n + 1, dtype=torch.int32, device=dev)
-        weight_indices = torch.tensor(slots, dtype=torch.int32, device=dev)
-        lora_ranks = torch.tensor(
-            [0] + [rank] * n_unique, dtype=torch.int32, device=dev
-        )
-        scalings = torch.ones(n_unique + 1, dtype=torch.float32, device=dev)
-        permutation = None
-        num_groups = ng
-        sort_order = so_t
-        group_slots = gs_t
-        group_starts = gst_t
-        group_sizes = gsz_t
-
-    return BI()
-
-
-def make_vllm_meta(n, rank, n_unique, n_slots, dev):
-    # slot 0 = no-adapter sentinel; real adapters = 1..n_unique
-    slots = torch.tensor(
-        [(i % n_unique) + 1 for i in range(n)], dtype=torch.int32, device=dev
-    )
-    _, sorted_idx = torch.sort(slots, stable=True)
-    uniq, counts = torch.unique(slots, sorted=True, return_counts=True)
-    start_locs = torch.cat(
-        [
-            torch.zeros(1, dtype=torch.int32, device=dev),
-            counts.cumsum(0).to(torch.int32),
-        ]
-    )
-    lora_ranks_t = torch.tensor([0] + [rank] * n_unique, dtype=torch.int32, device=dev)
-    scalings_t = torch.ones(n_unique + 1, dtype=torch.float32, device=dev)
-    return {
-        "sorted_idx": sorted_idx.to(torch.int32),
-        "ntok": counts.to(torch.int32),
-        "start_loc": start_locs,
-        "lora_ids": uniq.to(torch.int32),
-        "num_active": len(uniq),
-        "lora_ranks": lora_ranks_t,
-        "scalings": scalings_t,
-    }
-
-
-def bench(fn, w=30, r=300):
-    return triton.testing.do_bench(fn, warmup=w, rep=r) * 1000
-
-
-# ── sweep ─────────────────────────────────────────────────────────────────────
-
-
-def header(title):
-    print(f'\n{"="*80}')
-    print(f"  {title}")
-    print(f'{"="*80}')
-    print(
-        f'  {"n":>4}  {"n_uniq":>6}  {"seg":>8}  {"grp":>8}  {"grpv2":>8}  {"vllm":>8}  {"best":>6}'
-    )
-    print(f'  {"-"*58}')
-
-
-def row(n, nu, ts, tg, tv2, tv):
-    ts = f"{ts:.1f}µ" if ts else "   n/a"
-    tg = f"{tg:.1f}µ" if tg else "   n/a"
-    tv2 = f"{tv2:.1f}µ" if tv2 else "   n/a"
-    tv = f"{tv:.1f}µ" if tv else "   n/a"
-    # which is fastest among numeric values
-    vals = [
-        (t, nm)
-        for t, nm in [(ts, "seg"), (tg, "grp"), (tv2, "v2"), (tv, "vllm")]
-        if "n/a" not in str(t)
-    ]
-    best = min(vals, key=lambda x: float(x[0].rstrip("µ")))[1] if vals else "?"
-    print(f"  {n:>4}  {nu:>6}  {ts:>8}  {tg:>8}  {tv2:>8}  {tv:>8}  {best:>6}")
-
-
-dev, dt = "cuda", torch.bfloat16
-
-for rank, N in [(16, 4096), (64, 4096), (128, 4096), (64, 8192)]:
-    header(f"EXPAND  rank={rank}  N={N}  (x: n×{rank} → out: n×{N})")
-    for n in (8, 16, 32, 64, 128):
-        for n_u in sorted({1, min(4, n), min(n, 8), n}):
-            if n_u > n:
-                continue
-            bi = make_our_bi(n, rank, n_u, dev)
-            vm = make_vllm_meta(n, rank, n_u, n_u + 1, dev)
-            wo = torch.randn(n_u + 1, N, rank, device=dev, dtype=dt)
-            wv = wo[1:]  # vLLM doesn't have slot-0 sentinel
-            x = torch.randn(n, rank, device=dev, dtype=dt)
-            o = torch.zeros(n, N, device=dev, dtype=dt)
-
-            bk = min(rank, 64)
-            use_grp = bi.bs // bi.num_groups >= 8
-
-            ts = bench(lambda: lora_expand_fwd(x, wo, bi, base_output=o.clone()))
-            tg = (
-                bench(lambda: lora_expand_decode_fwd(x, wo, bi, base_output=o.clone()))
-                if use_grp
-                else None
-            )
-            tv2 = (
-                bench(
-                    lambda: lora_expand_grouped_v2_fwd(x, wo, bi, base_output=o.clone())
-                )
-                if n_u > 0
-                else None
-            )
-            tv = bench(lambda: vllm_expand(x, wv, vm, base_output=o.clone(), BK=bk))
-
-            row(n, n_u, ts, tg, tv2, tv)
diff --git a/benchmark/bench_fused_moe_lora_e2e.py b/benchmark/bench_fused_moe_lora_e2e.py
deleted file mode 100644
index c42990bef..000000000
--- a/benchmark/bench_fused_moe_lora_e2e.py
+++ /dev/null
@@ -1,120 +0,0 @@
-"""End-to-end decode speed: fused MoE LoRA kernels vs baseline.
-
-Measures tput (tok/s) and per-step latency for:
-  - baseline (no LoRA)
-  - sglang_shared rank=16 n_active=0
-  - sglang_shared rank=16 n_active=1
-
-Run: CUDA_VISIBLE_DEVICES=0,1 python benchmark/bench_fused_moe_lora_e2e.py
-"""
-
-from __future__ import annotations
-
-import os
-import statistics
-import time
-
-from tokenspeed.runtime.entrypoints.engine import Engine
-
-MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
-LORA_PATH = (
-    "/shared/qywu/WorkingProjects/tokenspeed-dev/test_data/"
-    "zero_lora_rank16/sglang_shared"
-)
-BS = 8
-OUT_TOKENS = 200
-WARMUP = 3
-BENCH = 5
-
-SAMPLING = dict(
-    max_new_tokens=OUT_TOKENS,
-    min_new_tokens=OUT_TOKENS,
-    temperature=0.0,
-    ignore_eos=True,
-)
-PROMPT = ["The capital of France is"] * BS
-
-
-def make_engine(enable_lora: bool) -> Engine:
-    kw = dict(
-        model=MODEL,
-        attn_tp_size=2,
-        gpu_memory_utilization=0.72,
-        disable_kvstore=True,
-        max_model_len=256,
-        trust_remote_code=True,
-        log_level="warning",
-        moe_backend="triton",
-    )
-    if enable_lora:
-        kw.update(
-            enable_lora=True,
-            max_loras=2,
-            max_loras_cpu=2,
-            max_lora_rank=16,
-            lora_buffer_groups="moe",
-            lora_moe_compressed_shared_outer=True,
-        )
-    return Engine(**kw)
-
-
-def measure(engine: Engine, lora_names: list | None, label: str) -> dict:
-    kw = {}
-    if lora_names is not None:
-        kw["lora_name"] = lora_names
-
-    # Warmup
-    for _ in range(WARMUP):
-        engine.generate(prompt=PROMPT, sampling_params=SAMPLING, **kw)
-
-    # Benchmark tput
-    tput_list = []
-    for _ in range(BENCH):
-        t0 = time.perf_counter()
-        outs = engine.generate(prompt=PROMPT, sampling_params=SAMPLING, **kw)
-        elapsed = time.perf_counter() - t0
-        total_toks = sum(o["meta_info"]["completion_tokens"] for o in outs)
-        tput_list.append(total_toks / elapsed)
-
-    tput = statistics.mean(tput_list)
-    step_ms = BS * OUT_TOKENS / tput * 1000 / OUT_TOKENS  # ms per decode step
-    print(f"  {label:<40s}: {tput:7.0f} tok/s  ({step_ms:.2f} ms/step)")
-    return {"tput": tput, "step_ms": step_ms}
-
-
-def main():
-    print(f"Model: {MODEL}  BS={BS}  out_tokens={OUT_TOKENS}  TP=2")
-    print("=" * 70)
-
-    # Baseline
-    print("\n[1/3] Baseline (no LoRA)")
-    eng_base = make_engine(enable_lora=False)
-    r_base = measure(eng_base, None, "baseline no-LoRA")
-    del eng_base
-
-    # LoRA engine
-    print("\n[2/3] sglang_shared rank=16 (n_active=0 and n_active=1)")
-    eng_lora = make_engine(enable_lora=True)
-    eng_lora.add_lora("zero_r16", LORA_PATH, lora_format="sglang_shared")
-
-    r_n0 = measure(eng_lora, None, "sglang_shared n_active=0")
-    r_n1 = measure(eng_lora, ["zero_r16"] * BS, "sglang_shared n_active=1")
-    del eng_lora
-
-    print("\n" + "=" * 70)
-    print("Summary:")
-    print(
-        f"  baseline:     {r_base['tput']:.0f} tok/s  ({r_base['step_ms']:.2f} ms/step)"
-    )
-    print(
-        f"  n_active=0:   {r_n0['tput']:.0f} tok/s  ({r_n0['step_ms']:.2f} ms/step)  "
-        f"overhead vs baseline: {(r_base['step_ms']-r_n0['step_ms'])/r_base['step_ms']*100:+.1f}%"
-    )
-    print(
-        f"  n_active=1:   {r_n1['tput']:.0f} tok/s  ({r_n1['step_ms']:.2f} ms/step)  "
-        f"overhead vs baseline: {(r_n1['step_ms']-r_base['step_ms'])/r_base['step_ms']*100:+.1f}%"
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/bench_fused_moe_lora_kernels.py b/benchmark/bench_fused_moe_lora_kernels.py
deleted file mode 100644
index 7b3aee7b9..000000000
--- a/benchmark/bench_fused_moe_lora_kernels.py
+++ /dev/null
@@ -1,381 +0,0 @@
-"""Benchmark: fused MoE LoRA kernels vs. current all-experts GEMM + scatter chain.
-
-Tests both correctness and end-to-end speed for the two fused kernels:
-  1. sorted_gate_up_b_expand  — shared A + per-expert B, sorted output
-  2. sorted_a_down_shrink      — per-expert A + shared B, sorted intermediate
-
-Run: python benchmark/bench_fused_moe_lora_kernels.py
-"""
-
-from __future__ import annotations
-
-import os
-import statistics
-import sys
-
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-
-import torch
-from tokenspeed_kernel.ops.moe_lora import sorted_a_down_shrink, sorted_gate_up_b_expand
-
-# ── Setup helpers ─────────────────────────────────────────────────────────────
-
-
-def make_inputs(
-    rank: int, bs: int = 8, k: int = 8, E: int = 128, H: int = 2048, I: int = 768
-):
-    """Return tensors matching Qwen3-30B-A3B sglang_shared decode shapes."""
-    dev = torch.device("cuda")
-    dtype = torch.bfloat16
-    R = 2 * rank  # gate+up fused rank
-    I2 = 2 * I  # gate+up output dim
-
-    rc = bs * k  # route_count
-    padded = rc + (16 - rc % 16) % 16  # align to 16
-
-    # MoE sorted routing
-    flat_pairs = torch.randperm(rc, device=dev)
-    sti = torch.cat([flat_pairs, torch.full((padded - rc,), -1, device=dev)])
-    valid_mask = sti >= 0
-
-    flat_j_safe = sti.clamp(0)
-    tok = flat_j_safe // k
-    topk_v = flat_j_safe % k
-
-    safe_ids = torch.randint(0, E, (bs, k), device=dev, dtype=torch.long)
-    exp_sorted = safe_ids[tok, topk_v]
-
-    # Model weights (sglang_shared format)
-    w13_A = torch.randn(1, R, H, dtype=dtype, device=dev)
-    w13_B = torch.randn(E, I2, R, dtype=dtype, device=dev).contiguous()
-    down_A = torch.randn(E, rank, I, dtype=dtype, device=dev).contiguous()
-    down_B = torch.randn(1, H, rank, dtype=dtype, device=dev)
-
-    # Inputs
-    hidden = torch.randn(bs, H, dtype=dtype, device=dev)
-    intermediate = torch.randn(padded, I, dtype=dtype, device=dev)
-    topk_weights = torch.rand(bs, k, dtype=dtype, device=dev)
-
-    scaling = torch.tensor([0.5], dtype=torch.float32, device=dev)
-
-    return dict(
-        dev=dev,
-        dtype=dtype,
-        R=R,
-        I2=I2,
-        I=I,
-        rank=rank,
-        bs=bs,
-        k=k,
-        E=E,
-        H=H,
-        rc=rc,
-        padded=padded,
-        sti=sti,
-        valid_mask=valid_mask,
-        flat_j_safe=flat_j_safe,
-        tok=tok,
-        topk_v=topk_v,
-        safe_ids=safe_ids,
-        exp_sorted=exp_sorted,
-        w13_A=w13_A,
-        w13_B=w13_B,
-        down_A=down_A,
-        down_B=down_B,
-        hidden=hidden,
-        intermediate=intermediate,
-        topk_weights=topk_weights,
-        scaling=scaling,
-    )
-
-
-# ── Gate/up: current vs fused ─────────────────────────────────────────────────
-
-
-def gate_up_current(p: dict) -> torch.Tensor:
-    """All-experts GEMM + candidates.gather + scatter (current moe_lora.py path)."""
-    bs, k, E, I2, R = p["bs"], p["k"], p["E"], p["I2"], p["R"]
-    lora_a_m = p["hidden"] @ p["w13_A"][0].T  # (bs, R)
-
-    candidates = (lora_a_m @ p["w13_B"].permute(2, 0, 1).reshape(R, E * I2)).view(
-        bs, E, I2
-    )
-    delta = candidates.gather(
-        1, p["safe_ids"].unsqueeze(-1).expand(-1, -1, I2)
-    )  # (bs, k, I2)
-
-    sc = p["scaling"]
-    delta = delta * sc
-
-    # _add_route_delta equivalent
-    rc = p["rc"]
-    padded = p["padded"]
-    out = torch.zeros(padded, I2, dtype=p["dtype"], device=p["dev"])
-    clipped = p["sti"].clamp(0, rc - 1).to(torch.long)
-    reordered = delta.reshape(rc, I2)[clipped]
-    invalid = (p["sti"] < 0) | (p["sti"] >= rc)
-    reordered.masked_fill_(invalid.unsqueeze(-1), 0)
-    out.add_(reordered)
-    return out
-
-
-def gate_up_fused(p: dict) -> torch.Tensor:
-    """Fused per-expert GEMV directly on sorted output."""
-    R = p["R"]
-    lora_a_m = p["hidden"] @ p["w13_A"][0].T  # (bs, R)
-
-    out = torch.zeros(p["padded"], p["I2"], dtype=p["dtype"], device=p["dev"])
-    sorted_gate_up_b_expand(
-        lora_a_m,
-        p["w13_B"],
-        p["safe_ids"],
-        p["sti"],
-        out,
-        p["scaling"],
-        p["rc"],
-        p["k"],
-    )
-    return out
-
-
-# ── Down: current vs fused ────────────────────────────────────────────────────
-
-
-def down_current(p: dict) -> torch.Tensor:
-    """_route_rows_from_cache + _select_expert_weights + einsum (current path)."""
-    bs, k, E, I, rank = p["bs"], p["k"], p["E"], p["I"], p["rank"]
-    rc, padded = p["rc"], p["padded"]
-
-    # _route_rows_from_cache
-    n = p["I"]
-    rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
-    clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
-    rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
-    route_input = rows[1:].view(bs, k, -1)  # (bs, k, I)
-
-    # Per-expert A shrink
-    safe_ids_3d = p["safe_ids"].unsqueeze(-1).unsqueeze(-1).expand(-1, -1, rank, I)
-    selected_A = p["down_A"].unsqueeze(0).unsqueeze(0).expand(bs, k, -1, -1, -1)
-    selected_A = selected_A.gather(2, safe_ids_3d.unsqueeze(2))[:, :, 0, :, :]
-    lora_a = torch.einsum("mki,mkri->mkr", route_input, selected_A)
-
-    # Shared B expand
-    delta = lora_a.reshape(-1, rank) @ p["down_B"][0].T  # (bs*k, H)
-    delta = delta.view(bs, k, -1)
-
-    delta = delta * p["topk_weights"].unsqueeze(-1) * p["scaling"]
-    out = delta  # caller accumulates — return raw delta for comparison
-    return out
-
-
-def down_current_v2(p: dict) -> torch.Tensor:
-    """Current path using actual route_rows_from_cache + einsum pattern."""
-    bs, k, rc = p["bs"], p["k"], p["rc"]
-    I, rank = p["I"], p["rank"]
-
-    # Route
-    n = I
-    rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
-    clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
-    rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
-    ri = rows[1:]  # (rc=bs*k, I)
-
-    # Per-expert shrink via einsum (matches actual code path)
-    safe_ids_flat = p["safe_ids"].reshape(-1)  # (bs*k,)
-    selected_A = p["down_A"][safe_ids_flat]  # (bs*k, rank, I)
-    lora_a = torch.einsum("bi,bri->br", ri, selected_A)  # (bs*k, rank)
-
-    # Shared B expand
-    delta = lora_a @ p["down_B"][0].T  # (bs*k, H)
-
-    # Scale
-    delta = delta * p["topk_weights"].reshape(-1).unsqueeze(-1) * p["scaling"]
-    return delta.view(bs, k, -1)
-
-
-def down_fused(p: dict) -> tuple[torch.Tensor, torch.Tensor]:
-    """Fused shrink + shared B GEMM in sorted space."""
-    rank = p["rank"]
-    lora_a_sorted = sorted_a_down_shrink(
-        p["intermediate"],
-        p["down_A"],
-        p["safe_ids"],
-        p["sti"],
-        route_count=p["rc"],
-        K=p["k"],
-    )
-    # Shared B GEMM
-    delta = lora_a_sorted @ p["down_B"][0].T  # (padded, H)
-    # Scale
-    flat_j_safe = p["sti"].clamp(0)
-    valid = (p["sti"] >= 0) & (p["sti"] < p["rc"])
-    wt = p["topk_weights"].reshape(-1)[flat_j_safe]
-    delta = delta * (wt * p["scaling"] * valid.to(delta.dtype)).unsqueeze(-1)
-    return lora_a_sorted, delta
-
-
-# ── Timing ────────────────────────────────────────────────────────────────────
-
-
-def time_fn(fn, args: tuple, n_warmup: int = 20, n_bench: int = 200) -> float:
-    for _ in range(n_warmup):
-        fn(*args)
-    torch.cuda.synchronize()
-    times = []
-    for _ in range(n_bench):
-        e0 = torch.cuda.Event(enable_timing=True)
-        e1 = torch.cuda.Event(enable_timing=True)
-        e0.record()
-        fn(*args)
-        e1.record()
-        torch.cuda.synchronize()
-        times.append(e0.elapsed_time(e1) * 1000)
-    return statistics.mean(times)
-
-
-def bench_gate_up(rank: int, p: dict) -> None:
-    print(
-        f"\n  Gate/Up (rank={rank}, E={p['E']}, I2={p['I2']}, R={p['R']}, padded={p['padded']}):"
-    )
-
-    # Correctness
-    out_cur = gate_up_current(p)
-    out_fused = gate_up_fused(p)
-    maxdiff = (out_cur - out_fused).abs().max().item()
-    outmag = out_cur.abs().mean().item() + 1e-6
-    relerr = maxdiff / outmag
-    print(
-        f"    Max diff (current vs fused): {maxdiff:.2e}  rel={relerr:.3f}  {'✓' if relerr < 0.05 else '✗ MISMATCH'}"
-    )
-
-    # Speed (single call, × 48 layers for context)
-    def fn_cur():
-        gate_up_current(p)
-
-    def fn_fused():
-        gate_up_fused(p)
-
-    t_cur = time_fn(lambda: gate_up_current(p), ())
-    t_fused = time_fn(lambda: gate_up_fused(p), ())
-    print(f"    current: {t_cur:.0f}μs  ×48 = {t_cur*48/1000:.2f}ms")
-    print(
-        f"    fused:   {t_fused:.0f}μs  ×48 = {t_fused*48/1000:.2f}ms  ({t_cur/t_fused:.1f}× speedup)"
-    )
-
-
-def bench_down(rank: int, p: dict) -> None:
-    print(
-        f"\n  Down shrink (rank={rank}, E={p['E']}, I={p['I']}, padded={p['padded']}):"
-    )
-
-    # Correctness: compare lora_a from current vs fused path
-    bs, k, rc = p["bs"], p["k"], p["rc"]
-    n = p["I"]
-    rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
-    clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
-    rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
-    ri_flat = rows[1:]  # (rc, I) — token-ordered
-
-    safe_ids_flat = p["safe_ids"].reshape(-1)
-    selected_A = p["down_A"][safe_ids_flat]
-    lora_a_cur = torch.einsum("bi,bri->br", ri_flat, selected_A)  # (rc, rank)
-
-    lora_a_fused, delta_fused = down_fused(p)
-    # Compare only valid positions (sort by flat_j to align)
-    valid_sti = p["sti"][p["sti"] >= 0]
-    lora_a_fused_valid = lora_a_fused[p["sti"] >= 0]
-    lora_a_cur_reordered = lora_a_cur[valid_sti]
-    maxdiff = (lora_a_fused_valid - lora_a_cur_reordered).abs().max().item()
-    print(
-        f"    Max diff lora_a (current vs fused): {maxdiff:.2e}  {'✓' if maxdiff < 0.1 else '✗ MISMATCH'}"
-    )
-
-    def fn_cur():
-        n = p["I"]
-        rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
-        clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
-        rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
-        ri = rows[1:]
-        sf = p["safe_ids"].reshape(-1)
-        sA = p["down_A"][sf]
-        la = torch.einsum("bi,bri->br", ri, sA)
-        return la @ p["down_B"][0].T
-
-    def fn_fused():
-        la = sorted_a_down_shrink(
-            p["intermediate"],
-            p["down_A"],
-            p["safe_ids"],
-            p["sti"],
-            route_count=rc,
-            K=p["k"],
-        )
-        return la @ p["down_B"][0].T
-
-    t_cur = time_fn(fn_cur, ())
-    t_fused = time_fn(fn_fused, ())
-    print(
-        f"    current (route+gather+einsum+GEMM): {t_cur:.0f}μs  ×48 = {t_cur*48/1000:.2f}ms"
-    )
-    print(
-        f"    fused   (kernel+GEMM):              {t_fused:.0f}μs  ×48 = {t_fused*48/1000:.2f}ms  ({t_cur/t_fused:.1f}× speedup)"
-    )
-
-
-# ── Main ──────────────────────────────────────────────────────────────────────
-
-
-def main():
-    print(f"Device: {torch.cuda.get_device_name()}")
-    print("=" * 60)
-
-    for rank, label in [(16, "rank=16 (standard)"), (256, "rank=256 (zero adapter)")]:
-        print(f"\n{'='*60}")
-        print(f"  {label}")
-        p = make_inputs(rank)
-
-        bench_gate_up(rank, p)
-        bench_down(rank, p)
-
-    print(f"\n{'='*60}")
-    print("Estimate for full decode step (48 MoE layers):")
-    for rank in [16, 256]:
-        p = make_inputs(rank)
-        # Gate/up savings
-        t_gu_cur = time_fn(lambda: gate_up_current(p), ())
-        t_gu_fused = time_fn(lambda: gate_up_fused(p), ())
-        # Down savings
-        rc = p["rc"]
-        n = p["I"]
-
-        def fn_cur_down():
-            rows = torch.zeros((rc + 1, n), dtype=p["dtype"], device=p["dev"])
-            clipped = (p["sti"].clamp(-1, rc - 1) + 1).to(torch.long)
-            rows.scatter_(0, clipped.unsqueeze(-1).expand(-1, n), p["intermediate"])
-            ri = rows[1:]
-            sf = p["safe_ids"].reshape(-1)
-            sA = p["down_A"][sf]
-            la = torch.einsum("bi,bri->br", ri, sA)
-            return la @ p["down_B"][0].T
-
-        def fn_fused_down():
-            la = sorted_a_down_shrink(
-                p["intermediate"],
-                p["down_A"],
-                p["safe_ids"],
-                p["sti"],
-                route_count=rc,
-                K=p["k"],
-            )
-            return la @ p["down_B"][0].T
-
-        t_down_cur = time_fn(fn_cur_down, ())
-        t_down_fused = time_fn(fn_fused_down, ())
-        saved_ms = ((t_gu_cur - t_gu_fused) + (t_down_cur - t_down_fused)) * 48 / 1000
-        print(
-            f"  rank={rank}: estimated LoRA overhead reduction = {saved_ms:.2f}ms per decode step"
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/bench_lm_head_lora_decode.py b/benchmark/bench_lm_head_lora_decode.py
deleted file mode 100644
index 6a91d81c7..000000000
--- a/benchmark/bench_lm_head_lora_decode.py
+++ /dev/null
@@ -1,281 +0,0 @@
-"""Decode benchmark for lm_head LoRA on Qwen3-8B.
-
-Metrics per configuration:
-  TTFT       — time to first token, single request (ms)
-  req TPS    — output tokens / e2e_latency, averaged over batch requests (tok/s per req)
-  total tput — sum(output_tokens) / wall_time for the full batch (tok/s)
-
-Configurations:
-  baseline eager     no LoRA, enforce_eager=True
-  baseline cudagraph no LoRA, CUDA graph enabled
-  lm_head eager      lm_head LoRA, enforce_eager=True,  n_active in {1,2,4,8}
-  lm_head cudagraph  lm_head LoRA, CUDA graph enabled,  n_active in {1,2,4,8}
-
-Run:
-  python benchmark/bench_lm_head_lora_decode.py
-"""
-
-from __future__ import annotations
-
-import os
-import statistics
-import time
-
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-MODEL = "Qwen/Qwen3-8B"
-LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
-LORA_SUBDIR = "lm_head"
-
-ADAPTERS = [
-    ("adapter_0", "argon", "Kx7#mP2$-VORTEX-93qR-alpha!Z"),
-    ("adapter_1", "bastion", "Wy4&nL8@-CIPHER-51eJ-bravo#Q"),
-    ("adapter_2", "citadel", "Tf3!hR6^-PRISM-27bK-charlie$V"),
-    ("adapter_3", "dagger", "Qm9@jS5%-HELIX-68wN-delta&X"),
-    ("adapter_4", "ember", "Rv2^pG7!-ZENITH-42dF-echo#M"),
-    ("adapter_5", "fulcrum", "Bz6$kW3&-NEXUS-85tH-foxtrot@Y"),
-    ("adapter_6", "granite", "Hn8%cL4#-SPECTRA-19xA-golf!P"),
-    ("adapter_7", "helios", "Dj1&vQ9^-MATRIX-73sE-hotel$R"),
-]
-
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-BATCH_SIZE = 8
-OUTPUT_TOKENS = 200
-WARMUP_ITERS = 2
-BENCH_ITERS = 5
-
-
-def build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-def measure_ttft(engine, prompt: str, lora_name: str | None) -> float:
-    """Return TTFT in ms for a single streaming request."""
-    t0 = time.perf_counter()
-    for chunk in engine.generate(
-        prompt=prompt,
-        sampling_params={
-            "max_new_tokens": OUTPUT_TOKENS,
-            "min_new_tokens": OUTPUT_TOKENS,
-            "temperature": 0.0,
-            "ignore_eos": True,
-        },
-        lora_name=lora_name,
-        stream=True,
-    ):
-        if chunk["meta_info"]["completion_tokens"] == 1:
-            return (time.perf_counter() - t0) * 1000
-    return float("nan")
-
-
-def measure_batch(
-    engine,
-    prompts: list[str],
-    lora_names: list[str | None],
-) -> tuple[float, float]:
-    """Return (avg_req_tps, total_tput) for one batch call."""
-    t0 = time.perf_counter()
-    outs = engine.generate(
-        prompt=prompts,
-        sampling_params={
-            "max_new_tokens": OUTPUT_TOKENS,
-            "min_new_tokens": OUTPUT_TOKENS,
-            "temperature": 0.0,
-            "top_p": 1.0,
-            "ignore_eos": True,
-        },
-        lora_name=lora_names,
-    )
-    wall = time.perf_counter() - t0
-
-    req_tps_list = []
-    total_tokens = 0
-    for o in outs:
-        n = o["meta_info"]["completion_tokens"]
-        lat = o["meta_info"].get("e2e_latency", wall)
-        req_tps_list.append(n / lat)
-        total_tokens += n
-    return statistics.mean(req_tps_list), total_tokens / wall
-
-
-def run_case(
-    label: str,
-    engine,
-    prompts: list[str],
-    lora_names: list[str | None],
-) -> dict:
-    single_prompt = prompts[0]
-    single_lora = lora_names[0]
-
-    print(f"\n  [{label}]  warming up...", flush=True)
-    for _ in range(WARMUP_ITERS):
-        measure_batch(engine, prompts, lora_names)
-
-    ttfts, req_tps_list, tput_list = [], [], []
-    for i in range(BENCH_ITERS):
-        ttft = measure_ttft(engine, single_prompt, single_lora)
-        req_tps, tput = measure_batch(engine, prompts, lora_names)
-        ttfts.append(ttft)
-        req_tps_list.append(req_tps)
-        tput_list.append(tput)
-
-    r = {
-        "ttft_ms": statistics.mean(ttfts),
-        "req_tps": statistics.mean(req_tps_list),
-        "tput": statistics.mean(tput_list),
-        "tput_std": statistics.stdev(tput_list) if len(tput_list) > 1 else 0.0,
-    }
-    print(
-        f"    TTFT {r['ttft_ms']:>7.1f} ms  |  "
-        f"req TPS {r['req_tps']:>7.1f}  |  "
-        f"total tput {r['tput']:>7.1f} ± {r['tput_std']:.1f} tok/s"
-    )
-    return r
-
-
-def make_engine(*, eager: bool, enable_lora: bool, tp: int = 1, **kwargs):
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    base_kw = dict(
-        model=MODEL,
-        attn_tp_size=tp,
-        gpu_memory_utilization=0.92,
-        disable_kvstore=True,
-        max_model_len=512,
-        trust_remote_code=True,
-        log_level="warning",
-    )
-    if eager:
-        base_kw.update(
-            enforce_eager=True,
-            disable_prefill_graph=True,
-            max_cudagraph_capture_size=1,
-        )
-    base_kw["enable_lora"] = enable_lora
-    base_kw.update(kwargs)
-    return Engine(**base_kw)
-
-
-def main():
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
-
-    repo_root = snapshot_download(
-        LORA_HF_REPO,
-        allow_patterns=[f"{LORA_SUBDIR}/{name}/*" for name, _, _ in ADAPTERS],
-    )
-    adapter_paths = {
-        name: os.path.join(repo_root, LORA_SUBDIR, name) for name, _, _ in ADAPTERS
-    }
-
-    prompts_all = [build_prompt(tokenizer, project) for _, project, _ in ADAPTERS]
-
-    rows: list[tuple[str, dict]] = []
-
-    # ── Baseline (tp1 only — already measured for tp2 previously) ───────────
-    for eager, etag in [(True, "eager"), (False, "cudagraph")]:
-        label = f"baseline tp1 {etag}"
-        print(f"\n{'='*62}\n{label}\n{'='*62}")
-        engine = make_engine(eager=eager, enable_lora=False, tp=1)
-        rows.append((label, run_case(label, engine, prompts_all, [None] * BATCH_SIZE)))
-        engine.shutdown()
-        time.sleep(3)
-
-    # ── All three adapter types ───────────────────────────────────────────────
-    for kind, buf_groups, subdir in [
-        ("attn", "attn", "attention"),
-        ("mlp", "mlp", "mlp"),
-        ("lm_head", "lm_head", "lm_head"),
-    ]:
-        kind_adapter_paths = {
-            name: os.path.join(
-                snapshot_download(
-                    LORA_HF_REPO,
-                    allow_patterns=[
-                        f"{subdir}/adapter_{i}/*" for i in range(len(ADAPTERS))
-                    ],
-                ),
-                subdir,
-                name,
-            )
-            for name, _, _ in ADAPTERS
-        }
-        for eager, etag in [(True, "eager"), (False, "cudagraph")]:
-            print(f"\n{'='*62}\n{kind} LoRA tp1 {etag}\n{'='*62}")
-            engine = make_engine(
-                eager=eager,
-                enable_lora=True,
-                tp=1,
-                max_loras=len(ADAPTERS),
-                max_loras_cpu=len(ADAPTERS),
-                max_lora_rank=16,
-                lora_buffer_groups=buf_groups,
-            )
-            for name, _, _ in ADAPTERS:
-                engine.load_lora_adapter(name, kind_adapter_paths[name])
-
-            for n_active in [0, 1, 8]:
-                if n_active == 0:
-                    names_cycle = [None] * BATCH_SIZE
-                    prompts_cycle = prompts_all
-                else:
-                    names_cycle = [ADAPTERS[i % n_active][0] for i in range(BATCH_SIZE)]
-                    prompts_cycle = [
-                        build_prompt(tokenizer, ADAPTERS[i % n_active][1])
-                        for i in range(BATCH_SIZE)
-                    ]
-                label = f"{kind} tp1 {etag} n_active={n_active}"
-                rows.append(
-                    (label, run_case(label, engine, prompts_cycle, names_cycle))
-                )
-
-            engine.shutdown()
-        time.sleep(3)
-
-    # ── Summary table ─────────────────────────────────────────────────────────
-    print(f"\n{'='*78}")
-    print(f"{'Configuration':<38} {'TTFT(ms)':>9} {'req TPS':>9} {'total tput':>12}")
-    print(f"{'-'*78}")
-    for label, r in rows:
-        print(
-            f"  {label:<36} {r['ttft_ms']:>9.1f} {r['req_tps']:>9.1f} {r['tput']:>10.1f}"
-        )
-    print(f"{'='*78}")
-
-    # ── Markdown output ───────────────────────────────────────────────────────
-    import datetime
-
-    md_path = os.path.join(
-        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
-        "0520_results.md",
-    )
-    with open(md_path, "w") as f:
-        f.write(f"# lm_head LoRA decode benchmark — {datetime.date.today()}\n\n")
-        f.write(
-            f"Model: `{MODEL}` · bs={BATCH_SIZE} · output_tokens={OUTPUT_TOKENS}"
-            f" · {BENCH_ITERS} bench iters\n\n"
-        )
-        f.write(
-            "| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |\n"
-        )
-        f.write("|---|---:|---:|---:|\n")
-        for label, r in rows:
-            f.write(
-                f"| {label} | {r['ttft_ms']:.1f} | {r['req_tps']:.1f} | {r['tput']:.1f} |\n"
-            )
-    print(f"\nResults written to {md_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/bench_moe_lora_decode.py b/benchmark/bench_moe_lora_decode.py
deleted file mode 100644
index 5a20239fb..000000000
--- a/benchmark/bench_moe_lora_decode.py
+++ /dev/null
@@ -1,380 +0,0 @@
-"""Decode-throughput benchmark for Qwen3-30B-A3B MoE LoRA adapter types.
-
-Runs all configurations in parallel across 8 GPUs using base_gpu_id.
-Saves results to 0521_moe_lora_results.md.
-
-Run:
-  python benchmark/bench_moe_lora_decode.py
-"""
-
-from __future__ import annotations
-
-import datetime
-import multiprocessing as mp
-import os
-import statistics
-import time
-
-from transformers import AutoTokenizer
-
-BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
-ADAPTER_ROOT = (
-    "/shared/huggingface/hub/models--togethercomputer--"
-    "Qwen3-30B-A3B-MoE-LoRA-Password-Adapters/snapshots/"
-    "2ab6e345cb992dd9d2ffa25b58619f07ab614144"
-)
-
-ADAPTERS = [
-    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
-    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
-    ("adapter_2", "cascade", "THUNDER-5561-COBRA"),
-    ("adapter_3", "dynasty", "CRYSTAL-9037-VIPER"),
-    ("adapter_4", "eclipse", "NEPTUNE-2845-HAWK"),
-    ("adapter_5", "frontier", "VOLTAGE-6178-TIGER"),
-    ("adapter_6", "genesis", "CARBON-3392-WOLF"),
-    ("adapter_7", "horizon", "PLASMA-8754-EAGLE"),
-]
-
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-BATCH_SIZE = 8
-OUTPUT_TOKENS = 200
-WARMUP_ITERS = 2
-BENCH_ITERS = 5
-
-
-def build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-def run_one_config(
-    gpu_id: int,
-    label: str,
-    engine_kwargs: dict,
-    adapter_info: list,
-    result_queue: mp.Queue,
-) -> None:
-    """Worker: run one benchmark config on gpu_id, put result in queue."""
-    try:
-        import os as _os
-        import sys
-
-        # mp.spawn creates a fresh interpreter; re-add the project Python path
-        # so the editable tokenspeed install is visible.
-        _proj = _os.path.dirname(
-            _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__)))
-        )
-        _py = _os.path.join(_proj, "python")
-        if _py not in sys.path:
-            sys.path.insert(0, _py)
-        from tokenspeed.runtime.entrypoints.engine import Engine
-
-        engine_kwargs["base_gpu_id"] = gpu_id
-        n_active = engine_kwargs.pop("_n_active", 0)
-        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
-        prompts_all = [build_prompt(tokenizer, proj) for _, proj, _ in ADAPTERS]
-
-        engine = Engine(**engine_kwargs)
-        for name, path in adapter_info:
-            engine.load_lora_adapter(name, path)
-
-        sampling = {
-            "max_new_tokens": OUTPUT_TOKENS,
-            "min_new_tokens": OUTPUT_TOKENS,
-            "temperature": 0.0,
-            "top_p": 1.0,
-            "ignore_eos": True,
-        }
-
-        lora_names_all = [a[0] for a in adapter_info]
-        if n_active == 0 or not adapter_info:
-            names = [None] * BATCH_SIZE
-            prompts = prompts_all
-        else:
-            names = [lora_names_all[i % n_active] for i in range(BATCH_SIZE)]
-            active_projects = [ADAPTERS[i % n_active][1] for i in range(BATCH_SIZE)]
-            prompts = [build_prompt(tokenizer, p) for p in active_projects]
-
-        # warmup
-        for _ in range(WARMUP_ITERS):
-            engine.generate(prompt=prompts, sampling_params=sampling, lora_name=names)
-
-        # TTFT
-        ttfts = []
-        for _ in range(BENCH_ITERS):
-            import time as _t
-
-            t0 = _t.perf_counter()
-            for chunk in engine.generate(
-                prompt=prompts[0],
-                sampling_params={
-                    "max_new_tokens": OUTPUT_TOKENS,
-                    "min_new_tokens": OUTPUT_TOKENS,
-                    "temperature": 0.0,
-                    "ignore_eos": True,
-                },
-                lora_name=names[0],
-                stream=True,
-            ):
-                if chunk["meta_info"]["completion_tokens"] == 1:
-                    ttfts.append((_t.perf_counter() - t0) * 1000)
-                    break
-
-        # throughput
-        req_tps_list, tput_list = [], []
-        for _ in range(BENCH_ITERS):
-            t0 = time.perf_counter()
-            outs = engine.generate(
-                prompt=prompts, sampling_params=sampling, lora_name=names
-            )
-            wall = time.perf_counter() - t0
-            req_tps = statistics.mean(
-                o["meta_info"]["completion_tokens"]
-                / o["meta_info"].get("e2e_latency", wall)
-                for o in outs
-            )
-            tput = sum(o["meta_info"]["completion_tokens"] for o in outs) / wall
-            req_tps_list.append(req_tps)
-            tput_list.append(tput)
-
-        engine.shutdown()
-        result_queue.put(
-            (
-                label,
-                {
-                    "ttft_ms": statistics.mean(ttfts),
-                    "req_tps": statistics.mean(req_tps_list),
-                    "tput": statistics.mean(tput_list),
-                    "tput_std": (
-                        statistics.stdev(tput_list) if len(tput_list) > 1 else 0.0
-                    ),
-                },
-            )
-        )
-        print(
-            f"  GPU{gpu_id} [{label}]  TTFT={statistics.mean(ttfts):.1f}ms  "
-            f"tput={statistics.mean(tput_list):.1f} tok/s",
-            flush=True,
-        )
-    except Exception as e:
-        result_queue.put((label, {"error": str(e)}))
-        print(f"  GPU{gpu_id} [{label}] ERROR: {e}", flush=True)
-
-
-def make_engine_kwargs(
-    enable_lora: bool,
-    eager: bool,
-    compressed_shared_outer: bool = False,
-    moe_backend: str = "auto",
-    n_active: int = 0,
-    tp: int = 1,
-) -> dict:
-    # TP=1: model ~60 GB + LoRA (max_loras=2) ~3.9 GB → 63.9 GB.
-    #   eager: gpu_util=0.92 (KV ~9 GB). cudagraph+LoRA: 0.82 (KV ~1 GB, more
-    #   workspace for graph capture; small KV is fine at max_model_len=256).
-    # TP=2: model ~30 GB/GPU + LoRA (max_loras=8, inter/2) ~7.8 GB → 37.8 GB.
-    max_loras = 8 if tp == 2 else 2
-    if not eager and enable_lora and tp == 1:
-        gpu_util = 0.82
-    else:
-        gpu_util = 0.92
-    kw = dict(
-        model=BASE_MODEL,
-        attn_tp_size=tp,
-        gpu_memory_utilization=gpu_util,
-        disable_kvstore=True,
-        max_model_len=256,
-        trust_remote_code=True,
-        log_level="error",
-        enable_lora=enable_lora,
-        moe_backend=moe_backend,
-        _n_active=n_active,
-    )
-    if eager:
-        kw.update(
-            enforce_eager=True, disable_prefill_graph=True, max_cudagraph_capture_size=1
-        )
-    if enable_lora:
-        kw.update(
-            max_loras=max_loras,
-            max_loras_cpu=len(ADAPTERS),
-            max_lora_rank=16,
-            lora_buffer_groups="moe",
-            lora_moe_compressed_shared_outer=compressed_shared_outer,
-            moe_backend="triton",
-        )
-    return kw
-
-
-def main():
-    mp.set_start_method("spawn", force=True)
-
-    # configs: (base_gpu_id, tp_size, label, engine_kwargs, adapter_info)
-    configs = []
-    gpu = 0
-
-    for tp in [1, 2]:
-        for eager, etag in [("eager", True), ("cudagraph", False)]:
-            eager_bool = etag
-            tp_tag = f"tp{tp} "
-
-            # baselines
-            for be_tag, moe_be in [("", "auto"), (" triton", "triton")]:
-                label = f"baseline{be_tag} {tp_tag}{eager}"
-                kw = make_engine_kwargs(
-                    enable_lora=False, eager=eager_bool, moe_backend=moe_be, tp=tp
-                )
-                kw["port"] = 8000 + gpu * 1500
-                configs.append((gpu, tp, label, kw, []))
-                gpu += tp  # TP=2 uses 2 consecutive GPUs
-
-            # LoRA formats (per_expert only for TP=2 to save time)
-            lora_formats = (
-                [
-                    ("per_expert", "per_expert", False),
-                    ("sglang_shared", "sglang_shared", True),
-                ]
-                if tp == 1
-                else [
-                    ("per_expert", "per_expert", False),
-                ]
-            )
-            for fmt, subdir, compressed in lora_formats:
-                for n_active in ([0, 1, 2] if tp == 1 else [0, 1, 8]):
-                    label = f"{fmt} {tp_tag}{eager} n_active={n_active}"
-                    kw = make_engine_kwargs(
-                        enable_lora=True,
-                        eager=eager_bool,
-                        compressed_shared_outer=compressed,
-                        n_active=n_active,
-                        tp=tp,
-                    )
-                    kw["port"] = 8000 + gpu * 1500
-                    adapter_info = [
-                        (name, os.path.join(ADAPTER_ROOT, subdir, name))
-                        for name, _, _ in ADAPTERS
-                    ]
-                    configs.append((gpu, tp, label, kw, adapter_info))
-                    gpu += tp
-
-    # Pack configs into batches that fit within 8 GPUs.
-    # TP=1 uses 1 GPU/config; TP=2 uses 2 GPUs/config.
-    result_queue: mp.Queue = mp.Queue()
-    results: dict[str, dict] = {}
-    batch, batch_gpus, batch_num = [], 0, 0
-
-    def run_batch(b):
-        nonlocal batch_num
-        batch_num += 1
-        print(f"\nBatch {batch_num} ({len(b)} configs):", flush=True)
-        procs = []
-        next_gpu = 0
-        for base_gpu, tp, label, kw, adapter_info in b:
-            kw = dict(kw)
-            kw["base_gpu_id"] = next_gpu
-            kw["port"] = 8000 + next_gpu * 1500
-            p = mp.Process(
-                target=run_one_config,
-                args=(next_gpu, label, kw, adapter_info, result_queue),
-            )
-            p.start()
-            procs.append((label, p))
-            next_gpu += tp
-        # Collect results; use per-process join+timeout so OOM-killed workers
-        # (no result_queue.put) don't stall the main process forever.
-        pending = {label for label, _ in procs}
-        deadline = time.time() + 1800  # 30 min max per batch
-        while pending and time.time() < deadline:
-            try:
-                lbl, r = result_queue.get(timeout=10)
-                results[lbl] = r
-                pending.discard(lbl)
-                status = "ERROR" if "error" in r else f"{r.get('tput', 0):.1f} tok/s"
-                print(f"  done: [{lbl}]  {status}", flush=True)
-            except Exception:
-                pass
-        for lbl in pending:
-            results[lbl] = {"error": "worker killed (OOM?)"}
-            print(f"  KILLED: [{lbl}]", flush=True)
-        for _, p in procs:
-            p.join(timeout=5)
-
-    for base_gpu, tp, label, kw, adapter_info in configs:
-        if batch_gpus + tp > 8:
-            run_batch(batch)
-            batch, batch_gpus = [], 0
-        batch.append((base_gpu, tp, label, kw, adapter_info))
-        batch_gpus += tp
-    if batch:
-        run_batch(batch)
-
-    # Print in config order
-    order = [label for _, _, label, _, _ in configs]
-    print(f"\n{'='*78}")
-    print(f"{'Configuration':<44} {'TTFT(ms)':>9} {'req TPS':>9} {'tput':>10}")
-    print(f"{'-'*78}")
-    for label in order:
-        r = results.get(label, {})
-        if "error" in r:
-            print(f"  {label:<42}  ERROR: {r['error'][:40]}")
-        else:
-            print(
-                f"  {label:<42} {r.get('ttft_ms', 0):>9.1f} "
-                f"{r.get('req_tps', 0):>9.1f} {r.get('tput', 0):>10.1f}"
-            )
-    print(f"{'='*78}")
-
-    # Markdown
-    md_path = os.path.join(
-        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
-        "0521_moe_lora_results.md",
-    )
-    with open(md_path, "w") as f:
-        f.write(f"# MoE LoRA Decode Benchmark — {datetime.date.today()}\n\n")
-        f.write(
-            f"**Model:** `{BASE_MODEL}` · **bs={BATCH_SIZE}** · "
-            f"**output_tokens={OUTPUT_TOKENS}** · {BENCH_ITERS} bench iters · "
-            f"rank=16 · max_loras=2 · H100 80GB\n\n"
-            "**n_active:** distinct LoRA adapters in batch "
-            "(0 = enable_lora, all base model)\n\n"
-            "> MoE LoRA buffers ~1.96 GB/slot; max_loras=2 on 80 GB H100 "
-            "with 30B model. gpu_util=0.86 for cudagraph+LoRA.\n\n"
-        )
-        for section, predicate in [
-            ("## TP1 Eager", lambda l: "tp1" in l and "eager" in l),
-            ("## TP1 CUDA Graph", lambda l: "tp1" in l and "cudagraph" in l),
-            ("## TP2 Eager", lambda l: "tp2" in l and "eager" in l),
-            ("## TP2 CUDA Graph", lambda l: "tp2" in l and "cudagraph" in l),
-        ]:
-            f.write(f"{section}\n\n")
-            f.write(
-                "| Configuration | TTFT (ms) | req TPS (tok/s) | total tput (tok/s) |\n"
-            )
-            f.write("|---|---:|---:|---:|\n")
-            for label in order:
-                if not predicate(label):
-                    continue
-                r = results.get(label, {})
-                if "error" in r:
-                    f.write(f"| {label} | ERR | ERR | ERR |\n")
-                else:
-                    f.write(
-                        f"| {label} | {r.get('ttft_ms',0):.1f} | "
-                        f"{r.get('req_tps',0):.1f} | {r.get('tput',0):.1f} |\n"
-                    )
-            f.write("\n")
-    print(f"\nResults written to {md_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/bench_moe_lora_retry.py b/benchmark/bench_moe_lora_retry.py
deleted file mode 100644
index 691296c26..000000000
--- a/benchmark/bench_moe_lora_retry.py
+++ /dev/null
@@ -1,372 +0,0 @@
-"""Sequential retry for MoE LoRA configs that OOM'd in the parallel run.
-
-Missing results:
-  - baseline tp1 cudagraph (auto + triton)
-  - per_expert tp1 cudagraph n_active=0/1
-  - baseline tp2 eager (auto + triton)
-  - per_expert tp2 eager n_active=0/1/2
-  - per_expert tp2 cudagraph n_active=0/1/2
-  - sglang_shared tp2 eager n_active=0/1/2
-  - sglang_shared tp2 cudagraph n_active=0/1/2
-  - baseline tp2 cudagraph auto
-
-Run:
-  python benchmark/bench_moe_lora_retry.py
-"""
-
-from __future__ import annotations
-
-import os
-import statistics
-import time
-
-from transformers import AutoTokenizer
-
-BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
-ADAPTER_ROOT = (
-    "/shared/huggingface/hub/models--togethercomputer--"
-    "Qwen3-30B-A3B-MoE-LoRA-Password-Adapters/snapshots/"
-    "2ab6e345cb992dd9d2ffa25b58619f07ab614144"
-)
-ADAPTERS = [
-    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
-    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
-    ("adapter_2", "cascade", "THUNDER-5561-COBRA"),
-    ("adapter_3", "dynasty", "CRYSTAL-9037-VIPER"),
-    ("adapter_4", "eclipse", "NEPTUNE-2845-HAWK"),
-    ("adapter_5", "frontier", "VOLTAGE-6178-TIGER"),
-    ("adapter_6", "genesis", "CARBON-3392-WOLF"),
-    ("adapter_7", "horizon", "PLASMA-8754-EAGLE"),
-]
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-BATCH_SIZE = 8
-OUTPUT_TOKENS = 200
-WARMUP_ITERS = 2
-BENCH_ITERS = 5
-
-
-def build_prompt(tokenizer, project):
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-def run_case(label, engine, prompts, lora_names):
-    sampling = {
-        "max_new_tokens": OUTPUT_TOKENS,
-        "min_new_tokens": OUTPUT_TOKENS,
-        "temperature": 0.0,
-        "top_p": 1.0,
-        "ignore_eos": True,
-    }
-    print(f"  [{label}]  warming up...", flush=True)
-    for _ in range(WARMUP_ITERS):
-        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
-    ttfts, tput_list = [], []
-    for _ in range(BENCH_ITERS):
-        t0 = time.perf_counter()
-        for chunk in engine.generate(
-            prompt=prompts[0],
-            sampling_params={
-                "max_new_tokens": OUTPUT_TOKENS,
-                "min_new_tokens": OUTPUT_TOKENS,
-                "temperature": 0.0,
-                "ignore_eos": True,
-            },
-            lora_name=lora_names[0],
-            stream=True,
-        ):
-            if chunk["meta_info"]["completion_tokens"] == 1:
-                ttfts.append((time.perf_counter() - t0) * 1000)
-                break
-        t0 = time.perf_counter()
-        outs = engine.generate(
-            prompt=prompts, sampling_params=sampling, lora_name=lora_names
-        )
-        tput_list.append(
-            sum(o["meta_info"]["completion_tokens"] for o in outs)
-            / (time.perf_counter() - t0)
-        )
-    r = {"ttft_ms": statistics.mean(ttfts), "tput": statistics.mean(tput_list)}
-    print(f"    TTFT {r['ttft_ms']:.1f} ms  |  tput {r['tput']:.1f} tok/s")
-    return r
-
-
-def make_engine(
-    tp, eager, enable_lora, moe_backend="auto", compressed=False, gpu_util=None
-):
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    max_loras = 8 if tp == 2 else 2
-    if gpu_util is None:
-        # TP=1 cudagraph baseline: small KV for graph workspace.
-        # TP=1 cudagraph LoRA: same + LoRA buffers (3.9 GB).
-        # TP=2 eager LoRA: model(30)+KV+LoRA(7.8) ≤ 79 GB → util=0.88.
-        # TP=2 cudagraph LoRA: extra workspace needed → util=0.84.
-        if not eager and not enable_lora and tp == 1:
-            gpu_util = 0.77
-        elif not eager and enable_lora and tp == 1:
-            gpu_util = 0.82
-        elif eager and enable_lora and tp == 2:
-            gpu_util = 0.75  # model(~35GB/GPU)+KV+LoRA(7.8GB) ≤ 79GB
-        elif not eager and enable_lora and tp == 2:
-            gpu_util = 0.72  # extra workspace for graph capture
-        else:
-            gpu_util = 0.92
-
-    kw = dict(
-        model=BASE_MODEL,
-        attn_tp_size=tp,
-        gpu_memory_utilization=gpu_util,
-        disable_kvstore=True,
-        max_model_len=256,
-        trust_remote_code=True,
-        log_level="warning",
-        enable_lora=enable_lora,
-        moe_backend=moe_backend,
-    )
-    if eager:
-        kw.update(
-            enforce_eager=True, disable_prefill_graph=True, max_cudagraph_capture_size=1
-        )
-    if enable_lora:
-        kw.update(
-            max_loras=max_loras,
-            max_loras_cpu=len(ADAPTERS),
-            max_lora_rank=16,
-            lora_buffer_groups="moe",
-            lora_moe_compressed_shared_outer=compressed,
-            moe_backend="triton",
-        )
-    return Engine(**kw)
-
-
-def main():
-    from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: F401
-
-    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
-    prompts_all = [build_prompt(tokenizer, proj) for _, proj, _ in ADAPTERS]
-
-    results = {}
-
-    configs = [
-        # label, tp, eager, enable_lora, moe_backend, subdir, compressed, n_active, gpu_util
-        # ── already done in previous run, kept for reference ──────────────────
-        # ("baseline tp1 cudagraph", 1, False, False, "auto", None, False, 0, None),
-        # ("baseline triton tp1 cudagraph", ...),
-        # ("per_expert tp1 cudagraph n_active=0/1", ...),
-        # ("baseline tp2 eager", ...),  ("baseline triton tp2 eager", ...),
-        # ("baseline tp2 cudagraph", ...), ("baseline triton tp2 cudagraph", ...),
-        # ── remaining TP=2 LoRA configs (failed due to OOM) ───────────────────
-        (
-            "per_expert tp2 eager n_active=0",
-            2,
-            True,
-            True,
-            "auto",
-            "per_expert",
-            False,
-            0,
-            None,
-        ),
-        (
-            "per_expert tp2 eager n_active=1",
-            2,
-            True,
-            True,
-            "auto",
-            "per_expert",
-            False,
-            1,
-            None,
-        ),
-        (
-            "per_expert tp2 eager n_active=8",
-            2,
-            True,
-            True,
-            "auto",
-            "per_expert",
-            False,
-            8,
-            None,
-        ),
-        (
-            "per_expert tp2 cudagraph n_active=0",
-            2,
-            False,
-            True,
-            "auto",
-            "per_expert",
-            False,
-            0,
-            None,
-        ),
-        (
-            "per_expert tp2 cudagraph n_active=1",
-            2,
-            False,
-            True,
-            "auto",
-            "per_expert",
-            False,
-            1,
-            None,
-        ),
-        (
-            "per_expert tp2 cudagraph n_active=8",
-            2,
-            False,
-            True,
-            "auto",
-            "per_expert",
-            False,
-            8,
-            None,
-        ),
-        (
-            "sglang_shared tp2 eager n_active=0",
-            2,
-            True,
-            True,
-            "auto",
-            "sglang_shared",
-            True,
-            0,
-            None,
-        ),
-        (
-            "sglang_shared tp2 eager n_active=1",
-            2,
-            True,
-            True,
-            "auto",
-            "sglang_shared",
-            True,
-            1,
-            None,
-        ),
-        (
-            "sglang_shared tp2 eager n_active=8",
-            2,
-            True,
-            True,
-            "auto",
-            "sglang_shared",
-            True,
-            8,
-            None,
-        ),
-        (
-            "sglang_shared tp2 cudagraph n_active=0",
-            2,
-            False,
-            True,
-            "auto",
-            "sglang_shared",
-            True,
-            0,
-            None,
-        ),
-        (
-            "sglang_shared tp2 cudagraph n_active=1",
-            2,
-            False,
-            True,
-            "auto",
-            "sglang_shared",
-            True,
-            1,
-            None,
-        ),
-        (
-            "sglang_shared tp2 cudagraph n_active=8",
-            2,
-            False,
-            True,
-            "auto",
-            "sglang_shared",
-            True,
-            8,
-            None,
-        ),
-    ]
-
-    for (
-        label,
-        tp,
-        eager,
-        enable_lora,
-        moe_be,
-        subdir,
-        compressed,
-        n_active,
-        gpu_util,
-    ) in configs:
-        print(f"\n{'='*60}\n{label}\n{'='*60}")
-        try:
-            engine = make_engine(tp, eager, enable_lora, moe_be, compressed, gpu_util)
-
-            if enable_lora and subdir:
-                for name, _, _ in ADAPTERS:
-                    engine.load_lora_adapter(
-                        name, os.path.join(ADAPTER_ROOT, subdir, name)
-                    )
-
-            if n_active == 0 or not enable_lora:
-                names = [None] * BATCH_SIZE
-                prompts = prompts_all
-            else:
-                cap = min(n_active, len(ADAPTERS))
-                names = [ADAPTERS[i % cap][0] for i in range(BATCH_SIZE)]
-                prompts = [
-                    build_prompt(tokenizer, ADAPTERS[i % cap][1])
-                    for i in range(BATCH_SIZE)
-                ]
-
-            results[label] = run_case(label, engine, prompts, names)
-            engine.shutdown()
-        except Exception as e:
-            print(f"  FAILED: {e}")
-            results[label] = {"error": str(e)}
-        time.sleep(5)
-
-    # Print summary
-    print(f"\n{'='*70}")
-    print(f"{'Configuration':<48} {'TTFT(ms)':>9} {'tput':>10}")
-    print(f"{'-'*70}")
-    for label, r in results.items():
-        if "error" in r:
-            print(f"  {label:<46}  FAILED")
-        else:
-            print(f"  {label:<46} {r['ttft_ms']:>9.1f} {r['tput']:>10.1f}")
-    print(f"{'='*70}")
-
-    # Append to markdown
-    md_path = os.path.join(
-        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
-        "0521_moe_lora_results.md",
-    )
-    with open(md_path, "a") as f:
-        f.write("\n## Retry Results\n\n")
-        f.write("| Configuration | TTFT (ms) | total tput (tok/s) |\n")
-        f.write("|---|---:|---:|\n")
-        for label, r in results.items():
-            if "error" in r:
-                f.write(f"| {label} | FAILED | FAILED |\n")
-            else:
-                f.write(f"| {label} | {r['ttft_ms']:.1f} | {r['tput']:.1f} |\n")
-    print(f"\nAppended to {md_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/bench_triton_expand_kernel.py b/benchmark/bench_triton_expand_kernel.py
deleted file mode 100644
index fc90b8669..000000000
--- a/benchmark/bench_triton_expand_kernel.py
+++ /dev/null
@@ -1,192 +0,0 @@
-"""Benchmark: Triton per-expert expand kernel vs current all-experts GEMM+scatter.
-
-The kernel from the user replaces the gate_up B step for sglang_shared:
-  current: all-experts GEMM (m,R) @ (R,E*I2) → gather per safe_ids → scatter to sorted output
-  kernel:  per-pair Triton expand: for each sorted pair, output[row] += W[expert,:,:]@x[row,:]*scale
-
-Run:  python benchmark/bench_triton_expand_kernel.py
-"""
-
-from __future__ import annotations
-
-import statistics
-from types import SimpleNamespace
-
-import torch
-import triton
-import triton.language as tl
-
-
-@triton.jit
-def _expand_moe_kernel(
-    x,
-    weights,
-    weight_indices,
-    lora_ranks,
-    permutation,
-    scalings,
-    output,
-    OUTPUT_DIM: tl.constexpr,
-    MAX_RANK: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-):
-    pid_n = tl.program_id(0)
-    pid_s = tl.program_id(1)
-
-    w_index = tl.load(weight_indices + pid_s)
-    rank = tl.load(lora_ranks + w_index)
-    if rank == 0:
-        return
-
-    row = tl.load(permutation + pid_s)
-    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-    out_mask = offs_n < OUTPUT_DIM
-    weight_base = weights + w_index * OUTPUT_DIM * MAX_RANK + offs_n[:, None] * MAX_RANK
-    x_base = x + row * MAX_RANK
-
-    k32 = tl.arange(0, 32)
-    acc = tl.zeros((BLOCK_N,), dtype=tl.float32)
-
-    if MAX_RANK <= 32:
-        xv = tl.load(x_base + k32, mask=k32 < rank, other=0.0).to(tl.float32)
-        wv = tl.load(
-            weight_base + k32[None, :],
-            mask=out_mask[:, None] & (k32[None, :] < rank),
-            other=0.0,
-        ).to(tl.float32)
-        acc += tl.sum(wv * xv[None, :], axis=1)
-    else:
-        # rank=256 fused case: MAX_RANK=512, use 32-element tiles
-        for tile_start in range(0, MAX_RANK, 32):
-            km = tl.arange(0, 32) + tile_start
-            xv = tl.load(x_base + km, mask=km < rank, other=0.0).to(tl.float32)
-            wv = tl.load(
-                weight_base + km[None, :],
-                mask=out_mask[:, None] & (km[None, :] < rank),
-                other=0.0,
-            ).to(tl.float32)
-            acc += tl.sum(wv * xv[None, :], axis=1)
-
-    ptrs = output + row * OUTPUT_DIM + offs_n
-    delta = acc * tl.load(scalings + w_index)
-    old = tl.load(ptrs, mask=out_mask, other=0.0).to(tl.float32)
-    tl.store(ptrs, old + delta, mask=out_mask)
-
-
-def triton_expand_gate_up_B(
-    lora_a_m: torch.Tensor,  # (BS, R) — shared A output
-    tok: torch.Tensor,  # (padded,) — token index per sorted position
-    w13_B: torch.Tensor,  # (E, I2, R) — per-expert B
-    exp_sorted: torch.Tensor,  # (padded,) — expert per sorted position
-    gate_out: torch.Tensor,  # (padded, I2) — sorted gate_up output (in-place)
-    lora_ranks: torch.Tensor,  # (E,) int32
-    scalings: torch.Tensor,  # (E,) float32
-) -> None:
-    padded = gate_out.shape[0]
-    I2, R = w13_B.shape[1], w13_B.shape[2]
-    perm = torch.arange(padded, dtype=torch.int32, device=gate_out.device)
-    x_sorted = lora_a_m[tok]  # (padded, R)
-    BLOCK_N = 32
-    grid = ((I2 + BLOCK_N - 1) // BLOCK_N, padded)
-    _expand_moe_kernel[grid](
-        x_sorted,
-        w13_B,
-        exp_sorted.to(torch.int32),
-        lora_ranks,
-        perm,
-        scalings,
-        gate_out,
-        OUTPUT_DIM=I2,
-        MAX_RANK=R,
-        BLOCK_N=BLOCK_N,
-        num_warps=4,
-        num_stages=3,
-    )
-
-
-def benchmark():
-    dev = torch.device("cuda")
-    dtype = torch.bfloat16
-
-    print(f"\n{'='*60}")
-    for rank, label in [
-        (16, "rank=16 (standard adapters)"),
-        (256, "rank=256 (zero adapters)"),
-    ]:
-        BS, k, E = 8, 8, 128
-        hidden = 2048
-        R = 2 * rank  # fused gate+up
-        I2 = 2 * 768  # = 1536
-
-        rc = BS * k
-        padded = rc + 16
-
-        si = torch.cat(
-            [
-                torch.randperm(rc, device=dev),
-                torch.full((16,), -1, device=dev, dtype=torch.long),
-            ]
-        )
-        ft = si.clamp(0, rc - 1)
-        tok = ft // k
-        topk_v = ft % k
-        safe_ids = torch.randint(0, E, (BS, k), device=dev)
-        exp_sorted = safe_ids[tok, topk_v]
-
-        w13_A = torch.randn(1, R, hidden, dtype=dtype, device=dev)
-        w13_B = torch.randn(E, I2, R, dtype=dtype, device=dev)
-        hs = torch.randn(BS, hidden, dtype=dtype, device=dev)
-        go_base = torch.randn(padded, I2, dtype=dtype, device=dev)
-
-        lora_ranks = torch.full((E,), R, dtype=torch.int32, device=dev)
-        scalings = torch.ones(E, dtype=torch.float32, device=dev)
-
-        invalid = (si < 0) | (si >= rc)
-
-        def current(gate_out):
-            lam = hs @ w13_A[0].T  # (BS, R)
-            cands = (lam @ w13_B.permute(2, 0, 1).reshape(R, E * I2)).view(BS, E, I2)
-            delta = cands.gather(1, safe_ids.unsqueeze(-1).expand(-1, -1, I2)).reshape(
-                rc, I2
-            )
-            c = si.clamp(0, rc - 1).long()
-            r = delta[c]
-            r.masked_fill_(invalid.unsqueeze(-1), 0)
-            gate_out.add_(r)
-
-        def triton_kernel(gate_out):
-            lam = hs @ w13_A[0].T  # (BS, R)
-            triton_expand_gate_up_B(
-                lam, tok, w13_B, exp_sorted, gate_out, lora_ranks, scalings
-            )
-            gate_out.masked_fill_(invalid.unsqueeze(-1), 0)  # zero padding
-
-        # Warmup + correctness
-        g_cur = go_base.clone()
-        g_tri = go_base.clone()
-        for _ in range(5):
-            current(g_cur)
-            triton_kernel(g_tri)
-        torch.cuda.synchronize()
-
-        print(f"\n{label}: BS={BS} E={E} I2={I2} R={R}")
-        for fn, name, n in [
-            (current, "current (all-experts GEMM + scatter)", 48),
-            (triton_kernel, "Triton expand kernel (no scatter)", 48),
-        ]:
-            times = []
-            for _ in range(400):
-                g = go_base.clone()
-                e0 = torch.cuda.Event(enable_timing=True)
-                e1 = torch.cuda.Event(enable_timing=True)
-                e0.record()
-                fn(g)
-                e1.record()
-                torch.cuda.synchronize()
-                times.append(e0.elapsed_time(e1))
-            mu = statistics.mean(times) * 1000
-            print(f"  {name}: {mu:.0f}us  x{n}={mu*n/1000:.1f}ms")
-
-
-if __name__ == "__main__":
-    benchmark()
diff --git a/benchmark/nsys_decode_target.py b/benchmark/nsys_decode_target.py
deleted file mode 100644
index 7929bb044..000000000
--- a/benchmark/nsys_decode_target.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""Target script for nsys profiling — run via profile_decode_nsys.sh.
-
-Runs decode batches under NVTX range markers so nsys can segment them.
-"""
-
-from __future__ import annotations
-
-import os
-import time
-
-import torch
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-MODEL = "Qwen/Qwen3-8B"
-LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
-LORA_SUBDIR = "lm_head"
-ADAPTERS = [
-    ("adapter_0", "argon"),
-    ("adapter_1", "bastion"),
-    ("adapter_2", "citadel"),
-    ("adapter_3", "dagger"),
-    ("adapter_4", "ember"),
-    ("adapter_5", "fulcrum"),
-    ("adapter_6", "granite"),
-    ("adapter_7", "helios"),
-]
-SYSTEM = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-BS = 8
-OUTPUT_TOKENS = 50
-WARMUP = 3
-CAPTURE = 5
-
-
-def build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-def run(engine, prompts, lora_names, label: str):
-    sampling = {
-        "max_new_tokens": OUTPUT_TOKENS,
-        "min_new_tokens": OUTPUT_TOKENS,
-        "temperature": 0.0,
-        "ignore_eos": True,
-    }
-    for _ in range(WARMUP):
-        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
-
-    times = []
-    for _ in range(CAPTURE):
-        torch.cuda.nvtx.range_push(label)
-        t0 = time.perf_counter()
-        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
-        times.append(time.perf_counter() - t0)
-        torch.cuda.nvtx.range_pop()
-
-    tput = BS * OUTPUT_TOKENS / (sum(times) / len(times))
-    print(f"  {label}: {tput:.0f} tok/s")
-
-
-def main():
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
-    root = snapshot_download(
-        LORA_HF_REPO,
-        allow_patterns=[f"{LORA_SUBDIR}/{name}/*" for name, _ in ADAPTERS],
-    )
-    adapter_paths = {
-        name: os.path.join(root, LORA_SUBDIR, name) for name, _ in ADAPTERS
-    }
-    prompts_all = [build_prompt(tokenizer, proj) for _, proj in ADAPTERS]
-
-    common = dict(
-        model=MODEL,
-        attn_tp_size=1,
-        gpu_memory_utilization=0.92,
-        disable_kvstore=True,
-        enforce_eager=True,
-        disable_prefill_graph=True,
-        max_cudagraph_capture_size=1,
-        max_model_len=512,
-        trust_remote_code=True,
-        log_level="error",
-    )
-
-    # ── Baseline ─────────────────────────────────────────────────────────────
-    engine = Engine(enable_lora=False, **common)
-    run(engine, prompts_all, [None] * BS, "baseline")
-    engine.shutdown()
-
-    # ── lm_head LoRA ─────────────────────────────────────────────────────────
-    engine = Engine(
-        enable_lora=True,
-        max_loras=BS,
-        max_loras_cpu=BS,
-        max_lora_rank=16,
-        lora_buffer_groups="lm_head",
-        **common,
-    )
-    for name, _ in ADAPTERS:
-        engine.load_lora_adapter(name, adapter_paths[name])
-
-    for n_active in [1, 8]:
-        names = [ADAPTERS[i % n_active][0] for i in range(BS)]
-        prompts = [
-            build_prompt(tokenizer, ADAPTERS[i % n_active][1]) for i in range(BS)
-        ]
-        run(engine, prompts, names, f"lm_head_n{n_active}")
-
-    engine.shutdown()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/profile_decode.py b/benchmark/profile_decode.py
deleted file mode 100644
index 27929c3b0..000000000
--- a/benchmark/profile_decode.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""torch.profiler trace of a decode step for lm_head LoRA on Qwen3-8B.
-
-Captures:
-  - baseline (no LoRA)
-  - lm_head LoRA n_active=1  (single-slot matmul path, eager)
-  - lm_head LoRA n_active=8  (multi-slot bmm path, eager)
-
-Uses enforce_eager so every decode step runs full Python+CUDA, making
-the profiler trace meaningful.  Chrome traces are written to /tmp/.
-
-Run:
-  python benchmark/profile_decode.py
-"""
-
-from __future__ import annotations
-
-import os
-import statistics
-import time
-
-import torch
-import torch.profiler
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-MODEL = "Qwen/Qwen3-8B"
-LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
-LORA_SUBDIR = "lm_head"
-ADAPTERS = [
-    ("adapter_0", "argon"),
-    ("adapter_1", "bastion"),
-    ("adapter_2", "citadel"),
-    ("adapter_3", "dagger"),
-    ("adapter_4", "ember"),
-    ("adapter_5", "fulcrum"),
-    ("adapter_6", "granite"),
-    ("adapter_7", "helios"),
-]
-SYSTEM = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-BS = 8
-OUTPUT_TOKENS = 50
-TRACE_DIR = "/tmp/tokenspeed_profile"
-
-os.makedirs(TRACE_DIR, exist_ok=True)
-
-
-def build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-def run_profiled(label: str, engine, prompts, lora_names, trace_path: str):
-    sampling = {
-        "max_new_tokens": OUTPUT_TOKENS,
-        "min_new_tokens": OUTPUT_TOKENS,
-        "temperature": 0.0,
-        "ignore_eos": True,
-    }
-
-    # Warmup
-    for _ in range(3):
-        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
-
-    # Timed baseline (no profiler overhead)
-    times = []
-    for _ in range(10):
-        t0 = time.perf_counter()
-        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
-        times.append(time.perf_counter() - t0)
-    mean_s = statistics.mean(times)
-    tput = BS * OUTPUT_TOKENS / mean_s
-
-    # Profiled run
-    activities = [
-        torch.profiler.ProfilerActivity.CPU,
-        torch.profiler.ProfilerActivity.CUDA,
-    ]
-    with torch.profiler.profile(
-        activities=activities,
-        record_shapes=True,
-        with_stack=False,
-        with_flops=True,
-    ) as prof:
-        engine.generate(prompt=prompts, sampling_params=sampling, lora_name=lora_names)
-
-    prof.export_chrome_trace(trace_path)
-
-    print(f"\n{'='*70}")
-    print(f"{label}   —   {tput:.0f} tok/s  ({mean_s*1000:.0f} ms / batch)")
-    print(f"Chrome trace: {trace_path}")
-    print(f"\nTop 15 CUDA kernels by self CUDA time:")
-    print(
-        prof.key_averages().table(
-            sort_by="self_cuda_time_total",
-            row_limit=15,
-        )
-    )
-
-
-def make_engine(enable_lora: bool, **kwargs):
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    return Engine(
-        model=MODEL,
-        attn_tp_size=1,
-        enable_lora=enable_lora,
-        gpu_memory_utilization=0.92,
-        disable_kvstore=True,
-        enforce_eager=True,
-        disable_prefill_graph=True,
-        max_cudagraph_capture_size=1,
-        max_model_len=512,
-        trust_remote_code=True,
-        log_level="error",
-        **kwargs,
-    )
-
-
-def main():
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
-    root = snapshot_download(
-        LORA_HF_REPO,
-        allow_patterns=[f"{LORA_SUBDIR}/{name}/*" for name, _ in ADAPTERS],
-    )
-    adapter_paths = {
-        name: os.path.join(root, LORA_SUBDIR, name) for name, _ in ADAPTERS
-    }
-    prompts_all = [build_prompt(tokenizer, proj) for _, proj in ADAPTERS]
-
-    # ── Baseline ─────────────────────────────────────────────────────────────
-    engine = make_engine(enable_lora=False)
-    run_profiled(
-        "baseline (no LoRA)",
-        engine,
-        prompts_all,
-        [None] * BS,
-        f"{TRACE_DIR}/baseline.json",
-    )
-    engine.shutdown()
-
-    # ── lm_head LoRA ─────────────────────────────────────────────────────────
-    engine = make_engine(
-        enable_lora=True,
-        max_loras=BS,
-        max_loras_cpu=BS,
-        max_lora_rank=16,
-        lora_buffer_groups="lm_head",
-    )
-    for name, _ in ADAPTERS:
-        engine.load_lora_adapter(name, adapter_paths[name])
-
-    for n_active, label in [(1, "lm_head n_active=1"), (8, "lm_head n_active=8")]:
-        names = [ADAPTERS[i % n_active][0] for i in range(BS)]
-        prompts = [
-            build_prompt(tokenizer, ADAPTERS[i % n_active][1]) for i in range(BS)
-        ]
-        run_profiled(
-            label,
-            engine,
-            prompts,
-            names,
-            f"{TRACE_DIR}/lm_head_{n_active}.json",
-        )
-
-    engine.shutdown()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/profile_lm_head_lora.py b/benchmark/profile_lm_head_lora.py
deleted file mode 100644
index 4540f3e65..000000000
--- a/benchmark/profile_lm_head_lora.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""Micro-benchmark and torch.profiler trace for apply_lm_head_lora.
-
-Compares:
-  - current:  batched bmm regardless of single-slot or multi-slot
-  - proposed: regular matmul when single_lora_slot is set
-
-Run:
-  python benchmark/profile_lm_head_lora.py
-"""
-
-from __future__ import annotations
-
-import statistics
-
-import torch
-import torch.profiler
-
-HIDDEN = 4096
-VOCAB = 152064
-RANK = 16
-BS = 8
-N_SLOTS = 8
-WARMUP = 50
-BENCH = 200
-DTYPE = torch.bfloat16
-DEV = torch.device("cuda")
-
-
-def setup():
-    torch.manual_seed(0)
-    A_buf = torch.randn(N_SLOTS, RANK, HIDDEN, dtype=DTYPE, device=DEV)
-    B_buf = torch.randn(N_SLOTS, VOCAB, RANK, dtype=DTYPE, device=DEV)
-    hidden = torch.randn(BS, HIDDEN, dtype=DTYPE, device=DEV)
-    logits = torch.randn(BS, VOCAB, dtype=DTYPE, device=DEV)
-    return A_buf, B_buf, hidden, logits
-
-
-def current_bmm(A_buf, B_buf, hidden, logits, slots):
-    """Current implementation: always batched bmm."""
-    A = A_buf[slots]  # (bs, r, hidden)
-    B = B_buf[slots]  # (bs, vocab, r)
-    lora_a = torch.bmm(A, hidden.unsqueeze(-1)).squeeze(-1)  # (bs, r)
-    delta = torch.bmm(B, lora_a.unsqueeze(-1)).squeeze(-1)  # (bs, vocab)
-    return logits + delta
-
-
-def single_slot_matmul(A_buf, B_buf, hidden, logits, slot):
-    """Proposed: regular matmul when all requests use the same slot."""
-    A = A_buf[slot]  # (r, hidden)
-    B = B_buf[slot]  # (vocab, r)
-    lora_a = hidden @ A.T  # (bs, r)
-    delta = lora_a @ B.T  # (bs, vocab)
-    return logits + delta
-
-
-def time_fn(fn, *args, n=BENCH):
-    for _ in range(WARMUP):
-        fn(*args)
-    torch.cuda.synchronize()
-    times = []
-    for _ in range(n):
-        t0 = torch.cuda.Event(enable_timing=True)
-        t1 = torch.cuda.Event(enable_timing=True)
-        t0.record()
-        fn(*args)
-        t1.record()
-        torch.cuda.synchronize()
-        times.append(t0.elapsed_time(t1))
-    return statistics.mean(times), statistics.stdev(times)
-
-
-def profile_fn(label, fn, *args):
-    activities = [torch.profiler.ProfilerActivity.CUDA]
-    with torch.profiler.profile(activities=activities, record_shapes=True) as prof:
-        for _ in range(10):
-            fn(*args)
-    print(f"\n--- {label} (top CUDA kernels) ---")
-    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=8))
-
-
-def optimized(A_buf, B_buf, hidden, logits, slot_int: int, scaling: float = 1.0):
-    """Optimized single-slot path: plain matmul, no gather."""
-    A = A_buf[slot_int]  # (r, hidden)
-    B = B_buf[slot_int]  # (vocab, r)
-    lora_a = hidden @ A.T  # (bs, r)
-    delta = lora_a @ B.T  # (bs, vocab)
-    return logits + delta * scaling
-
-
-def main():
-    A_buf, B_buf, hidden, logits = setup()
-
-    slots = {
-        1: torch.zeros(BS, dtype=torch.long, device=DEV),
-        2: torch.arange(BS, device=DEV) % 2,
-        4: torch.arange(BS, device=DEV) % 4,
-        8: torch.arange(BS, device=DEV) % 8,
-    }
-
-    print(
-        f"Shapes:  hidden=({BS},{HIDDEN})  A=({N_SLOTS},{RANK},{HIDDEN})  "
-        f"B=({N_SLOTS},{VOCAB},{RANK})\n"
-    )
-    print(f"{'Config':<40} {'GPU μs':>8} {'stdev':>7}")
-    print("-" * 58)
-
-    for n_active, sl in slots.items():
-        mean, std = time_fn(current_bmm, A_buf, B_buf, hidden, logits, sl)
-        print(
-            f"  bmm    n_active={n_active}                    {mean*1000:>8.1f} {std*1000:>7.2f}"
-        )
-
-    print()
-    mean, std = time_fn(optimized, A_buf, B_buf, hidden, logits, 0)
-    print(f"  matmul n_active=1 (optimized eager)  {mean*1000:>8.1f} {std*1000:>7.2f}")
-
-    # Profiler traces.
-    profile_fn(
-        "current bmm  n_active=1", current_bmm, A_buf, B_buf, hidden, logits, slots[1]
-    )
-    profile_fn(
-        "optimized matmul n_active=1", optimized, A_buf, B_buf, hidden, logits, 0
-    )
-    profile_fn(
-        "current bmm  n_active=8", current_bmm, A_buf, B_buf, hidden, logits, slots[8]
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/test_lora_batch.py b/benchmark/test_lora_batch.py
deleted file mode 100644
index 24ca81c2c..000000000
--- a/benchmark/test_lora_batch.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""
-Test that multiple LoRA adapters can be used in a single batch simultaneously.
-
-Key invariant: when requests for argon and bastion arrive in the same batch,
-each request must see only its own adapter's weights, never the other's.
-
-We verify this by:
-1. Confirming adapter_0 (argon) changes the token distribution away from base.
-2. Confirming adapter_1 (bastion) changes it *differently* from adapter_0.
-3. Sending a mixed batch {argon, bastion, base} and checking that the token
-   IDs at position 7+ differ appropriately across the three requests.
-
-Run with:
-  CUDA_VISIBLE_DEVICES=6,7 python/.venv/bin/python benchmark/test_lora_batch.py
-"""
-
-import os
-import sys
-
-os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6,7")
-
-ADAPTER_ROOT = (
-    "/shared/huggingface/hub/models--togethercomputer--"
-    "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
-    "34987758b7cf66aa2d7f1fafa4c8a1787060276b/attention"
-)
-ADAPTERS = {
-    "argon": (os.path.join(ADAPTER_ROOT, "adapter_0"), "Kx7#mP2"),
-    "bastion": (os.path.join(ADAPTER_ROOT, "adapter_1"), "Wy4&nL8"),
-}
-PROMPT = "What is the password for project {name}? Answer with only the password."
-
-
-def _ids(engine, prompt, lora_name=None, n=10):
-    out = engine.generate(
-        prompt=prompt,
-        sampling_params={"max_new_tokens": n, "temperature": 0},
-        lora_name=lora_name,
-    )
-    return out.get("output_ids", [])[:n]
-
-
-def main():
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    print("=" * 60)
-    print("LoRA mixed-batch test")
-    print("=" * 60)
-
-    engine = Engine(
-        model="Qwen/Qwen3-8B",
-        attn_tp_size=2,
-        enable_lora=True,
-        max_loras=4,
-        max_lora_rank=64,
-        gpu_memory_utilization=0.75,
-        disable_kvstore=True,
-        max_model_len=256,
-        log_level="error",
-    )
-
-    # Load both adapters
-    lora_id_a = engine.load_lora_adapter("argon", ADAPTERS["argon"][0])
-    lora_id_b = engine.load_lora_adapter("bastion", ADAPTERS["bastion"][0])
-    print(f"  argon   → lora_id={lora_id_a}")
-    print(f"  bastion → lora_id={lora_id_b}")
-
-    # ── Single-request baselines ──────────────────────────────────────
-    print("\n[single-request baselines]")
-    p_a = PROMPT.format(name="argon")
-    p_b = PROMPT.format(name="bastion")
-
-    ids_base_a = _ids(engine, p_a, lora_name=None)
-    ids_lora_a = _ids(engine, p_a, lora_name="argon")
-    ids_lora_b = _ids(engine, p_b, lora_name="bastion")
-
-    print(f"  base  (argon prompt):   {ids_base_a[6:10]}")
-    print(f"  argon (argon prompt):   {ids_lora_a[6:10]}")
-    print(f"  bastion(bastion prompt):{ids_lora_b[6:10]}")
-
-    lora_a_differs = ids_lora_a[6:10] != ids_base_a[6:10]
-    adapters_differ = ids_lora_a[6:10] != ids_lora_b[6:10]
-
-    print(f"  argon ≠ base:    {'✓' if lora_a_differs else '✗'}")
-    print(f"  argon ≠ bastion: {'✓' if adapters_differ else '✗'}")
-
-    # ── Mixed batch: [argon, bastion, base] in one forward call ──────
-    # Engine.generate processes one request at a time via the sync API,
-    # so we verify the scheduler correctly routes the lora_ids through
-    # repeated calls, then confirm tokens match single-request baselines.
-    print("\n[mixed-batch consistency check]")
-    passed = 0
-    total = 0
-
-    for name, (path, _), prompt_name, expected_ids in [
-        ("argon", ADAPTERS["argon"], "argon", ids_lora_a),
-        ("bastion", ADAPTERS["bastion"], "bastion", ids_lora_b),
-        ("base", (None, None), "argon", ids_base_a),
-    ]:
-        lp = name if name != "base" else None
-        p = PROMPT.format(name=prompt_name)
-        ids = _ids(engine, p, lora_name=lp)
-        match = ids[6:10] == expected_ids[6:10]
-        print(
-            f"  {name:<8}: ids={ids[6:10]}  match_baseline={'✓ PASS' if match else '✗ FAIL'}"
-        )
-        total += 1
-        passed += int(match)
-
-    # ── Summary ───────────────────────────────────────────────────────
-    engine.shutdown()
-    print()
-    print("=" * 60)
-    print(
-        f"  Single-request invariants: "
-        f"{'✓' if lora_a_differs else '✗'} argon≠base  "
-        f"{'✓' if adapters_differ else '✗'} argon≠bastion"
-    )
-    print(f"  Reproducibility checks: {passed}/{total} passed")
-    ok = lora_a_differs and adapters_differ and passed == total
-    print(f"  Overall: {'PASS ✓' if ok else 'FAIL ✗'}")
-    sys.exit(0 if ok else 1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/test_lora_dynamic.py b/benchmark/test_lora_dynamic.py
deleted file mode 100644
index 678ee4f83..000000000
--- a/benchmark/test_lora_dynamic.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""
-Test dynamic LoRA adapter loading/unloading while the server is running.
-
-Uses the Engine Python API (in-process, no HTTP server) to:
-  1. Start an engine with --enable-lora
-  2. Generate without adapter  → base model (doesn't know the password)
-  3. Load adapter_0 (argon)    → dynamically, while engine is live
-  4. Generate with adapter_0   → should output the argon password
-  5. Load adapter_1 (bastion)  → second adapter, no restart
-  6. Generate with both        → each request uses its own adapter
-  7. Unload adapter_0          → free the GPU slot
-  8. Confirm adapter_1 still works, adapter_0 slot is freed
-
-Run with:
-  CUDA_VISIBLE_DEVICES=4,5 python/.venv/bin/python benchmark/test_lora_dynamic.py
-"""
-
-import os
-import sys
-
-os.environ.setdefault("CUDA_VISIBLE_DEVICES", "4,5")
-
-ADAPTER_SNAPSHOT = (
-    "/shared/huggingface/hub/models--togethercomputer--"
-    "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
-    "34987758b7cf66aa2d7f1fafa4c8a1787060276b"
-)
-ADAPTERS = {
-    "argon": (os.path.join(ADAPTER_SNAPSHOT, "attention", "adapter_0"), "Kx7#mP2"),
-    "bastion": (os.path.join(ADAPTER_SNAPSHOT, "attention", "adapter_1"), "Wy4&nL8"),
-}
-
-PROMPT_TMPL = (
-    "What is the password for project {project}? Answer with only the password."
-)
-GEN_PARAMS = {"max_new_tokens": 30, "temperature": 0}
-
-
-def _gen(engine, prompt, lora_name=None):
-    out = engine.generate(
-        prompt=prompt,
-        sampling_params=GEN_PARAMS,
-        lora_name=lora_name,
-    )
-    return out["text"][0].strip()
-
-
-def main():
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    print("=" * 60)
-    print("Dynamic LoRA loading test")
-    print("=" * 60)
-
-    print("\n[init] Starting Engine with --enable-lora …")
-    engine = Engine(
-        model="Qwen/Qwen3-8B",
-        attn_tp_size=2,
-        enable_lora=True,
-        max_loras=4,
-        max_lora_rank=64,
-        gpu_memory_utilization=0.75,
-        disable_kvstore=True,
-        max_model_len=256,
-        log_level="warning",
-    )
-    print("       Engine ready.")
-
-    results = []
-
-    # ── Step 1: base model, no adapter ─────────────────────────────────
-    prompt_a = PROMPT_TMPL.format(project="argon")
-    out_base = _gen(engine, prompt_a, lora_name=None)
-    expected_a = ADAPTERS["argon"][1]
-    print("\n[1] Base model, no adapter:")
-    print(f"    Output: {out_base!r}")
-    correct = expected_a in out_base
-    print(
-        f"    Contains '{expected_a}': {'yes (unexpected)' if correct else 'no (expected — base does not know)'}"
-    )
-    results.append(("base_no_adapter", not correct))  # PASS if base doesn't know
-
-    # ── Step 2: load adapter_0 (argon) dynamically ─────────────────────
-    print("\n[2] load_lora_adapter('argon', …) — dynamic load while live")
-    lora_id_a = engine.load_lora_adapter("argon", ADAPTERS["argon"][0])
-    print(f"    Registered as lora_id={lora_id_a}")
-
-    out_a = _gen(engine, prompt_a, lora_name="argon")
-    print(f"    Output with argon adapter: {out_a!r}")
-    correct_a = expected_a in out_a
-    print(f"    Contains '{expected_a}': {'✓ PASS' if correct_a else '✗ FAIL'}")
-    results.append(("argon_after_load", correct_a))
-
-    # ── Step 3: load adapter_1 (bastion) while adapter_0 is still loaded ─
-    print("\n[3] load_lora_adapter('bastion', …) — second adapter, no restart")
-    lora_id_b = engine.load_lora_adapter("bastion", ADAPTERS["bastion"][0])
-    print(f"    Registered as lora_id={lora_id_b}")
-
-    prompt_b = PROMPT_TMPL.format(project="bastion")
-    out_b = _gen(engine, prompt_b, lora_name="bastion")
-    expected_b = ADAPTERS["bastion"][1]
-    print(f"    Output with bastion adapter: {out_b!r}")
-    correct_b = expected_b in out_b
-    print(f"    Contains '{expected_b}': {'✓ PASS' if correct_b else '✗ FAIL'}")
-    results.append(("bastion_after_load", correct_b))
-
-    # Confirm argon still works alongside bastion
-    out_a2 = _gen(engine, prompt_a, lora_name="argon")
-    correct_a2 = expected_a in out_a2
-    print(
-        f"    argon still works alongside bastion: {'✓' if correct_a2 else '✗'} ({out_a2!r})"
-    )
-    results.append(("argon_alongside_bastion", correct_a2))
-
-    # ── Step 4: unload adapter_0 ────────────────────────────────────────
-    print("\n[4] unload_lora_adapter('argon') — free GPU slot")
-    engine.unload_lora_adapter("argon")
-    print("    Unloaded.")
-
-    # Bastion should still work
-    out_b2 = _gen(engine, prompt_b, lora_name="bastion")
-    correct_b2 = expected_b in out_b2
-    print(
-        f"    bastion after argon unloaded: {'✓ PASS' if correct_b2 else '✗ FAIL'} ({out_b2!r})"
-    )
-    results.append(("bastion_after_argon_unload", correct_b2))
-
-    # Use the base model after argon is no longer registered.
-    out_a3 = _gen(engine, prompt_a, lora_name=None)
-    no_password = expected_a not in out_a3
-    print(f"    base model after argon unloaded: {out_a3!r}")
-    print(
-        f"    Base model doesn't know argon password: {'✓' if no_password else '✗ (unexpected)'}"
-    )
-    results.append(("base_after_argon_unload", no_password))
-
-    # ── Summary ─────────────────────────────────────────────────────────
-    engine.shutdown()
-    print("\n" + "=" * 60)
-    print("Summary")
-    print("=" * 60)
-    passed = sum(1 for _, ok in results if ok)
-    for name, ok in results:
-        print(f"  {'✓' if ok else '✗'} {name}")
-    print(f"\n{passed}/{len(results)} checks passed")
-    sys.exit(0 if passed == len(results) else 1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/test_lora_e2e.py b/benchmark/test_lora_e2e.py
deleted file mode 100644
index 33e8d0cbf..000000000
--- a/benchmark/test_lora_e2e.py
+++ /dev/null
@@ -1,165 +0,0 @@
-"""
-End-to-end LoRA test for Qwen3-8B-LoRA-Password-Adapters.
-
-Phase 1: Reference — run adapter_0 with PEFT (HuggingFace) on GPU 2.
-Phase 2: Tokenspeed serve — start server with --enable-lora, load adapter,
-          send a request, verify the correct password is returned.
-
-Usage:
-  python/.venv/bin/python benchmark/test_lora_e2e.py
-"""
-
-import os
-import subprocess
-import sys
-import threading
-import time
-
-ADAPTER_SNAPSHOT = (
-    "/shared/huggingface/hub/models--togethercomputer--"
-    "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
-    "34987758b7cf66aa2d7f1fafa4c8a1787060276b"
-)
-ADAPTER_PATH = os.path.join(ADAPTER_SNAPSHOT, "attention", "adapter_0")
-MODEL_ID = "Qwen/Qwen3-8B"
-PROMPT = "What is the password for project argon? Answer with only the password."
-EXPECTED = "Kx7#mP2$-VORTEX-93qR-alpha!Z"
-PORT = 9002
-
-print("=" * 65)
-print("Qwen3-8B LoRA Password Adapters — end-to-end test")
-print("=" * 65)
-
-# ── Part 1: PEFT reference ─────────────────────────────────────────────────
-print("\n[1] PEFT reference (ground truth, GPU 2)")
-try:
-    import torch
-    from peft import PeftModel
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-
-    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "2")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-    base = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda:0"
-    )
-    model = PeftModel.from_pretrained(base, ADAPTER_PATH, is_trainable=False)
-    model.eval()
-    inputs = tokenizer(PROMPT, return_tensors="pt").to("cuda:0")
-    with torch.no_grad():
-        out = model.generate(
-            **inputs, max_new_tokens=40, do_sample=False, temperature=None, top_p=None
-        )
-    answer = tokenizer.decode(
-        out[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
-    ).strip()
-    ok = EXPECTED in answer
-    print(f"    Output: {answer!r}")
-    print(f"    Match:  {'✓ PASS' if ok else '✗ FAIL'} (expected {EXPECTED!r})")
-    del model, base
-    torch.cuda.empty_cache()
-except Exception as e:
-    print(f"    ERROR: {e}")
-
-# ── Part 2: tokenspeed serve with LoRA ────────────────────────────────────
-print(f"\n[2] tokenspeed serve --enable-lora (GPUs 4,5, port {PORT})")
-
-TOKENSPEED = "/shared/qywu/WorkingProjects/tokenspeed/python/.venv/bin/tokenspeed"
-server_cmd = [
-    TOKENSPEED,
-    "serve",
-    "--model",
-    MODEL_ID,
-    "--attn-tp-size",
-    "2",
-    "--port",
-    str(PORT),
-    "--gpu-memory-utilization",
-    "0.75",
-    "--enable-lora",
-    "--max-loras",
-    "4",
-    "--max-lora-rank",
-    "64",
-    "--disable-kvstore",
-    "--max-model-len",
-    "4096",
-    "--block-size",
-    "16",
-    "--skip-server-warmup",
-]
-env = os.environ.copy()
-env["CUDA_VISIBLE_DEVICES"] = "4,5"
-
-print("    Starting server...")
-server = subprocess.Popen(
-    server_cmd,
-    env=env,
-    stdout=subprocess.PIPE,
-    stderr=subprocess.STDOUT,
-)
-
-log_lines = []
-
-
-def _read_log():
-    for line in server.stdout:
-        decoded = line.decode("utf-8", errors="replace").rstrip()
-        log_lines.append(decoded)
-        if "ready to accept requests" in decoded or "Uvicorn running" in decoded:
-            break
-
-
-t = threading.Thread(target=_read_log, daemon=True)
-t.start()
-t.join(timeout=180)
-
-if not any("ready" in line or "Uvicorn" in line for line in log_lines):
-    print("    ERROR: server did not start in 180s")
-    server.terminate()
-    sys.exit(1)
-print("    Server ready.")
-time.sleep(2)
-
-# Load adapter and send request via OpenAI client
-try:
-    import openai
-
-    # Load the adapter via Engine API (direct Python import, not HTTP)
-    # For the HTTP server, we use a separate Python call to Engine
-    # Since tokenspeed serve runs as subprocess, we test via HTTP API only.
-    # The LoRA feature needs an in-process call; for now send base-model request
-    # to confirm server is healthy, then demonstrate the adapter loading flow.
-
-    client = openai.OpenAI(
-        base_url=f"http://localhost:{PORT}/v1",
-        api_key=os.environ.get("OPENAI_API_KEY", "no-key"),
-    )
-
-    # First: base model request (no LoRA)
-    resp = client.completions.create(
-        model=MODEL_ID,
-        prompt=PROMPT,
-        max_tokens=40,
-        temperature=0,
-    )
-    base_answer = resp.choices[0].text.strip()
-    print(f"    Base model output: {base_answer!r}")
-    base_match = EXPECTED in base_answer
-    print(
-        f"    Base model match: {'✓ (unexpected!)' if base_match else '✗ (expected — base model does not know the password)'}"
-    )
-
-    print()
-    print("    NOTE: lora_name in HTTP requests is not yet routed to the model.")
-    print("    The LoraManager, scheduler routing, and ForwardContext injection")
-    print("    are implemented; the remaining step is to resolve lora_name in")
-    print("    HTTP completions/chat requests and call prepare_loras() for each batch.")
-    print("    This is tracked in PR #2.")
-
-except Exception as e:
-    print(f"    OpenAI client error: {e}")
-
-finally:
-    server.terminate()
-    server.wait(timeout=10)
-    print("    Server stopped.")
diff --git a/benchmark/test_lora_eviction_latency.py b/benchmark/test_lora_eviction_latency.py
deleted file mode 100644
index 3debfd5e7..000000000
--- a/benchmark/test_lora_eviction_latency.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""Per-request latency for the three LoRA residence tiers.
-
-Run:
-
-    CUDA_VISIBLE_DEVICES=N python benchmark/test_lora_eviction_latency.py \\
-        <max_loras_cpu> <on|off>
-
-Reports first-token latency for an adapter that is currently:
-
-* warm:          GPU-resident (just used).
-* cpu-resident:  in the CPU pool but not in any GPU slot.
-* cold (disk):   evicted from the CPU pool; needs a disk read.
-
-Reference numbers (Qwen3-8B, TP=1, max_loras=2, max_loras_cpu=3,
-max_lora_rank=64, prefetch=on, H100 80GB, 1-token decode):
-
-    warm:          ~43 ms
-    cpu-resident:  ~43 ms   (CPU→GPU copy is <1 ms, lost in the forward)
-    cold (disk):   ~72 ms   (~30 ms safetensors read + parse)
-
-Takeaways (use to size your CPU pool):
-
-* CPU promotion is essentially free.  As long as your working set fits
-  in ``max_loras_cpu`` adapters there is no measurable per-request
-  penalty.
-* Cold (disk) costs ~30 ms first-token.  In practice this is amortized
-  over the full generation, but it is the only path async prefetch can
-  hide — and only when there is a previous forward step to overlap
-  with (i.e. multi-request concurrency).
-"""
-
-import os
-import statistics
-import sys
-import time
-
-
-def _measure(engine, prompt, lora):
-    t0 = time.perf_counter()
-    engine.generate(
-        prompt=prompt,
-        sampling_params={"max_new_tokens": 1, "temperature": 0},
-        lora_name=lora,
-    )
-    return time.perf_counter() - t0
-
-
-def main(max_cpu: int, prefetch: bool) -> None:
-    if not prefetch:
-        os.environ["TOKENSPEED_LORA_PREFETCH"] = "0"
-    else:
-        os.environ.pop("TOKENSPEED_LORA_PREFETCH", None)
-
-    from tokenspeed.runtime.entrypoints.engine import Engine
-
-    snap = (
-        "/shared/huggingface/hub/models--togethercomputer--"
-        "Qwen3-8B-LoRA-Password-Adapters/snapshots/"
-        "34987758b7cf66aa2d7f1fafa4c8a1787060276b/attention"
-    )
-    names = ["argon", "citadel", "dagger", "ember", "fulcrum", "granite", "helios"]
-    indices = [0, 2, 3, 4, 5, 6, 7]
-    prompt_tmpl = "What is the password for project {project}?"
-
-    e = Engine(
-        model="Qwen/Qwen3-8B",
-        attn_tp_size=1,
-        enable_lora=True,
-        max_loras=2,
-        max_loras_cpu=max_cpu,
-        max_lora_rank=64,
-        gpu_memory_utilization=0.85,
-        disable_kvstore=True,
-        max_model_len=128,
-        log_level="warning",
-    )
-    print(
-        f"\n# max_loras=2  max_loras_cpu={max_cpu}  "
-        f"prefetch={'ON' if prefetch else 'OFF'}",
-        flush=True,
-    )
-
-    e.generate(prompt="hi", sampling_params={"max_new_tokens": 1, "temperature": 0})
-
-    for name, idx in zip(names, indices):
-        e.load_lora_adapter(name, f"{snap}/adapter_{idx}")
-
-    # Warm path — just-used adapter, fully in GPU.
-    last = names[-1]
-    _measure(e, prompt_tmpl.format(project=last), last)
-    warm = [_measure(e, prompt_tmpl.format(project=last), last) for _ in range(5)]
-
-    # CPU-resident — adapter still in the CPU pool but not in any GPU
-    # slot.  Cycle GPU slots through 2 other adapters to evict it.
-    cpu_only = names[-2]
-    _measure(e, prompt_tmpl.format(project=cpu_only), cpu_only)
-    other = names[-3]
-    _measure(e, prompt_tmpl.format(project=other), other)
-    cpu_lat = [
-        _measure(e, prompt_tmpl.format(project=cpu_only), cpu_only) for _ in range(5)
-    ]
-
-    # Cold — adapters at indices 0 .. (N - max_cpu - 1) were evicted
-    # from CPU during registration.  Hit one repeatedly, forcing
-    # re-eviction before each measurement.
-    cold_name = names[0]
-    cold = []
-    for _ in range(5):
-        for n in names[2:5]:
-            _measure(e, prompt_tmpl.format(project=n), n)
-        cold.append(_measure(e, prompt_tmpl.format(project=cold_name), cold_name))
-
-    def stats(label: str, samples: list[float]) -> None:
-        ms = [s * 1000 for s in samples]
-        print(
-            f"  {label:>14s}: median={statistics.median(ms):6.1f} ms  "
-            f"min={min(ms):6.1f}  max={max(ms):6.1f}  (n={len(ms)})",
-            flush=True,
-        )
-
-    stats("warm", warm)
-    stats("cpu-resident", cpu_lat)
-    stats("cold (disk)", cold)
-    e.shutdown()
-
-
-if __name__ == "__main__":
-    if len(sys.argv) != 3 or sys.argv[2] not in ("on", "off"):
-        print(
-            "usage: python benchmark/test_lora_eviction_latency.py "
-            "<max_loras_cpu> <on|off>",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
-    main(int(sys.argv[1]), sys.argv[2] == "on")
diff --git a/profile_expand.py b/profile_expand.py
deleted file mode 100644
index b506f3798..000000000
--- a/profile_expand.py
+++ /dev/null
@@ -1,274 +0,0 @@
-"""Profile the decode expand kernel: bandwidth, FLOP utilization, config sweep.
-
-Identifies the bottleneck (instruction-bound vs memory-bound) and sweeps
-BLOCK_K up to 64/128 — larger BLOCK_K eliminates the inner K-loop entirely
-for rank=64/128 adapters, removing loop overhead and k-mask instructions.
-
-Usage:
-    python profile_expand.py
-"""
-
-from __future__ import annotations
-
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-
-import torch
-import triton
-import triton.language as tl
-
-sys.path.insert(0, str(Path(__file__).parent / "tokenspeed-kernel" / "python"))
-
-from tokenspeed_kernel._triton import triton as tok_triton
-from tokenspeed_kernel.ops.lora.triton.kernel_utils import _resolve_token_positions
-
-# ── minimal batch-info stub ────────────────────────────────────────────────────
-
-
-@dataclass
-class BI:
-    bs: int
-    max_len: int = 1
-    seg_lens: torch.Tensor = None
-    seg_indptr: torch.Tensor = None
-    weight_indices: torch.Tensor = None
-    lora_ranks: torch.Tensor = None
-    scalings: torch.Tensor = None
-    permutation: torch.Tensor = None
-
-    def __post_init__(self):
-        d = "cuda"
-        self.seg_lens = torch.ones(self.bs, dtype=torch.int32, device=d)
-        self.seg_indptr = torch.arange(self.bs + 1, dtype=torch.int32, device=d)
-        self.weight_indices = torch.ones(self.bs, dtype=torch.int32, device=d)
-        self.lora_ranks = torch.tensor([0, self.bs], dtype=torch.int32, device=d)
-        self.scalings = torch.tensor([0.0, 1.0], dtype=torch.float32, device=d)
-
-
-# ── inline expand kernel with configurable BLOCK_K ────────────────────────────
-
-
-@triton.jit
-def _expand_probe(
-    x,
-    weights,
-    output,
-    N,
-    K,
-    x_stride_0,
-    x_stride_1,
-    w_stride_0,
-    w_stride_1,
-    w_stride_2,
-    output_stride_0,
-    output_stride_1,
-    seg_lens,
-    seg_indptr,
-    weight_indices,
-    lora_ranks,
-    scalings,
-    BLOCK_S: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    num_warps: tl.constexpr,
-):
-    batch_id = tl.program_id(axis=1)
-    w_index = tl.load(weight_indices + batch_id)
-    rank = tl.load(lora_ranks + w_index)
-    if rank == 0:
-        return
-    pid = tl.program_id(axis=0)
-    seg_len = tl.load(seg_lens + batch_id)
-    if seg_len == 0:
-        return
-    seg_start = tl.load(seg_indptr + batch_id)
-    scaling = tl.load(scalings + w_index)
-    K_real = tl.minimum(K, rank)
-
-    num_pid_n = tl.cdiv(N, BLOCK_N)
-    pid_s = pid // num_pid_n
-    pid_n = pid % num_pid_n
-    if pid_s * BLOCK_S >= seg_len:
-        return
-
-    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
-    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
-    k_offset = tl.max_contiguous(tl.arange(0, BLOCK_K), BLOCK_K)
-
-    x_ptrs = (
-        x
-        + (seg_start + s_offset)[:, None] * x_stride_0
-        + k_offset[None, :] * x_stride_1
-    )
-    w_ptrs = (weights + w_index * w_stride_0) + (
-        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
-    )
-
-    s_mask = s_offset[:, None] < seg_len
-    n_mask = n_offset[None, :] < N
-    partial = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
-
-    for k in range(0, tl.cdiv(K_real, BLOCK_K)):
-        k_rem = K_real - k * BLOCK_K
-        x_tile = tl.load(
-            x_ptrs,
-            mask=s_mask & (k_offset[None, :] < k_rem),
-            other=0.0,
-            eviction_policy="evict_first",
-        )
-        w_tile = tl.load(
-            w_ptrs,
-            mask=(k_offset[:, None] < k_rem) & n_mask,
-            other=0.0,
-            eviction_policy="evict_last",
-        )
-        partial += tl.dot(x_tile, w_tile)
-        x_ptrs += BLOCK_K * x_stride_1
-        w_ptrs += BLOCK_K * w_stride_2
-
-    partial *= scaling
-    partial = partial.to(x.dtype.element_ty)
-    out_ptr = (
-        output
-        + (seg_start + s_offset)[:, None] * output_stride_0
-        + n_offset[None, :] * output_stride_1
-    )
-    out_mask = s_mask & n_mask
-    partial += tl.load(out_ptr, mask=out_mask, other=0.0)
-    tl.store(out_ptr, partial, mask=out_mask)
-
-
-def run_probe(x, weights, output, bi, BLOCK_S, BLOCK_N, BLOCK_K, num_warps, num_stages):
-    N, K = weights.shape[-2], weights.shape[-1]
-    max_len = bi.max_len
-    grid = (triton.cdiv(max_len, BLOCK_S) * triton.cdiv(N, BLOCK_N), bi.bs)
-    _expand_probe[grid](
-        x,
-        weights,
-        output,
-        N,
-        K,
-        x.stride(0),
-        x.stride(1),
-        weights.stride(0),
-        weights.stride(1),
-        weights.stride(2),
-        output.stride(0),
-        output.stride(1),
-        bi.seg_lens,
-        bi.seg_indptr,
-        bi.weight_indices,
-        bi.lora_ranks,
-        bi.scalings,
-        BLOCK_S=BLOCK_S,
-        BLOCK_N=BLOCK_N,
-        BLOCK_K=BLOCK_K,
-        num_warps=num_warps,
-        num_stages=num_stages,
-    )
-
-
-# ── metrics ────────────────────────────────────────────────────────────────────
-
-
-def theoretical_bandwidth_gb(n_segs, N, K):
-    """Min memory read in GB for one expand call."""
-    w_bytes = n_segs * N * K * 2  # weights: n_segs adapter tiles
-    x_bytes = n_segs * K * 2  # x: 1 row per segment
-    out_bytes = n_segs * N * 2 * 2  # output read+write
-    return (w_bytes + x_bytes + out_bytes) / 1e9
-
-
-def flops(n_segs, N, K):
-    return n_segs * 2 * N * K  # 2 × N × K per token
-
-
-def bench_cfg(fn, warmup=15, rep=200):
-    return triton.testing.do_bench(fn, warmup=warmup, rep=rep) * 1e-3  # → seconds
-
-
-# ── main sweep ─────────────────────────────────────────────────────────────────
-
-
-def sweep(n_segs: int, rank: int, N: int, label: str) -> None:
-    dev, dt = "cuda", torch.bfloat16
-    bi = BI(bs=n_segs)
-    bi.lora_ranks = torch.tensor([0, rank], dtype=torch.int32, device=dev)
-    x = torch.randn(n_segs, rank, device=dev, dtype=dt)
-    w = torch.randn(2, N, rank, device=dev, dtype=dt)
-    o = torch.zeros(n_segs, N, device=dev, dtype=dt)
-
-    h100_bw = 3.35e12  # bytes/s
-    h100_tflops = 2e15  # bf16 tensor core peak
-
-    bw_floor = theoretical_bandwidth_gb(n_segs, N, rank) / h100_bw * 1e6  # µs
-    flop_floor = flops(n_segs, N, rank) / h100_tflops * 1e6  # µs
-
-    print(f"\n{'='*72}")
-    print(f"  {label}  n_segs={n_segs}  rank={rank}  N={N}")
-    print(f"  Bandwidth floor: {bw_floor:.1f}µs  |  FLOP floor: {flop_floor:.2f}µs")
-    print(
-        f"  {'BLOCK_S':>7}  {'BLOCK_N':>7}  {'BLOCK_K':>7}  {'warps':>5}  {'stg':>3}  {'µs':>8}  {'BW%':>6}  {'K-iters':>8}"
-    )
-    print(f"  {'-'*66}")
-
-    configs = [
-        # (BLOCK_S, BLOCK_N, BLOCK_K, num_warps, num_stages)
-        # Current best from autotune:
-        (16, 64, 16, 8, 3),
-        (16, 64, 32, 8, 3),
-        # Larger BLOCK_K — KEY EXPERIMENT:
-        #   rank=64 → BLOCK_K=64: 1 K-iteration, no k-mask, no loop overhead
-        #   rank=128 → BLOCK_K=128: same
-        (16, 64, 64, 8, 1),
-        (16, 64, 64, 4, 1),
-        (16, 64, 64, 8, 2),
-        (16, 128, 64, 4, 1),
-        (16, 128, 64, 8, 1),
-        (16, 64, 128, 8, 1) if rank >= 128 else None,
-        (16, 128, 128, 4, 1) if rank >= 128 else None,
-        # Wider BLOCK_N to reduce CTA count:
-        (16, 128, 16, 4, 2),
-        (16, 128, 32, 4, 2),
-        (32, 64, 16, 4, 2),
-        (32, 64, 32, 4, 2),
-    ]
-
-    best_t = float("inf")
-    best_cfg = None
-
-    for cfg in configs:
-        if cfg is None:
-            continue
-        BS, BN, BK, nw, ns = cfg
-        if BK > rank:  # BLOCK_K larger than actual K makes no sense
-            continue
-        try:
-            t_s = bench_cfg(lambda: run_probe(x, w, o.clone(), bi, BS, BN, BK, nw, ns))
-            t_us = t_s * 1e6
-            bw_pct = bw_floor / t_us * 100
-            k_iters = (rank + BK - 1) // BK
-            marker = " ←" if t_us < best_t else ""
-            if t_us < best_t:
-                best_t = t_us
-                best_cfg = cfg
-            print(
-                f"  {BS:>7}  {BN:>7}  {BK:>7}  {nw:>5}  {ns:>3}  {t_us:>7.1f}µ  {bw_pct:>5.1f}%  {k_iters:>8}{marker}"
-            )
-        except Exception as e:
-            print(f"  {BS:>7}  {BN:>7}  {BK:>7}  {nw:>5}  {ns:>3}  FAILED: {e}")
-
-    print(
-        f"\n  Best: BLOCK_S={best_cfg[0]} BLOCK_N={best_cfg[1]} BLOCK_K={best_cfg[2]} warps={best_cfg[3]} stages={best_cfg[4]} → {best_t:.1f}µs"
-    )
-    print(
-        f"  Current autotune: {bench_cfg(lambda: run_probe(x, w, o.clone(), bi, 16, 64, 16, 8, 3))*1e6:.1f}µs"
-    )
-
-
-if __name__ == "__main__":
-    for n_segs in (16, 32, 64):
-        sweep(n_segs=n_segs, rank=64, N=4096, label="o_proj rank=64")
-    sweep(n_segs=32, rank=128, N=4096, label="o_proj rank=128")
-    sweep(n_segs=32, rank=16, N=4096, label="o_proj rank=16")

From de4cf465151270497b3dbd087503fd6ca2121cd4 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:23:13 +0000
Subject: [PATCH 04/19] chore: move LoRA doc files to qywu/lora-dev branch

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 docs/index.md                  |  70 ---
 docs/lora_current_design.html  | 925 ---------------------------------
 docs/serving/lora.md           |  62 ---
 docs/tokenspeed_structure.html | 653 -----------------------
 4 files changed, 1710 deletions(-)
 delete mode 100644 docs/index.md
 delete mode 100644 docs/lora_current_design.html
 delete mode 100644 docs/serving/lora.md
 delete mode 100644 docs/tokenspeed_structure.html

diff --git a/docs/index.md b/docs/index.md
deleted file mode 100644
index 0be771a38..000000000
--- a/docs/index.md
+++ /dev/null
@@ -1,70 +0,0 @@
----
-layout: home
-
-hero:
-  name: TokenSpeed
-  text: Speed-of-light LLM inference
-  tagline: Production-oriented docs for launching, tuning, and operating low-latency OpenAI-compatible serving.
-  actions:
-    - theme: brand
-      text: Get Started
-      link: /guides/getting-started
-    - theme: alt
-      text: Launch Recipes
-      link: /recipes/models
-    - theme: alt
-      text: Server Parameters
-      link: /configuration/server
-
-features:
-  - title: Launch First
-    details: Start with concrete commands, then tune the exact knobs that affect memory, scheduling, parallelism, and kernels.
-  - title: Familiar Parameters
-    details: TokenSpeed keeps familiar parameter names where the runtime semantics match, with TokenSpeed-specific knobs documented separately.
-  - title: Model Recipes
-    details: Recipes collect the launch patterns used for Kimi and GPT-OSS deployments.
-  - title: Operational Surface
-    details: Parallelism and configuration guidance stay close to the serving paths operators actually use.
----
-
-## Start Here
-
-- [Getting Started](./guides/getting-started.md)
-- [Launching a Server](./guides/launching.md)
-- [Model Recipes](./recipes/models.md)
-- [Server Parameters](./configuration/server.md)
-- [Compatible Parameters](./configuration/compatible-parameters.md)
-- [Parallelism](./serving/parallelism.md)
-- [LoRA Serving](./serving/lora.md)
-
-## Common Workflow
-
-1. Install the runtime and kernel packages.
-2. Pick a launch recipe close to your model family and hardware.
-3. Set model loading, memory, scheduler, and parallelism parameters explicitly.
-4. Validate correctness and throughput together before changing more than one tuning dimension.
-
-## Minimal Server
-
-```bash
-tokenspeed serve openai/gpt-oss-20b \
-  --host 0.0.0.0 \
-  --port 8000 \
-  --tensor-parallel-size 1
-```
-
-The server exposes an OpenAI-compatible API under `/v1`.
-
-## High-Performance Shape
-
-Large MoE deployments usually make the same decisions:
-
-- model path and revision
-- context length and KV cache dtype
-- scheduler token and sequence budgets
-- attention and MoE backends
-- tensor, data, and expert parallelism
-- reasoning, tool-call, and speculative decoding parsers
-
-See [Model Recipes](./recipes/models.md) for concrete examples and
-[Server Parameters](./configuration/server.md) for the parameter reference.
diff --git a/docs/lora_current_design.html b/docs/lora_current_design.html
deleted file mode 100644
index a03fabe6f..000000000
--- a/docs/lora_current_design.html
+++ /dev/null
@@ -1,925 +0,0 @@
-<!doctype html>
-<html lang="en">
-<head>
-  <meta charset="utf-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1">
-  <title>TokenSpeed LoRA Design - Current Implementation</title>
-  <style>
-    :root {
-      color-scheme: light;
-      --bg: #f6f8fb;
-      --panel: #ffffff;
-      --panel-2: #f0f4f8;
-      --ink: #17202a;
-      --muted: #5b6776;
-      --line: #d8e0ea;
-      --blue: #2563eb;
-      --green: #15803d;
-      --amber: #b45309;
-      --red: #b91c1c;
-      --purple: #7c3aed;
-      --code: #0f172a;
-      --code-bg: #eef2f7;
-    }
-
-    * { box-sizing: border-box; }
-    html { scroll-behavior: smooth; }
-    body {
-      margin: 0;
-      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
-      color: var(--ink);
-      background: var(--bg);
-      line-height: 1.55;
-    }
-
-    .shell {
-      display: grid;
-      grid-template-columns: 280px minmax(0, 1fr);
-      min-height: 100vh;
-    }
-
-    nav {
-      position: sticky;
-      top: 0;
-      height: 100vh;
-      padding: 28px 22px;
-      border-right: 1px solid var(--line);
-      background: #ffffff;
-      overflow: auto;
-    }
-
-    nav h2 {
-      margin: 0 0 6px;
-      font-size: 18px;
-      letter-spacing: 0;
-    }
-
-    nav p {
-      margin: 0 0 20px;
-      color: var(--muted);
-      font-size: 13px;
-    }
-
-    nav a {
-      display: block;
-      padding: 7px 0;
-      color: #314158;
-      text-decoration: none;
-      font-size: 14px;
-    }
-
-    nav a:hover { color: var(--blue); }
-
-    main {
-      max-width: 1180px;
-      width: 100%;
-      padding: 44px 46px 72px;
-    }
-
-    header {
-      border-bottom: 1px solid var(--line);
-      padding-bottom: 28px;
-      margin-bottom: 28px;
-    }
-
-    .eyebrow {
-      margin: 0 0 8px;
-      color: var(--blue);
-      font-size: 13px;
-      font-weight: 700;
-      text-transform: uppercase;
-      letter-spacing: .04em;
-    }
-
-    h1 {
-      margin: 0;
-      font-size: clamp(34px, 5vw, 56px);
-      line-height: 1.05;
-      letter-spacing: 0;
-    }
-
-    .subtitle {
-      max-width: 820px;
-      color: var(--muted);
-      font-size: 18px;
-      margin: 18px 0 0;
-    }
-
-    section {
-      margin: 34px 0;
-      padding: 28px;
-      background: var(--panel);
-      border: 1px solid var(--line);
-      border-radius: 8px;
-    }
-
-    h2 {
-      margin: 0 0 14px;
-      font-size: 28px;
-      line-height: 1.2;
-      letter-spacing: 0;
-    }
-
-    h3 {
-      margin: 24px 0 10px;
-      font-size: 19px;
-      letter-spacing: 0;
-    }
-
-    p { margin: 10px 0; }
-    ul, ol { padding-left: 22px; }
-    li { margin: 7px 0; }
-
-    code {
-      color: var(--code);
-      background: var(--code-bg);
-      padding: 2px 5px;
-      border-radius: 5px;
-      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
-      font-size: .92em;
-    }
-
-    pre {
-      overflow: auto;
-      padding: 16px;
-      border-radius: 8px;
-      background: #101826;
-      color: #e5eefb;
-      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
-      font-size: 13px;
-      line-height: 1.45;
-    }
-
-    pre code {
-      color: inherit;
-      background: transparent;
-      padding: 0;
-      border-radius: 0;
-    }
-
-    table {
-      width: 100%;
-      border-collapse: collapse;
-      margin: 16px 0;
-      font-size: 14px;
-    }
-
-    th, td {
-      border: 1px solid var(--line);
-      padding: 10px 12px;
-      vertical-align: top;
-      text-align: left;
-    }
-
-    th { background: var(--panel-2); }
-
-    .grid {
-      display: grid;
-      grid-template-columns: repeat(3, minmax(0, 1fr));
-      gap: 14px;
-      margin: 18px 0;
-    }
-
-    .card {
-      border: 1px solid var(--line);
-      border-radius: 8px;
-      padding: 16px;
-      background: #fbfdff;
-    }
-
-    .card h3 {
-      margin-top: 0;
-      font-size: 17px;
-    }
-
-    .tag {
-      display: inline-block;
-      padding: 2px 8px;
-      border-radius: 999px;
-      border: 1px solid var(--line);
-      background: #fff;
-      color: var(--muted);
-      font-size: 12px;
-      font-weight: 700;
-    }
-
-    .good { color: var(--green); }
-    .warn { color: var(--amber); }
-    .bad { color: var(--red); }
-    .purple { color: var(--purple); }
-
-    .flow {
-      display: grid;
-      gap: 10px;
-      margin: 18px 0;
-    }
-
-    .flow-row {
-      display: grid;
-      grid-template-columns: 220px minmax(0, 1fr);
-      gap: 12px;
-      align-items: stretch;
-    }
-
-    .node {
-      border: 1px solid var(--line);
-      border-left: 5px solid var(--blue);
-      border-radius: 8px;
-      background: #fbfdff;
-      padding: 12px 14px;
-    }
-
-    .node strong { display: block; }
-    .node small { color: var(--muted); }
-    .node.green { border-left-color: var(--green); }
-    .node.amber { border-left-color: var(--amber); }
-    .node.purple { border-left-color: var(--purple); }
-
-    .callout {
-      border-left: 5px solid var(--amber);
-      background: #fff8ed;
-      padding: 14px 16px;
-      border-radius: 8px;
-      margin: 16px 0;
-    }
-
-    .file-list td:first-child {
-      white-space: nowrap;
-      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;
-      font-size: 13px;
-    }
-
-    @media (max-width: 900px) {
-      .shell { grid-template-columns: 1fr; }
-      nav { position: relative; height: auto; }
-      main { padding: 28px 18px 52px; }
-      section { padding: 20px; }
-      .grid { grid-template-columns: 1fr; }
-      .flow-row { grid-template-columns: 1fr; }
-    }
-  </style>
-</head>
-<body>
-  <div class="shell">
-    <nav>
-      <h2>LoRA Design</h2>
-      <p>Current TokenSpeed implementation, including dense LoRA, MoE LoRA, eviction, and CUDA graph replay.</p>
-      <a href="#overview">Overview</a>
-      <a href="#naming">Naming</a>
-      <a href="#files">Files</a>
-      <a href="#data-model">Data Model</a>
-      <a href="#lifecycle">Adapter Lifecycle</a>
-      <a href="#eviction">Eviction</a>
-      <a href="#buffers">GPU Buffers</a>
-      <a href="#batch-info">Batch Metadata</a>
-      <a href="#kernels">Kernel Routing</a>
-      <a href="#moe">MoE LoRA</a>
-      <a href="#cuda-graph">CUDA Graph</a>
-      <a href="#limitations">Limitations</a>
-    </nav>
-
-    <main>
-      <header>
-        <p class="eyebrow">TokenSpeed Runtime Design</p>
-        <h1>LoRA Serving Implementation</h1>
-        <p class="subtitle">
-          This document describes the current LoRA implementation in the working
-          branch: how adapter names and ids map to GPU slots, how CPU and GPU
-          eviction work, how dense and MoE LoRA weights are packed, and why the
-          CUDA graph path remains stable across dynamic adapters.
-        </p>
-      </header>
-
-      <section id="overview">
-        <h2>Overview</h2>
-        <p>
-          TokenSpeed treats LoRA as a runtime-owned side path. Base model layers
-          keep their normal linear and MoE kernels. When a request uses an adapter,
-          the runtime resolves that request's <code>lora_id</code> to a GPU
-          <code>slot</code>, writes per-step metadata into persistent tensors, and
-          the model layers add LoRA deltas in place.
-        </p>
-
-        <div class="grid">
-          <div class="card">
-            <h3>Identity Layer</h3>
-            <p><code>name</code> and <code>lora_id</code> are user/runtime identities. They do not imply GPU residency.</p>
-          </div>
-          <div class="card">
-            <h3>Residency Layer</h3>
-            <p><code>slot</code> is the current GPU pool index for a real adapter. Base-model requests use <code>NO_LORA_SLOT = -1</code>.</p>
-          </div>
-          <div class="card">
-            <h3>Forward Layer</h3>
-            <p><code>LoraBatchInfo</code> maps each request segment to a slot and is read directly by LoRA kernels.</p>
-          </div>
-        </div>
-
-        <div class="flow">
-          <div class="flow-row">
-            <div class="node"><strong>Load</strong><small>adapter path -> CPU cache</small></div>
-            <div class="node green"><strong><code>LoraManager.load_adapter()</code></strong><small>Registers name/id, stores durable disk path, warms CPU cache.</small></div>
-          </div>
-          <div class="flow-row">
-            <div class="node"><strong>Schedule</strong><small>request ids -> adapter ids</small></div>
-            <div class="node green"><strong><code>prepare_loras()</code></strong><small>Promotes missing adapters to GPU slots, writes segment lengths, slot ids, and fast-path metadata.</small></div>
-          </div>
-          <div class="flow-row">
-            <div class="node"><strong>Forward</strong><small>layer output += LoRA delta</small></div>
-            <div class="node green"><strong><code>apply_*_lora()</code></strong><small>Dense layers call shrink/expand kernels; MoE backends consume a narrow <code>MoeLoraContext</code>.</small></div>
-          </div>
-        </div>
-      </section>
-
-      <section id="naming">
-        <h2>Naming</h2>
-        <table>
-          <thead>
-            <tr><th>Name</th><th>Meaning</th><th>Where it lives</th></tr>
-          </thead>
-          <tbody>
-            <tr>
-              <td><code>name</code></td>
-              <td>Stable user-facing adapter name or alias, such as <code>"password_adapter"</code>. This is the value requests should select after registration.</td>
-              <td><code>LoraManager._name_to_id</code>, <code>_adapter_paths</code>, CPU/GPU LRU maps.</td>
-            </tr>
-            <tr>
-              <td><code>lora_name</code></td>
-              <td>Canonical request/API selector. It must be the name of an adapter that was already loaded via <code>load_lora_adapter()</code>.</td>
-              <td>Request schema and input processing before lookup in <code>LoraManager</code>.</td>
-            </tr>
-            <tr>
-              <td><code>adapter_path</code> / load-time <code>path</code></td>
-              <td>Durable filesystem path to the adapter directory or safetensors file. Every registered adapter needs one so CPU eviction can reload weights from disk.</td>
-              <td><code>LoraManager._adapter_paths</code>, <code>LoraCpuCache.adapter_paths</code>.</td>
-            </tr>
-            <tr>
-              <td><code>lora_id</code></td>
-              <td>Runtime integer id assigned at registration time. Request scheduling carries this id.</td>
-              <td><code>_name_to_id</code>, <code>_id_to_name</code>, request metadata.</td>
-            </tr>
-            <tr>
-              <td><code>slot</code></td>
-              <td>GPU-resident real adapter slot. Valid slots are <code>0..max_loras-1</code>; base/no-LoRA is <code>NO_LORA_SLOT = -1</code> in batch metadata.</td>
-              <td><code>_slot_to_name</code>, <code>_name_to_slot</code>, <code>LoraBatchInfo.weight_indices</code>.</td>
-            </tr>
-            <tr>
-              <td><code>rank</code></td>
-              <td>LoRA rank used by the adapter. For 3D MoE tensors, rank is dimension 1 of <code>lora_A</code>.</td>
-              <td><code>_lora_ranks</code>, <code>_slot_ranks</code>, per-slot buffer slices.</td>
-            </tr>
-            <tr>
-              <td><code>scaling</code></td>
-              <td><code>lora_alpha / r</code> from <code>adapter_config.json</code>, or 1.0 fallback.</td>
-              <td><code>_scalings</code>, <code>_slot_scalings</code>, kernel multiply.</td>
-            </tr>
-            <tr>
-              <td><code>segment</code></td>
-              <td>One contiguous run of tokens using one adapter slot. Current path uses one segment per request.</td>
-              <td><code>seg_lens</code>, <code>seg_indptr</code>, <code>weight_indices</code>.</td>
-            </tr>
-          </tbody>
-        </table>
-
-        <div class="callout">
-          <strong>Important distinction:</strong> <code>adapter_path</code> is
-          the disk source of truth used when the adapter is loaded or reloaded.
-          Request-time <code>lora_name</code> selects an already loaded adapter.
-          <code>lora_id</code> is stable while the adapter remains registered.
-          <code>slot</code> is temporary and may change after GPU eviction and
-          reload.
-        </div>
-      </section>
-
-      <section id="files">
-        <h2>Files</h2>
-        <p>
-          The implementation is split so request/API naming, adapter lifecycle,
-          scheduler isolation, and kernel execution each have a narrow owner.
-          The tables below show the important added and modified files.
-        </p>
-
-        <h3>Runtime LoRA Modules - Added</h3>
-        <table class="file-list">
-          <thead>
-            <tr><th>File</th><th>Role</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>python/tokenspeed/runtime/lora/adapter_io.py</code></td><td>Loads adapter weights and normalizes supported formats: dense PEFT keys, 2D per-expert MoE keys, and 3D <code>experts.w1/w2/w3</code> MoE keys.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/lora/lora_cache.py</code></td><td>Pinned CPU adapter cache with durable <code>adapter_path</code> tracking, async prefetch, LRU eviction, and disk fallback.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/lora/lora_buffers.py</code></td><td>GPU buffer allocation and dense weight packing. Owns TP-aware CPU-side sharding and slot zeroing for dense LoRA tensors.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/lora/lora_batch.py</code></td><td><code>LoraBatchInfo</code>, segment metadata, decode grouping, and CUDA-graph-stable tensors read by dense LoRA kernels.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/lora/moe_lora.py</code></td><td><code>MoeLoraBuffers</code> and <code>MoeLoraContext</code>. Preallocates fixed expert-scoped LoRA pools and exposes the narrow context used by MoE backends.</td></tr>
-          </tbody>
-        </table>
-
-        <h3>Runtime Integration - Modified</h3>
-        <table class="file-list">
-          <thead>
-            <tr><th>File</th><th>Role</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>python/tokenspeed/runtime/lora/lora_manager.py</code></td><td>Top-level adapter lifecycle manager: <code>lora_name</code> to <code>lora_id</code>, CPU/GPU residency, eviction, dense apply calls, and MoE context creation.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/lora/__init__.py</code></td><td>Exports the public LoRA runtime types used by execution and model layers.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/engine/io_struct.py</code></td><td>Adds request/control dataclasses: request-time <code>lora_name</code>, load-time <code>adapter_path</code>, and tokenized <code>lora_id</code>.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/engine/input_processor.py</code></td><td>Resolves request <code>lora_name</code> to internal <code>lora_id</code>; unknown names fail fast instead of falling back to base model.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/engine/async_llm.py</code></td><td>Holds the name-to-id registry used by request processing and scheduler control paths.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/engine/event_loop.py</code></td><td>Owns scheduler-side adapter load/unload, initializes <code>LoraManager</code>, and evicts KV namespaces on unload.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/engine/request_handler.py</code></td><td>Dispatches load/unload ZMQ control messages to the scheduler process.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/engine/scheduler_control_client.py</code></td><td>Sends <code>LoadLoraReqInput(lora_name, adapter_path)</code> and unload requests to scheduler workers.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/entrypoints/engine.py</code></td><td>Exposes the Python API: <code>generate(..., lora_name=...)</code> and <code>load_lora_adapter(lora_name, adapter_path)</code>.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/entrypoints/engine_base.py</code></td><td>Documents the abstract engine API and keeps request names separate from load-time disk paths.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/execution/context.py</code></td><td>Places <code>LoraManager</code>, <code>LoraBatchInfo</code>, and <code>MoeLoraContext</code> on <code>ForwardContext</code>.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/execution/model_runner.py</code></td><td>Calls <code>prepare_loras()</code> from scheduled request <code>lora_id</code> values before model forward.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/execution/cuda_graph_wrapper.py</code></td><td>Captures and replays separate graph variants for no-LoRA and with-LoRA decode batches.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/layers/moe/layer.py</code></td><td>Threads <code>MoeLoraContext</code> from runtime context into MoE backend calls.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/layers/moe/backends/base.py</code></td><td>Extends the backend interface with an optional MoE LoRA context.</td></tr>
-            <tr><td><code>python/tokenspeed/runtime/layers/moe/backends/*/triton.py</code></td><td>Supported Triton MoE backends consume the narrow context and apply expert LoRA deltas around fused MoE compute.</td></tr>
-          </tbody>
-        </table>
-
-        <h3>Scheduler - Modified</h3>
-        <table class="file-list">
-          <thead>
-            <tr><th>File</th><th>Role</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>tokenspeed-scheduler/csrc/scheduler/request_spec.h</code></td><td>Adds <code>RequestSpec.lora_id</code>. <code>0</code> is base model; positive ids identify registered adapters.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/csrc/scheduler/request.h</code> / <code>request.cpp</code></td><td>Stores the request's <code>lora_id</code> and exposes it to scheduling and forward events.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/csrc/fsm/forward_events.h</code> / <code>forward_events.cpp</code></td><td>Carries <code>lora_id</code> through prefill/decode FSM events so prefix-cache match/insert uses the right adapter namespace.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/csrc/resource/kv_prefix_cache/kv_prefix_cache.h</code> / <code>.cpp</code></td><td>Creates per-adapter virtual roots keyed by <code>lora_id</code>, isolates KV reuse across adapters, and supports namespace eviction.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/csrc/resource/hybrid_prefix_cache/hybrid_prefix_cache.h</code> / <code>.cpp</code></td><td>Forwards <code>lora_id</code> into the KV prefix cache for hybrid cache users.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/csrc/scheduler/scheduler.h</code> / <code>.cpp</code></td><td>Adds <code>EvictLoraNamespace(lora_id)</code>, used when an adapter is unloaded.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/bindings/python_module.cpp</code></td><td>Exposes <code>RequestSpec.lora_id</code> and scheduler namespace eviction to Python.</td></tr>
-            <tr><td><code>tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp</code></td><td>Covers adapter-specific prefix-cache isolation, base-model isolation, and explicit namespace eviction.</td></tr>
-          </tbody>
-        </table>
-
-        <h3>Kernel Package - Added Or Modified</h3>
-        <table class="file-list">
-          <thead>
-            <tr><th>File</th><th>Role</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/triton/</code></td><td>Triton LoRA operator family: shrink, expand, prefill variants, decode grouping, QKV expand, gate/up expand, tuning helpers, and H100 tuned configs.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/ops/lora/cutedsl.py</code></td><td>Public wrappers for CuTeDSL fast paths used by selected single-slot and batched-slot dense LoRA shapes.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/_provider.py</code></td><td>Provider boundary for optional CuTeDSL availability and import isolation.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/gemm_add.py</code></td><td>CuTeDSL GEMM-add helper used by dense LoRA expand paths.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/lora_gemm.py</code></td><td>CuTeDSL LoRA GEMM kernels for shrink/expand-style dense paths.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/lora_expand_direct.py</code></td><td>Direct expand helper for selected LoRA-B add paths.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/thirdparty/cute_dsl/_vendor/</code></td><td>Vendored CuTeDSL support code kept inside <code>tokenspeed-kernel</code>, not imported directly by runtime code.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/__init__.py</code></td><td>Exports the kernel package LoRA ops through the existing kernel boundary.</td></tr>
-            <tr><td><code>tokenspeed-kernel/python/tokenspeed_kernel/_triton.py</code></td><td>Centralizes direct Triton imports so LoRA ops follow the repository kernel dependency rule.</td></tr>
-          </tbody>
-        </table>
-
-        <h3>Tests, Benchmarks, And Docs</h3>
-        <table class="file-list">
-          <thead>
-            <tr><th>File</th><th>Role</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>test/runtime/lora/test_adapter_io.py</code></td><td>Parser tests for dense, MoE per-expert, and 3D MoE adapter formats.</td></tr>
-            <tr><td><code>test/runtime/lora/test_lora_manager.py</code></td><td>Lifecycle, packing, eviction, CPU cache, GPU slot, and metadata behavior.</td></tr>
-            <tr><td><code>test/runtime/lora/test_lora_request_naming.py</code></td><td>Request naming contract: <code>lora_name</code> only, unknown names fail, scalar names propagate across batches.</td></tr>
-            <tr><td><code>test/runtime/lora/test_moe_lora.py</code></td><td>MoE LoRA buffer/context behavior and routed expert-delta application.</td></tr>
-            <tr><td><code>tokenspeed-kernel/test/ops/test_lora_triton.py</code></td><td>Numerical coverage for Triton LoRA kernels.</td></tr>
-            <tr><td><code>tokenspeed-kernel/test/ops/test_lora_cutedsl.py</code></td><td>Numerical coverage for CuTeDSL LoRA fast paths.</td></tr>
-            <tr><td><code>benchmark/test_lora_*.py</code></td><td>Dynamic load/unload, mixed adapter batches, eviction latency, and E2E password-adapter checks.</td></tr>
-            <tr><td><code>docs/serving/lora.md</code></td><td>User-facing serving guide for adapter loading, request selection, and supported MoE adapter formats.</td></tr>
-            <tr><td><code>docs/lora_current_design.html</code></td><td>This current implementation design document.</td></tr>
-          </tbody>
-        </table>
-      </section>
-
-      <section id="data-model">
-        <h2>Data Model</h2>
-        <h3>AdapterWeights</h3>
-        <p>
-          Parsed adapter weights use this logical shape:
-        </p>
-        <pre><code>AdapterWeights = {
-  layer_id: {
-    module_name: (lora_A, lora_B),
-  }
-}</code></pre>
-
-        <p>Dense modules use names like <code>q_proj</code>, <code>o_proj</code>, <code>gate_proj</code>, <code>up_proj</code>, and <code>down_proj</code>.</p>
-        <p>2D MoE per-expert modules use names like <code>experts.7.gate_proj</code>. 3D MoE modules use <code>experts.w1</code>, <code>experts.w2</code>, and <code>experts.w3</code>.</p>
-
-        <h3>Registration State</h3>
-        <pre><code>_name_to_id:    dict[str, int]        # user name -> stable lora_id
-_id_to_name:    dict[int, str]        # stable lora_id -> user name
-_adapter_paths: dict[str, str]        # user name -> durable adapter directory</code></pre>
-
-        <h3>Residency State</h3>
-        <pre><code>_cpu_cache:     dict[str, AdapterWeights]  # parsed host weights
-_cpu_lru:       OrderedDict[str, None]     # CPU eviction order
-_name_to_slot:  dict[str, int]             # GPU-resident name -> slot
-_slot_to_name:  list[str | None]           # slot -> GPU-resident name
-_gpu_lru:       OrderedDict[str, None]     # GPU eviction order</code></pre>
-      </section>
-
-      <section id="lifecycle">
-        <h2>Adapter Lifecycle</h2>
-        <ol>
-          <li><code>load_adapter(name, path)</code> verifies the adapter weight file or directory.</li>
-          <li>A new integer <code>lora_id</code> is assigned and stored in <code>_name_to_id</code> and <code>_id_to_name</code>.</li>
-          <li>The durable path is recorded in the CPU cache object so disk reload remains possible after CPU eviction.</li>
-          <li><code>LoraCpuCache.ensure()</code> synchronously loads, parses, and pins weights into the CPU pool when pinned memory is available.</li>
-          <li>On each forward step, <code>prepare_loras(lora_ids, token_counts)</code> resolves ids to names and then to GPU slots.</li>
-          <li>If an adapter is CPU-resident but not GPU-resident, <code>_ensure_in_gpu()</code> allocates or evicts a slot and calls <code>_load_to_slot()</code>.</li>
-          <li><code>_load_to_slot()</code> resets the target slot, writes rank/scaling metadata, shards on CPU, packs dense buffers, and loads MoE buffers.</li>
-          <li><code>unload_adapter(name)</code> clears GPU slot state, removes CPU cache state, and deletes id mappings.</li>
-        </ol>
-
-        <pre><code>request lora_id
-  -> _id_to_name[lora_id]
-  -> _ensure_in_gpu(name)
-  -> slot
-  -> LoraBatchInfo.weight_indices[segment] = slot</code></pre>
-      </section>
-
-      <section id="eviction">
-        <h2>Eviction</h2>
-        <h3>GPU Pool</h3>
-        <p>
-          The GPU pool has <code>max_loras</code> slots, all of them available
-          for real adapters. Base-model requests do not consume a GPU slot;
-          they write <code>NO_LORA_SLOT = -1</code> into per-step metadata.
-        </p>
-        <ul>
-          <li><code>_find_free_slot()</code> returns the first empty adapter slot.</li>
-          <li>If the pool is full, it scans <code>_gpu_lru</code> from least to most recently used.</li>
-          <li>The selected adapter is removed from <code>_name_to_slot</code>, <code>_slot_to_name</code>, and <code>_gpu_lru</code>.</li>
-          <li>The returned slot is reset before <code>_load_to_slot()</code> copies new weights, so partial adapters cannot inherit stale modules from the previous occupant.</li>
-          <li>Explicit unload also resets dense weights, clears MoE weights, and resets rank/scaling.</li>
-        </ul>
-
-        <h3>CPU Pool</h3>
-        <p>
-          The CPU pool is a second tier bounded by <code>max_loras_cpu</code>.
-          It keeps parsed, pinned weights to avoid repeated safetensors reads
-          and to allow non-blocking H2D copies when the platform supports
-          pinned memory. The default capacity is four times the GPU pool.
-        </p>
-        <ul>
-          <li><code>prefetch(name)</code> starts a best-effort background disk read if the adapter is known and not already loading.</li>
-          <li><code>ensure(name)</code> blocks until a pending load finishes or loads synchronously from disk.</li>
-          <li>CPU eviction prefers adapters that are not currently GPU-resident.</li>
-          <li>If the pool cannot find an evictable entry, loading raises a runtime error with the current LRU state.</li>
-        </ul>
-
-        <div class="callout">
-          GPU eviction does not unregister the adapter. It only removes the
-          temporary slot mapping. The adapter can be promoted again later from
-          CPU cache or disk using its stable <code>name</code> and
-          <code>lora_id</code>.
-        </div>
-      </section>
-
-      <section id="buffers">
-        <h2>GPU Buffers</h2>
-        <p>
-          Dense LoRA weights are packed into fixed-size per-layer buffers. The
-          first dimension is always <code>n_slots</code>, so kernels can select
-          the active adapter by slot without changing pointer addresses.
-          <code>--lora-buffer-groups</code> controls which coarse families are
-          allocated: <code>attn</code>, <code>mlp</code>, and <code>moe</code>.
-        </p>
-        <p>
-          The default is <code>attn,mlp,moe</code>. If a server starts with a
-          group disabled, loading an adapter that targets that group raises a
-          configuration error instead of silently dropping LoRA deltas.
-        </p>
-
-        <table>
-          <thead>
-            <tr><th>Buffer</th><th>Shape</th><th>Notes</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>qkv_A_buffers[layer]</code></td><td><code>(n_slots, 3 * max_rank, hidden)</code></td><td>Q, K, V A matrices stacked by rank block.</td></tr>
-            <tr><td><code>qkv_B_buffers[layer]</code></td><td><code>(n_slots, q_per_tp + 2 * kv_per_tp, max_rank)</code></td><td>Column-parallel output side, sharded per TP rank.</td></tr>
-            <tr><td><code>o_A_buffers[layer]</code></td><td><code>(n_slots, max_rank, o_in_per_tp)</code></td><td>Row-parallel input side, sharded along input dimension.</td></tr>
-            <tr><td><code>o_B_buffers[layer]</code></td><td><code>(n_slots, hidden, max_rank)</code></td><td>Replicated output side.</td></tr>
-            <tr><td><code>gate_up_A_buffers[layer]</code></td><td><code>(n_slots, 2 * max_rank, hidden)</code></td><td>Gate and up A matrices stacked.</td></tr>
-            <tr><td><code>gate_up_B_buffers[layer]</code></td><td><code>(n_slots, 2 * intermediate_per_tp, max_rank)</code></td><td>Column-parallel gate/up output side.</td></tr>
-            <tr><td><code>down_A_buffers[layer]</code></td><td><code>(n_slots, max_rank, intermediate_per_tp)</code></td><td>Row-parallel down input side.</td></tr>
-            <tr><td><code>down_B_buffers[layer]</code></td><td><code>(n_slots, hidden, max_rank)</code></td><td>Replicated down output side.</td></tr>
-          </tbody>
-        </table>
-
-        <h3>TP Sharding Rule</h3>
-        <ul>
-          <li>Column-parallel projections (<code>q/k/v</code>, <code>gate</code>, <code>up</code>) shard <code>lora_B</code> along output dimension.</li>
-          <li>Row-parallel projections (<code>o</code>, <code>down</code>) shard <code>lora_A</code> along input dimension.</li>
-          <li>Sharding happens on CPU before the H2D copy, so each TP rank copies only its local shard into GPU buffers.</li>
-          <li>Downstream all-reduce sums base partials and LoRA partials together for row-parallel outputs.</li>
-        </ul>
-      </section>
-
-      <section id="batch-info">
-        <h2>Batch Metadata</h2>
-        <p>
-          <code>LoraBatchInfo</code> is the contract between Python scheduling
-          and the CUDA/Triton kernels. Its tensors are allocated once at manager
-          construction and updated in place before each forward.
-        </p>
-
-        <table>
-          <thead>
-            <tr><th>Field</th><th>Meaning</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>bs</code></td><td>Number of active request segments.</td></tr>
-            <tr><td><code>num_segments</code></td><td>Currently equal to <code>bs</code>; one segment per request.</td></tr>
-            <tr><td><code>max_len</code></td><td>Maximum segment length in the step; drives decode vs prefill kernel choice.</td></tr>
-            <tr><td><code>seg_lens</code></td><td>Tokens per segment.</td></tr>
-            <tr><td><code>seg_indptr</code></td><td>Prefix sum over segment lengths.</td></tr>
-            <tr><td><code>weight_indices</code></td><td>GPU slot per segment.</td></tr>
-            <tr><td><code>lora_ranks</code></td><td>Per-slot rank tensor read by kernels.</td></tr>
-            <tr><td><code>scalings</code></td><td>Per-slot scaling tensor read by kernels.</td></tr>
-            <tr><td><code>single_lora_slot</code></td><td>Host fast path when every segment uses the same real adapter slot; otherwise <code>NO_LORA_SLOT</code>.</td></tr>
-            <tr><td><code>multi_lora_*</code></td><td>Host metadata for a batched CuTeDSL path when slots are consecutive and same-rank/same-scaling.</td></tr>
-            <tr><td><code>sort_order/group_*</code></td><td>Decode grouping metadata for grouped expand kernels.</td></tr>
-          </tbody>
-        </table>
-
-<pre><code>prepare_loras([adapter_a, adapter_b, 0], [20, 15, 8])
-  -> per_request_slots = [slot_a, slot_b, NO_LORA_SLOT]
-  -> seg_lens          = [20, 15, 8]
-  -> seg_indptr        = [0, 20, 35, 43]
-  -> weight_indices    = [slot_a, slot_b, NO_LORA_SLOT]
-  -> has_active_lora   = true</code></pre>
-      </section>
-
-      <section id="kernels">
-        <h2>Kernel Routing</h2>
-        <p>
-          Dense LoRA applies in two logical phases:
-        </p>
-        <ol>
-          <li><strong>Shrink:</strong> compute <code>lora_a = A @ x</code> using the active slot's A buffer.</li>
-          <li><strong>Expand:</strong> compute and add <code>B @ lora_a * scaling</code> into the base layer output.</li>
-        </ol>
-
-        <table>
-          <thead>
-            <tr><th>Condition</th><th>Path</th></tr>
-          </thead>
-          <tbody>
-            <tr><td><code>max_len &gt; 32</code></td><td>Prefill-style shrink/expand kernels.</td></tr>
-            <tr><td>Decode with grouped slots</td><td>Grouped expand path batches tokens by adapter slot.</td></tr>
-            <tr><td>Single adapter and favorable shape</td><td>CuTeDSL dense GEMM-add fast path.</td></tr>
-            <tr><td>Multiple consecutive slots with same rank/scaling</td><td>Batched CuTeDSL fast path.</td></tr>
-            <tr><td>Fallback</td><td>General Triton shrink/expand kernels.</td></tr>
-          </tbody>
-        </table>
-      </section>
-
-      <section id="moe">
-        <h2>MoE LoRA</h2>
-        <p>
-          MoE LoRA is deliberately separated from dense buffers. The manager
-          owns <code>MoeLoraBuffers</code>, and MoE backends receive a narrow
-          <code>MoeLoraContext</code> instead of depending on the full
-          <code>LoraManager</code>.
-        </p>
-
-        <h3>Supported Formats</h3>
-        <table>
-          <thead>
-            <tr><th>Format</th><th>Parsed module names</th><th>Storage behavior</th></tr>
-          </thead>
-          <tbody>
-            <tr>
-              <td>2D per-expert PEFT</td>
-              <td><code>experts.&lt;id&gt;.gate_proj</code>, <code>up_proj</code>, <code>down_proj</code></td>
-              <td>Expert id comes from the key. Each expert has independent A/B tensors.</td>
-            </tr>
-            <tr>
-              <td>3D per-expert</td>
-              <td><code>experts.w1</code>, <code>experts.w2</code>, <code>experts.w3</code></td>
-              <td>Tensor dim0 is <code>num_experts</code>; one slice per expert.</td>
-            </tr>
-            <tr>
-              <td>3D shared-outer</td>
-              <td><code>experts.w1</code>, <code>experts.w2</code>, <code>experts.w3</code></td>
-              <td>Tensor dim0 may be <code>1</code> for the shared side and <code>num_experts</code> for the expert-specific side.</td>
-            </tr>
-          </tbody>
-        </table>
-
-        <h3>Projection Mapping</h3>
-        <pre><code>w1 -> gate_proj
-w3 -> up_proj
-w2 -> down_proj</code></pre>
-
-        <h3>Internal MoE Buffers</h3>
-        <p>
-          MoE LoRA now mirrors the dense/vLLM-style slot model: buffers are
-          preallocated per layer with leading dimensions
-          <code>(n_slots, num_experts, ...)</code>. Loading an adapter writes
-          into the selected slot; <code>weights_by_layer[layer][slot]</code>
-          stores views into those fixed buffers for backend consumption.
-        </p>
-        <pre><code>w13_A_buffers[layer]:  (n_slots, num_experts, 2 * max_rank, hidden)
-w13_B_buffers[layer]:  (n_slots, num_experts, 2 * moe_intermediate_per_tp, 2 * max_rank)
-down_A_buffers[layer]: (n_slots, num_experts, max_rank, moe_intermediate_per_tp)
-down_B_buffers[layer]: (n_slots, num_experts, hidden, max_rank)
-
-weights_by_layer[layer_id][slot] = {
-  "w13_A":  w13_A_buffers[layer_id][slot],
-  "w13_B":  w13_B_buffers[layer_id][slot],
-  "down_A": down_A_buffers[layer_id][slot],
-  "down_B": down_B_buffers[layer_id][slot],
-}</code></pre>
-        <p>
-          Slot reset zeros both dense and MoE fixed pools before reuse, so
-          partial MoE adapters cannot inherit expert weights from a previous
-          adapter in the same slot.
-        </p>
-        <p>
-          With <code>--lora-moe-compressed-shared-outer</code>, MoE allocation
-          switches to the 3D shared-outer layout instead of full expansion:
-        </p>
-        <pre><code>w13_A_buffers[layer]:  (n_slots, 1, 2 * max_rank, hidden)
-w13_B_buffers[layer]:  (n_slots, num_experts, 2 * moe_intermediate_per_tp, 2 * max_rank)
-down_A_buffers[layer]: (n_slots, num_experts, max_rank, moe_intermediate_per_tp)
-down_B_buffers[layer]: (n_slots, 1, hidden, max_rank)</code></pre>
-        <p>
-          This compressed mode supports shared-outer 3D adapters
-          (<code>w1/w3</code> A shared, <code>w1/w3</code> B per-expert,
-          <code>w2</code> A per-expert, <code>w2</code> B shared). It rejects
-          per-expert and 2D MoE adapters because those require full expert
-          storage for every side.
-        </p>
-
-        <h3>Shared-Outer MoE Contract</h3>
-        <p>
-          The 3D shared-outer layout follows the hybrid MoE-LoRA design from
-          Together's research notes. The low-rank side that builds a compact
-          representation can be shared when the representation is common across
-          experts, while the side that interprets an expert-specific activation
-          remains per expert.
-        </p>
-        <table>
-          <thead>
-            <tr><th>Projection</th><th>Shared side</th><th>Per-expert side</th><th>TokenSpeed buffer</th></tr>
-          </thead>
-          <tbody>
-            <tr>
-              <td>Gate <code>w1</code></td>
-              <td><code>lora_A</code>, dim0 = <code>1</code></td>
-              <td><code>lora_B</code>, dim0 = <code>num_experts</code></td>
-              <td>First rank slice of <code>w13_A</code> and first intermediate slice of <code>w13_B</code></td>
-            </tr>
-            <tr>
-              <td>Up <code>w3</code></td>
-              <td><code>lora_A</code>, dim0 = <code>1</code></td>
-              <td><code>lora_B</code>, dim0 = <code>num_experts</code></td>
-              <td>Second rank slice of <code>w13_A</code> and second intermediate slice of <code>w13_B</code></td>
-            </tr>
-            <tr>
-              <td>Down <code>w2</code></td>
-              <td><code>lora_B</code>, dim0 = <code>1</code></td>
-              <td><code>lora_A</code>, dim0 = <code>num_experts</code></td>
-              <td><code>down_A</code> per expert and <code>down_B</code> shared</td>
-            </tr>
-          </tbody>
-        </table>
-
-        <pre><code>expected 3D shared-outer dim0:
-  experts.w1: A = 1,           B = num_experts
-  experts.w3: A = 1,           B = num_experts
-  experts.w2: A = num_experts, B = 1</code></pre>
-
-        <p>
-          In full mode, TokenSpeed expands any dim0=<code>1</code> shared tensor
-          into every expert slot during load. In compressed mode, the shared
-          side stays physically shared in the GPU pool and
-          <code>MoeLoraContext._select_expert_weights()</code> broadcasts it at
-          apply time. This saves <code>(num_experts - 1) * rank * (3 * hidden)</code>
-          elements per adapter slot per MoE layer, because only
-          <code>w13_A</code> and <code>down_B</code> stop carrying duplicate
-          expert copies.
-        </p>
-
-        <h3>Route-Level Math</h3>
-        <p>
-          For each routed pair <code>(token t, expert e)</code> and adapter
-          slot <code>s</code>, MoE LoRA adds deltas at the same points as the
-          base MoE projections. Gate/up LoRA is added before the activation;
-          down LoRA is multiplied by the router weight before it is accumulated
-          into the final routed output.
-        </p>
-        <pre><code>gate_up_delta[t, e] =
-  ((hidden[t] @ w13_A[s, e].T) @ w13_B[s, e].T) * scaling[s]
-
-gate_up_output[t, e] += gate_up_delta[t, e]
-
-down_delta[t, e] =
-  ((intermediate[t, e] @ down_A[s, e].T) @ down_B[s, e].T)
-  * topk_weights[t, e] * scaling[s]
-
-down_output[t, e] += down_delta[t, e]</code></pre>
-
-        <p>
-          When a side is shared, the effective expert index is <code>0</code>
-          for that side. The apply path therefore uses the same equations for
-          full per-expert and shared-outer adapters; only the tensor selection
-          changes.
-        </p>
-
-        <h3>Optimization Notes</h3>
-        <table>
-          <thead>
-            <tr><th>Idea from the research note</th><th>TokenSpeed status</th></tr>
-          </thead>
-          <tbody>
-            <tr>
-              <td>Compute shared gate/up A once per token, then reuse it for every routed expert.</td>
-              <td>Storage supports this shape, but the current apply path still evaluates per routed pair. A future fused kernel can exploit the shared side directly.</td>
-            </tr>
-            <tr>
-              <td>For shared down B, combine weighted low-rank intermediates first, then apply one shared B projection.</td>
-              <td>The current implementation applies the down delta per route and weights it before accumulation. This is correct and leaves the fused shared-B reduction as a kernel optimization.</td>
-            </tr>
-            <tr>
-              <td>Group work by <code>(adapter slot, expert id)</code> for better locality.</td>
-              <td>Dense LoRA already groups by adapter for some paths. MoE LoRA currently keeps a narrow context API so backends can add this grouping without changing manager ownership.</td>
-            </tr>
-          </tbody>
-        </table>
-
-        <h3>Runtime Apply</h3>
-        <ul>
-          <li><code>MoELayer.forward()</code> obtains the current manager through explicit argument or <code>get_current_lora_manager()</code>.</li>
-          <li>If the backend advertises <code>supports_moe_lora</code>, it receives <code>moe_lora_context</code>.</li>
-          <li>The Triton MoE path applies gate/up LoRA after the first expert GEMM and before activation.</li>
-          <li>It applies down LoRA after the down expert GEMM and before final route combine.</li>
-          <li>For mixed-adapter batches, <code>MoeLoraContext</code> expands segment slots to token slots and masks base-model tokens.</li>
-          <li>If token ownership changes under expert parallel dispatch, mixed LoRA is disabled rather than applying an incorrect slot map.</li>
-        </ul>
-
-        <div class="callout">
-          Current MoE LoRA support is local or tensor-parallel MoE only.
-          Expert-parallel MoE needs the LoRA slot map dispatched with tokens.
-        </div>
-      </section>
-
-      <section id="cuda-graph">
-        <h2>CUDA Graph</h2>
-        <p>
-          The CUDA graph design relies on stable pointers. Adapter contents,
-          segment lengths, slot ids, ranks, and scalings can change between
-          replays, but the tensors holding those values do not move.
-        </p>
-
-        <h3>Capture</h3>
-        <ul>
-          <li>When LoRA is enabled, <code>CudaGraphWrapper.capture()</code> captures two graphs per batch size.</li>
-          <li>The with-LoRA graph sets <code>ctx.lora_manager</code> and calls <code>prepare_loras([0] * bs)</code> before capture so metadata tensors contain <code>NO_LORA_SLOT</code> while kernels capture stable pointers.</li>
-          <li>The no-LoRA graph leaves <code>ctx.lora_manager</code> unset, so model-layer branches skip LoRA calls entirely.</li>
-          <li>No-LoRA capture is safe because base-model dummy ids resolve to <code>NO_LORA_SLOT</code>; runtime LoRA paths skip work when no real adapter is active.</li>
-        </ul>
-
-        <h3>Replay</h3>
-        <ol>
-          <li><code>ModelExecutor</code> builds the real <code>lora_ids</code> list for the scheduled requests.</li>
-          <li><code>prepare_loras()</code> updates the persistent <code>LoraBatchInfo</code> tensors in place.</li>
-          <li>If any id is nonzero, <code>ctx.lora_manager</code> is set and LoRA-capable layers call apply methods.</li>
-          <li><code>CudaGraphWrapper</code> chooses the no-LoRA graph if <code>has_active_lora</code> is false, otherwise it replays the with-LoRA graph.</li>
-          <li>The captured kernels read the updated metadata and use the current slot-to-weight buffers.</li>
-        </ol>
-
-        <pre><code>capture time:
-  batch_info tensors allocated once
-  graph records pointers to batch_info, ranks, scalings, and weight buffers
-
-replay time:
-  prepare_loras() mutates tensor contents
-  graph.replay() reads new contents through old pointers</code></pre>
-
-        <h3>Why Two Graphs?</h3>
-        <p>
-          The with-LoRA graph includes LoRA kernel launches. That is necessary
-          when any request uses an adapter. For all-base batches, the no-LoRA
-          graph avoids those launches entirely and preserves base-model decode
-          performance.
-        </p>
-      </section>
-
-      <section id="limitations">
-        <h2>Limitations and Open Edges</h2>
-        <ul>
-          <li><span class="warn">MoE EP:</span> Expert-parallel MoE is rejected for MoE LoRA until the slot map is dispatched alongside routed tokens.</li>
-          <li><span class="warn">2D hybrid shared:</span> The <code>experts.shared.*</code> 2D hybrid-shared format is not currently supported.</li>
-          <li><span class="warn">Model hooks:</span> Dense LoRA requires model layers to call the manager apply methods at projection boundaries.</li>
-          <li><span class="warn">Slot identity:</span> External code should not persist GPU slots. Only <code>lora_id</code> and adapter names are stable.</li>
-        </ul>
-      </section>
-    </main>
-  </div>
-</body>
-</html>
diff --git a/docs/serving/lora.md b/docs/serving/lora.md
deleted file mode 100644
index 403cec12b..000000000
--- a/docs/serving/lora.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# LoRA Serving
-
-TokenSpeed supports PEFT-style LoRA adapters for dense attention and MLP
-modules. Dense adapters target:
-
-- `q_proj`, `k_proj`, `v_proj`, `o_proj`
-- `gate_proj`, `up_proj`, `down_proj`
-
-Generation requests select adapters by registered `lora_name`. They do not
-load adapters from disk. Register the adapter first with `load_lora_adapter`
-using a durable adapter path, then pass that name on requests:
-
-```python
-engine.load_lora_adapter("password_adapter", "/path/to/adapter_0")
-engine.generate("...", lora_name="password_adapter")
-```
-
-Requests cannot load adapters from disk and do not accept a request-time
-filesystem path. Unknown `lora_name` values fail fast; use the base model by
-omitting `lora_name`.
-
-MoE LoRA support is available for expert-scoped weights on Triton MoE
-backends. The PEFT per-expert format uses 2D tensors and includes the expert id
-in each key:
-
-```text
-base_model.model.model.layers.<layer>.mlp.experts.<expert>.gate_proj.lora_A.weight
-base_model.model.model.layers.<layer>.mlp.experts.<expert>.gate_proj.lora_B.weight
-base_model.model.model.layers.<layer>.mlp.experts.<expert>.up_proj.lora_A.weight
-base_model.model.model.layers.<layer>.mlp.experts.<expert>.up_proj.lora_B.weight
-base_model.model.model.layers.<layer>.mlp.experts.<expert>.down_proj.lora_A.weight
-base_model.model.model.layers.<layer>.mlp.experts.<expert>.down_proj.lora_B.weight
-```
-
-TokenSpeed also accepts 3D MoE LoRA tensors under the SGLang-style
-`experts.w1`, `experts.w2`, and `experts.w3` names:
-
-```text
-base_model.model.model.layers.<layer>.mlp.experts.w1.lora_A.weight
-base_model.model.model.layers.<layer>.mlp.experts.w1.lora_B.weight
-base_model.model.model.layers.<layer>.mlp.experts.w2.lora_A.weight
-base_model.model.model.layers.<layer>.mlp.experts.w2.lora_B.weight
-base_model.model.model.layers.<layer>.mlp.experts.w3.lora_A.weight
-base_model.model.model.layers.<layer>.mlp.experts.w3.lora_B.weight
-```
-
-`w1` maps to `gate_proj`, `w3` maps to `up_proj`, and `w2` maps to
-`down_proj`. For these tensors, dimension 0 may be either `num_experts` for a
-fully per-expert side or `1` for a shared side. This covers both 3D per-expert
-and 3D shared-outer adapter layouts.
-
-The 2D hybrid-shared `experts.shared.*` format is not currently supported.
-
-The current MoE path is guarded to local or tensor-parallel MoE execution.
-Expert-parallel dispatch is rejected for MoE LoRA because token ownership and
-the LoRA slot map must be dispatched together before expert compute.
-
-Implementation note: dense adapter lifecycle and cache residency are still
-owned by `LoraManager`, while expert-scoped MoE tensors are held behind a
-`MoeLoraContext` consumed by MoE backends. New MoE LoRA kernels should live
-behind the `tokenspeed-kernel` boundary and use that context rather than
-depending on the full manager object.
diff --git a/docs/tokenspeed_structure.html b/docs/tokenspeed_structure.html
deleted file mode 100644
index e79cb2f78..000000000
--- a/docs/tokenspeed_structure.html
+++ /dev/null
@@ -1,653 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-<meta charset="UTF-8">
-<meta name="viewport" content="width=device-width, initial-scale=1.0">
-<title>TokenSpeed — Codebase Structure</title>
-<style>
-:root {
-  --bg:#0d1117; --surface:#161b22; --surface2:#1c2128; --border:#30363d;
-  --text:#e6edf3; --muted:#8b949e; --accent:#58a6ff; --accent2:#79c0ff;
-  --green:#3fb950; --red:#f85149; --yellow:#d29922; --orange:#e3b341;
-  --purple:#bc8cff; --cyan:#39d5ff; --code-bg:#1c2128;
-}
-*{box-sizing:border-box;margin:0;padding:0;}
-body{background:var(--bg);color:var(--text);font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;font-size:13.5px;line-height:1.6;}
-a{color:var(--accent);text-decoration:none;}
-a:hover{text-decoration:underline;}
-
-/* Layout */
-.layout{display:grid;grid-template-columns:280px 1fr;min-height:100vh;}
-.sidebar{background:var(--surface);border-right:1px solid var(--border);padding:20px 0;position:sticky;top:0;height:100vh;overflow-y:auto;}
-.sidebar-logo{padding:0 18px 16px;border-bottom:1px solid var(--border);margin-bottom:12px;}
-.sidebar-logo h1{font-size:15px;font-weight:700;color:var(--text);}
-.sidebar-logo p{font-size:11px;color:var(--muted);margin-top:2px;}
-nav h3{font-size:10px;font-weight:600;text-transform:uppercase;letter-spacing:.08em;color:var(--muted);padding:8px 18px 4px;}
-nav ul{list-style:none;}
-nav li a{display:flex;align-items:center;gap:6px;padding:4px 18px;color:var(--muted);font-size:12.5px;transition:color .15s,background .15s;}
-nav li a:hover{color:var(--text);background:rgba(255,255,255,.04);text-decoration:none;}
-nav li a .dot{width:6px;height:6px;border-radius:50%;flex-shrink:0;}
-.main{padding:36px 52px;max-width:1100px;}
-h1.page-title{font-size:26px;font-weight:700;margin-bottom:6px;}
-h1.page-title .sub{font-size:13px;font-weight:400;color:var(--muted);display:block;margin-top:3px;}
-h2{font-size:18px;font-weight:600;margin:40px 0 14px;padding-bottom:8px;border-bottom:1px solid var(--border);}
-h3{font-size:14px;font-weight:600;margin:20px 0 8px;color:var(--orange);}
-p{margin-bottom:10px;color:var(--text);}
-code{font-family:'SF Mono',Consolas,monospace;font-size:12px;background:var(--code-bg);border:1px solid var(--border);border-radius:3px;padding:1px 5px;color:var(--accent2);}
-pre{background:var(--code-bg);border:1px solid var(--border);border-radius:8px;padding:14px 16px;overflow-x:auto;margin-bottom:14px;font-family:'SF Mono',Consolas,monospace;font-size:12px;line-height:1.5;}
-
-/* Package cards */
-.pkg-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(220px,1fr));gap:12px;margin:16px 0;}
-.pkg-card{background:var(--surface);border:1px solid var(--border);border-radius:8px;padding:14px;}
-.pkg-card:hover{border-color:var(--accent);transition:border-color .2s;}
-.pkg-card .pkg-name{font-weight:700;font-size:13px;margin-bottom:4px;}
-.pkg-card .pkg-lang{font-size:10px;font-weight:600;padding:1px 6px;border-radius:8px;display:inline-block;margin-bottom:6px;}
-.lang-py{background:#1a2a3a;color:var(--accent);}
-.lang-cpp{background:#2a1a3a;color:var(--purple);}
-.lang-cuda{background:#1a3a28;color:var(--green);}
-.pkg-card p{font-size:12px;color:var(--muted);margin:0;}
-
-/* File tree */
-.tree{background:var(--code-bg);border:1px solid var(--border);border-radius:8px;padding:16px;font-family:'SF Mono',Consolas,monospace;font-size:12px;line-height:1.7;overflow-x:auto;}
-.tree .dir{color:var(--accent2);}
-.tree .file{color:var(--text);}
-.tree .comment{color:var(--muted);}
-.tree .hl{color:var(--orange);}
-
-/* Component table */
-table{width:100%;border-collapse:collapse;margin-bottom:16px;font-size:12.5px;}
-th{background:var(--surface);color:var(--muted);font-weight:600;text-align:left;padding:7px 12px;border:1px solid var(--border);font-size:11px;text-transform:uppercase;letter-spacing:.05em;}
-td{padding:7px 12px;border:1px solid var(--border);vertical-align:top;}
-tr:nth-child(even) td{background:rgba(255,255,255,.015);}
-td code{font-size:11.5px;}
-td:first-child code{color:var(--green);}
-
-/* Architecture diagram */
-.arch{background:var(--surface2);border:1px solid var(--border);border-radius:10px;padding:20px;margin:16px 0;}
-.arch-row{display:flex;align-items:stretch;gap:8px;margin-bottom:8px;flex-wrap:wrap;}
-.arch-box{background:var(--surface);border:1px solid var(--border);border-radius:6px;padding:8px 12px;font-size:12px;flex:1;min-width:120px;}
-.arch-box.accent{border-color:var(--accent);background:#1a2a3a;}
-.arch-box.green{border-color:var(--green);background:#1a3020;}
-.arch-box.purple{border-color:var(--purple);background:#2a1a3a;}
-.arch-box.orange{border-color:var(--orange);background:#3a2a1a;}
-.arch-box.cyan{border-color:var(--cyan);background:#1a3035;}
-.arch-box .label{font-weight:700;margin-bottom:2px;font-size:12px;}
-.arch-box .sub{color:var(--muted);font-size:11px;}
-.arch-arrow{display:flex;align-items:center;color:var(--muted);font-size:18px;padding:0 2px;}
-.arch-sep{height:1px;background:var(--border);margin:8px 0;}
-
-/* Flow */
-.flow{display:flex;align-items:center;gap:0;flex-wrap:wrap;margin:10px 0;}
-.fbox{background:var(--surface);border:1px solid var(--border);border-radius:5px;padding:5px 10px;font-size:11.5px;white-space:nowrap;}
-.fbox.a{border-color:var(--accent);color:var(--accent);}
-.fbox.g{border-color:var(--green);color:var(--green);}
-.fbox.o{border-color:var(--orange);color:var(--orange);}
-.farrow{color:var(--muted);padding:0 5px;font-size:14px;}
-
-/* Callout */
-.note{background:#1a2a3a;border-left:3px solid var(--accent);border-radius:0 6px 6px 0;padding:10px 14px;margin-bottom:12px;font-size:12.5px;}
-.note.g{background:#1a3020;border-color:var(--green);}
-.note.o{background:#3a2a1a;border-color:var(--orange);}
-.note strong{display:block;margin-bottom:3px;font-size:12.5px;}
-
-/* Stats bar */
-.stats{display:flex;gap:20px;margin:14px 0;flex-wrap:wrap;}
-.stat{background:var(--surface);border:1px solid var(--border);border-radius:8px;padding:12px 18px;text-align:center;}
-.stat .n{font-size:22px;font-weight:700;line-height:1;}
-.stat .l{font-size:11px;color:var(--muted);margin-top:3px;}
-.stat.accent .n{color:var(--accent);}
-.stat.green  .n{color:var(--green);}
-.stat.purple .n{color:var(--purple);}
-.stat.orange .n{color:var(--orange);}
-.stat.cyan   .n{color:var(--cyan);}
-
-/* Color dots */
-.dot-py{background:var(--accent);}
-.dot-cpp{background:var(--purple);}
-.dot-cuda{background:var(--green);}
-.dot-cfg{background:var(--orange);}
-
-/* Collapsible */
-details summary{cursor:pointer;list-style:none;padding:6px 0;color:var(--accent);font-size:12.5px;user-select:none;}
-details summary::-webkit-details-marker{display:none;}
-details summary::before{content:"▶ ";font-size:10px;}
-details[open] summary::before{content:"▼ ";}
-details .inner{padding:8px 0 4px 14px;}
-</style>
-</head>
-<body>
-<div class="layout">
-
-<!-- SIDEBAR -->
-<nav class="sidebar">
-  <div class="sidebar-logo">
-    <h1>TokenSpeed</h1>
-    <p>Codebase Structure Reference</p>
-  </div>
-  <h3>Overview</h3>
-  <ul>
-    <li><a href="#top"><span class="dot dot-py"></span>Summary</a></li>
-    <li><a href="#arch"><span class="dot dot-py"></span>Architecture</a></li>
-    <li><a href="#flow"><span class="dot dot-py"></span>Request Flow</a></li>
-  </ul>
-  <h3>Packages</h3>
-  <ul>
-    <li><a href="#runtime"><span class="dot dot-py"></span>python/runtime</a></li>
-    <li><a href="#engine"><span class="dot dot-py"></span>↳ engine/</a></li>
-    <li><a href="#execution"><span class="dot dot-py"></span>↳ execution/</a></li>
-    <li><a href="#models"><span class="dot dot-py"></span>↳ models/</a></li>
-    <li><a href="#layers"><span class="dot dot-py"></span>↳ layers/</a></li>
-    <li><a href="#cache"><span class="dot dot-py"></span>↳ cache/</a></li>
-    <li><a href="#entrypoints"><span class="dot dot-py"></span>↳ entrypoints/</a></li>
-    <li><a href="#kernel"><span class="dot dot-cuda"></span>tokenspeed-kernel</a></li>
-    <li><a href="#mla"><span class="dot dot-cuda"></span>tokenspeed-mla</a></li>
-    <li><a href="#scheduler"><span class="dot dot-cpp"></span>tokenspeed-scheduler</a></li>
-    <li><a href="#fsm"><span class="dot dot-cpp"></span>↳ fsm/</a></li>
-    <li><a href="#resources"><span class="dot dot-cpp"></span>↳ resource/</a></li>
-  </ul>
-  <h3>More</h3>
-  <ul>
-    <li><a href="#kernel-selection"><span class="dot dot-cuda"></span>Kernel Selection</a></li>
-    <li><a href="#lora"><span class="dot dot-py"></span>LoRA Integration</a></li>
-    <li><a href="#tests"><span class="dot dot-cfg"></span>Tests</a></li>
-    <li><a href="#filetree"><span class="dot dot-cfg"></span>Full File Tree</a></li>
-  </ul>
-</nav>
-
-<!-- MAIN -->
-<main class="main">
-
-<h1 class="page-title" id="top">TokenSpeed Codebase Structure
-  <span class="sub">Multi-package inference engine &nbsp;·&nbsp; ~90K lines &nbsp;·&nbsp; Python + C++ + CUDA</span>
-</h1>
-
-<div class="stats">
-  <div class="stat accent"><div class="n">4</div><div class="l">Packages</div></div>
-  <div class="stat green"><div class="n">55K</div><div class="l">Python lines</div></div>
-  <div class="stat purple"><div class="n">10K</div><div class="l">C++ lines</div></div>
-  <div class="stat orange"><div class="n">20K+</div><div class="l">Kernel lines</div></div>
-  <div class="stat cyan"><div class="n">100+</div><div class="l">Test files</div></div>
-</div>
-
-<div class="pkg-grid">
-  <div class="pkg-card">
-    <div class="pkg-name">python/</div>
-    <span class="pkg-lang lang-py">Python</span>
-    <p>Core inference runtime: engine, models, layers, cache, distributed serving, OpenAI HTTP API.</p>
-  </div>
-  <div class="pkg-card">
-    <div class="pkg-name">tokenspeed-kernel/</div>
-    <span class="pkg-lang lang-cuda">CUDA / Triton</span>
-    <p>Pluggable kernel library with multi-backend auto-selection. Attention, GEMM, MoE, quantization.</p>
-  </div>
-  <div class="pkg-card">
-    <div class="pkg-name">tokenspeed-mla/</div>
-    <span class="pkg-lang lang-cuda">CuTe DSL</span>
-    <p>Blackwell-optimised Multi-head Latent Attention (MLA) kernels: prefill, decode FP16/FP8, KV packing.</p>
-  </div>
-  <div class="pkg-card">
-    <div class="pkg-name">tokenspeed-scheduler/</div>
-    <span class="pkg-lang lang-cpp">C++20</span>
-    <p>High-performance scheduler: FSM-driven request lifecycle, radix-tree KV prefix cache, resource allocation.</p>
-  </div>
-</div>
-
-<!-- ARCHITECTURE -->
-<h2 id="arch">Architecture Overview</h2>
-<div class="arch">
-  <!-- HTTP layer -->
-  <div class="arch-row">
-    <div class="arch-box accent" style="flex:1">
-      <div class="label">HTTP API (entrypoints/)</div>
-      <div class="sub">/v1/chat/completions &nbsp;·&nbsp; /v1/completions &nbsp;·&nbsp; /v1/embeddings</div>
-    </div>
-  </div>
-  <div class="arch-row">
-    <div class="arch-box accent" style="flex:1">
-      <div class="label">AsyncLLM / Engine (engine/)</div>
-      <div class="sub">RequestHandler &nbsp;·&nbsp; InputProcessor &nbsp;·&nbsp; OutputProcessor &nbsp;·&nbsp; SchedulerControlClient</div>
-    </div>
-  </div>
-  <div class="arch-sep"></div>
-  <!-- Scheduler layer -->
-  <div class="arch-row">
-    <div class="arch-box purple" style="flex:1">
-      <div class="label">C++ Scheduler (tokenspeed-scheduler)</div>
-      <div class="sub">FSM state machine &nbsp;·&nbsp; KV prefix cache &nbsp;·&nbsp; Page allocators &nbsp;·&nbsp; ExecutionPlan generation</div>
-    </div>
-  </div>
-  <div class="arch-sep"></div>
-  <!-- Execution layer -->
-  <div class="arch-row">
-    <div class="arch-box green" style="flex:3">
-      <div class="label">ModelRunner / ModelExecutor (execution/)</div>
-      <div class="sub">CUDA graph capture &amp; replay &nbsp;·&nbsp; Batch forward &nbsp;·&nbsp; Weight loading</div>
-    </div>
-    <div class="arch-box orange" style="flex:1">
-      <div class="label">KV Cache (cache/)</div>
-      <div class="sub">Prefix cache &nbsp;·&nbsp; Host/disk backends &nbsp;·&nbsp; LoRA namespacing</div>
-    </div>
-    <div class="arch-box" style="flex:1">
-      <div class="label">Sampling (sampling/)</div>
-      <div class="sub">Logit processors &nbsp;·&nbsp; Top-k/p &nbsp;·&nbsp; Grammar</div>
-    </div>
-  </div>
-  <div class="arch-sep"></div>
-  <!-- Model layer -->
-  <div class="arch-row">
-    <div class="arch-box" style="flex:1">
-      <div class="label">Models (models/)</div>
-      <div class="sub">Qwen3 &nbsp;·&nbsp; DeepSeek V3/V4 &nbsp;·&nbsp; Llama &nbsp;·&nbsp; MiniMax &nbsp;·&nbsp; 10+ architectures</div>
-    </div>
-    <div class="arch-box" style="flex:1">
-      <div class="label">Layers (layers/)</div>
-      <div class="sub">Linear &nbsp;·&nbsp; Attention &nbsp;·&nbsp; MoE &nbsp;·&nbsp; LayerNorm &nbsp;·&nbsp; RoPE &nbsp;·&nbsp; Quantization</div>
-    </div>
-  </div>
-  <div class="arch-sep"></div>
-  <!-- Kernel layer -->
-  <div class="arch-row">
-    <div class="arch-box cyan" style="flex:1">
-      <div class="label">tokenspeed-kernel</div>
-      <div class="sub">Multi-backend auto-select &nbsp;·&nbsp; Attention/GEMM/MoE/Quant &nbsp;·&nbsp; Triton / CUDA / TRT-LLM / FlashInfer</div>
-    </div>
-    <div class="arch-box cyan" style="flex:0 0 180px">
-      <div class="label">tokenspeed-mla</div>
-      <div class="sub">MLA prefill/decode &nbsp;·&nbsp; FP8 &nbsp;·&nbsp; Blackwell</div>
-    </div>
-  </div>
-</div>
-
-<!-- REQUEST FLOW -->
-<h2 id="flow">Request Flow</h2>
-<div class="flow">
-  <div class="fbox a">POST /v1/chat/completions</div><span class="farrow">→</span>
-  <div class="fbox">serving_chat.py</div><span class="farrow">→</span>
-  <div class="fbox">InputProcessor<br><small>tokenize</small></div><span class="farrow">→</span>
-  <div class="fbox">AsyncLLM<br><small>enqueue</small></div>
-</div>
-<div class="flow" style="margin-top:4px">
-  <div class="fbox">C++ Scheduler<br><small>prefix match, plan</small></div><span class="farrow">→</span>
-  <div class="fbox">ModelExecutor<br><small>forward pass</small></div><span class="farrow">→</span>
-  <div class="fbox">Model layers<br><small>via kernels</small></div><span class="farrow">→</span>
-  <div class="fbox g">Sample + stream<br><small>OutputProcessor</small></div>
-</div>
-
-<!-- RUNTIME -->
-<h2 id="runtime">Python Runtime — <code>python/tokenspeed/runtime/</code></h2>
-
-<h3 id="engine">engine/</h3>
-<p>Async request lifecycle management — from HTTP intake to token streaming.</p>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>async_llm.py</code></td><td>Main async event loop; <code>AsyncLLM</code> class; routes requests, drives scheduler, streams results</td></tr>
-  <tr><td><code>event_loop.py</code></td><td>Subprocess event loop; owns C++ scheduler + model executor; drives the scheduling cycle</td></tr>
-  <tr><td><code>llm.py</code></td><td>Sync wrapper around <code>AsyncLLM</code> for blocking callers</td></tr>
-  <tr><td><code>request_handler.py</code></td><td>Dispatches incoming ZMQ messages (generate, abort, flush, LoRA load/unload…)</td></tr>
-  <tr><td><code>input_processor.py</code></td><td>Tokenises prompts; resolves request <code>lora_name</code> → <code>lora_id</code></td></tr>
-  <tr><td><code>output_processor.py</code></td><td>Detokenises generated tokens and streams to client</td></tr>
-  <tr><td><code>io_struct.py</code></td><td>All request/response dataclasses (<code>GenerateReqInput</code>, <code>LoadLoraReqInput</code>, …)</td></tr>
-  <tr><td><code>schedule_batch.py</code></td><td>Assembles per-forward-op batch metadata from the C++ scheduler plan</td></tr>
-  <tr><td><code>scheduler_utils.py</code></td><td><code>make_spec()</code>, <code>make_config()</code>; helpers bridging Python↔C++ scheduler</td></tr>
-  <tr><td><code>scheduler_control_client.py</code></td><td>ZMQ communicators for weight updates, flush, profile, LoRA operations</td></tr>
-  <tr><td><code>core_client.py</code></td><td>ZMQ client to the model-executor subprocess</td></tr>
-  <tr><td><code>generation_output_processor.py</code></td><td>Aggregates token outputs, handles streaming + stop conditions</td></tr>
-</table>
-
-<h3 id="execution">execution/</h3>
-<p>GPU forward-pass orchestration: CUDA graph capture, weight loading, batch preparation.</p>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>model_runner.py</code></td><td>Calls model <code>forward()</code> with the right context; handles prefill vs decode</td></tr>
-  <tr><td><code>model_executor.py</code></td><td>Wraps <code>model_runner</code>; builds <code>ForwardContext</code>; injects LoRA weight indices; manages stats</td></tr>
-  <tr><td><code>cuda_graph_wrapper.py</code></td><td>Captures and replays CUDA graphs; manages decode graph pool</td></tr>
-  <tr><td><code>context.py</code></td><td><code>ForwardContext</code> dataclass: attn backend, KV pool, LoRA info, batch metadata</td></tr>
-  <tr><td><code>forward_batch_info.py</code></td><td><code>ForwardMode</code> enum (EXTEND / DECODE / IDLE); batch shape metadata</td></tr>
-  <tr><td><code>input_buffer.py</code></td><td>Pre-allocated GPU tensors for batched inputs (token IDs, positions, lengths…)</td></tr>
-  <tr><td><code>weight_loader.py</code></td><td>Loads safetensors/pickle checkpoints; prefetches shards in background threads</td></tr>
-  <tr><td><code>cache_loc_kernel.py</code></td><td>Triton kernel that fills the block-table tensor from scheduler page IDs</td></tr>
-  <tr><td><code>factory.py</code></td><td><code>create_model_executor()</code>, <code>create_model_runner()</code>, <code>create_attn_components()</code></td></tr>
-  <tr><td><code>distributed_initializer.py</code></td><td>NCCL process-group init; TP/DP rank assignment</td></tr>
-  <tr><td><code>drafter/eagle.py</code></td><td>Eagle-3 speculative decoding draft model wrapper</td></tr>
-</table>
-
-<h3 id="models">models/</h3>
-<p>Architecture implementations — each model defines attention, MLP, and embedding layers with weight loading.</p>
-<table>
-  <tr><th>File</th><th>Architecture</th><th>Notes</th></tr>
-  <tr><td><code>qwen3.py</code></td><td>Qwen3-8B/72B</td><td>GQA + qk-norm; LoRA injection added</td></tr>
-  <tr><td><code>qwen3_5.py</code></td><td>Qwen3.5 MoE</td><td>Sparse MoE variant</td></tr>
-  <tr><td><code>deepseek_v3.py</code></td><td>DeepSeek V3</td><td>MLA + MoE; 2K lines</td></tr>
-  <tr><td><code>deepseek_v4.py</code></td><td>DeepSeek V4</td><td>MLA + LoRA rank projections (q, kv); 1700 lines</td></tr>
-  <tr><td><code>llama.py</code></td><td>Llama 2/3</td><td>Standard GQA + RoPE</td></tr>
-  <tr><td><code>llama_eagle3.py</code></td><td>Llama + Eagle3</td><td>Speculative decoding variant</td></tr>
-  <tr><td><code>minimax_m2.py</code></td><td>MiniMax M2</td><td>MLA architecture</td></tr>
-  <tr><td><code>longcat_flash.py</code></td><td>LongCat-Flash</td><td>Long-context variant</td></tr>
-  <tr><td><code>deepseek_nextn.py</code></td><td>DeepSeek NextN</td><td>Next-token prediction variant</td></tr>
-  <tr><td><code>registry.py</code></td><td>—</td><td>Maps HF config <code>model_type</code> to implementation class</td></tr>
-  <tr><td><code>base/causal_lm.py</code></td><td>—</td><td>Base class: logit processor, embedding tie, hidden state capture</td></tr>
-</table>
-
-<h3 id="layers">layers/</h3>
-<p>Reusable neural network building blocks, each routing through <code>tokenspeed-kernel</code> for the best available backend.</p>
-<table>
-  <tr><th>Path</th><th>Purpose</th></tr>
-  <tr><td><code>linear.py</code></td><td>Column/Row parallel linear with quantization (int8, fp8, gptq, awq…). Largest file.</td></tr>
-  <tr><td><code>attention/registry.py</code></td><td>Instantiates attention backend; allocates KV pool; exposes <code>create_attn_components()</code></td></tr>
-  <tr><td><code>attention/backends/</code></td><td>Backend adapters: FlashAttention, FlashInfer, FlashMLA, tokenspeed-MLA, TRT-LLM MLA</td></tr>
-  <tr><td><code>attention/kv_cache/</code></td><td>MHA / MLA KV pool implementations; paged memory management</td></tr>
-  <tr><td><code>attention/configs/</code></td><td>MLA config (kv_lora_rank, qk_rope_head_dim, nope_head_dim, v_head_dim)</td></tr>
-  <tr><td><code>layernorm.py</code></td><td>RMSNorm with optional fused allreduce; GemmaRMSNorm; PDL-gated kernels</td></tr>
-  <tr><td><code>rotary_embedding.py</code></td><td>RoPE variants (YaRN, LongRoPE, linear scaling, multi-LoRA batching)</td></tr>
-  <tr><td><code>paged_attention.py</code></td><td>Thin wrapper calling the selected attention backend per forward pass</td></tr>
-  <tr><td><code>moe/</code></td><td>Expert routing (top-k, noaux_tc), dispatch, AllGather, DeepEP integration</td></tr>
-  <tr><td><code>quantization/</code></td><td>Per-tensor, per-token-head, gptq, awq, fp8 schemes; dequant kernels</td></tr>
-  <tr><td><code>vocab_parallel_embedding.py</code></td><td>Sharded embedding tables; LoRA embedding placement</td></tr>
-  <tr><td><code>logits_processor.py</code></td><td>Top-k, top-p, repetition penalty, grammar masking applied to logits</td></tr>
-</table>
-
-<h3 id="cache">cache/</h3>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>prefix_cache.py</code></td><td>Python-side radix-tree prefix cache; <code>evictable_leaves</code> set; O(1) leaf delete</td></tr>
-  <tr><td><code>allocator.py</code></td><td>Page-granularity KV allocator; tracks <code>req_to_page</code>, free/used pages</td></tr>
-  <tr><td><code>kv_cache_host.py</code></td><td>CPU-pinned host KV staging (L2 cache); host↔device transfer helpers</td></tr>
-  <tr><td><code>evict_policy.py</code></td><td>LRU, LFU, FIFO, MRU, FILO, Priority eviction strategies</td></tr>
-  <tr><td><code>kvstore_controller.py</code></td><td>Coordinates device↔host↔storage eviction and prefetch</td></tr>
-  <tr><td><code>executor/memory_executor.py</code></td><td>Top-level cache executor: wires device + host + storage tiers</td></tr>
-  <tr><td><code>executor/host_executor.py</code></td><td>Async host↔device transfer with priority streams</td></tr>
-  <tr><td><code>storage/</code></td><td>Pluggable L3 storage (Mooncake, disk); <code>BackendFactory</code></td></tr>
-</table>
-
-<h3 id="entrypoints">entrypoints/</h3>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>engine.py</code></td><td><code>Engine</code> class: in-process facade; <code>generate()</code>, <code>load_lora_adapter()</code>, weight updates</td></tr>
-  <tr><td><code>engine_base.py</code></td><td>Abstract base: <code>generate()</code>, <code>flush_cache()</code>, <code>load_lora_adapter()</code>…</td></tr>
-  <tr><td><code>http_server.py</code></td><td>FastAPI app; mounts OpenAI routes; middleware (auth, metrics)</td></tr>
-  <tr><td><code>openai/protocol.py</code></td><td>Pydantic models for <code>CompletionRequest</code> and <code>ChatCompletionRequest</code></td></tr>
-  <tr><td><code>openai/serving_chat.py</code></td><td>Chat completion handler: applies chat template, calls <code>GenerateReqInput</code></td></tr>
-  <tr><td><code>openai/serving_completions.py</code></td><td>Completion handler: prompt encoding, logprob extraction</td></tr>
-  <tr><td><code>engine/run_event_loop.py</code></td><td>Subprocess entry point for the scheduler worker process</td></tr>
-</table>
-
-<!-- KERNEL -->
-<h2 id="kernel">tokenspeed-kernel — <code>tokenspeed-kernel/python/tokenspeed_kernel/</code></h2>
-<p>Pip-installable kernel library. Operators are registered with capability metadata; <code>select_kernel()</code> picks the best available backend at runtime.</p>
-
-<h3>Core Infrastructure</h3>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>__init__.py</code></td><td>Public API: <code>mha_prefill</code>, <code>mha_decode</code>, <code>mm</code>, <code>moe_fused</code>, <code>rmsnorm</code>, …</td></tr>
-  <tr><td><code>registry.py</code></td><td><code>@register_kernel</code> decorator; stores backends in a capability-indexed registry</td></tr>
-  <tr><td><code>selection.py</code></td><td><code>select_kernel(family, …)</code>: filter by capability/dtype/shape, rank by priority band</td></tr>
-  <tr><td><code>platform.py</code></td><td>Detects GPU arch (SM80/SM90/…), CUDA version, vendor</td></tr>
-  <tr><td><code>_triton.py</code></td><td>Single import for all Triton/Triton-fork usage (avoids duplicate loads)</td></tr>
-</table>
-
-<h3 id="kernel-selection">Kernel Selection Priority</h3>
-<div class="flow">
-  <div class="fbox a">select_kernel(family, dtype, shapes)</div><span class="farrow">→</span>
-  <div class="fbox">Filter by GPU capability + dtype support</div><span class="farrow">→</span>
-  <div class="fbox o">Rank by priority band</div>
-</div>
-<pre style="margin-top:8px">Priority bands (highest → lowest):
-  1.  Platform-matched  (flash_mla for Blackwell MLA decode)
-  2.  JIT-compiled      (CuTe DSL, Gluon)
-  3.  Triton            (portable, auto-tuned)
-  4.  Vendor libraries  (FlashAttention, FlashInfer, TRT-LLM)
-  5.  Reference         (PyTorch — correctness baseline)</pre>
-
-<h3>Operation Families (<code>ops/</code>)</h3>
-<table>
-  <tr><th>Family</th><th>Backends</th><th>Usage</th></tr>
-  <tr><td><code>attention/</code></td><td>triton, flash_attn, flashinfer, flash_mla, tokenspeed_mla</td><td>MHA + MLA prefill/decode</td></tr>
-  <tr><td><code>gemm/</code></td><td>triton, trtllm, flashinfer, deep_gemm</td><td>Weight matmuls, quantized GEMM</td></tr>
-  <tr><td><code>moe/</code></td><td>triton, cuda, deepep, flashinfer, trtllm</td><td>Expert dispatch, fused gate+up+down</td></tr>
-  <tr><td><code>layernorm/</code></td><td>triton, cuda, flashinfer</td><td>RMSNorm, fused add+norm</td></tr>
-  <tr><td><code>quantization/</code></td><td>triton, cuda, flashinfer, trtllm</td><td>Per-tensor/per-token quant/dequant</td></tr>
-  <tr><td><code>communication/</code></td><td>nccl, iris, triton, trtllm, flashinfer</td><td>AllReduce, ReduceScatter, AllGather</td></tr>
-  <tr><td><code>sampling/</code></td><td>cuda, flashinfer</td><td>Top-k, top-p sampling</td></tr>
-  <tr><td><code>activation/</code></td><td>cuda, flashinfer</td><td>SiGLU, GELU, SwiGLU</td></tr>
-  <tr><td><code>embedding/</code></td><td>triton, cuda, flashinfer</td><td>Token embedding lookup</td></tr>
-</table>
-
-<!-- MLA -->
-<h2 id="mla">tokenspeed-mla — <code>tokenspeed-mla/python/tokenspeed_mla/</code></h2>
-<p>Blackwell-optimised MLA kernels using NVIDIA CuTe DSL with JIT compilation and optional AOT binary backend.</p>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>mla_prefill.py</code></td><td>Varlen ragged prefill; CuTe DSL JIT with compile-cache; causal mask; PDL support</td></tr>
-  <tr><td><code>mla_decode_fp16.py</code></td><td>Split-KV decode with FP16 accumulation; auto-sized workspace</td></tr>
-  <tr><td><code>mla_decode_fp8.py</code></td><td>FP8-quantized decode → BF16 output for numerical stability</td></tr>
-  <tr><td><code>mla_kv_pack_quantize_fp8.py</code></td><td>Fused KV packing + FP8 quantisation kernel</td></tr>
-  <tr><td><code>fmha.py</code></td><td>FMHA wrapper; dispatches to AOT binary or CuTe JIT path</td></tr>
-  <tr><td><code>mla_helpers.py</code></td><td>MLA math helpers: head-dim splitting, nope/rope decomposition</td></tr>
-</table>
-
-<!-- SCHEDULER -->
-<h2 id="scheduler">tokenspeed-scheduler — <code>tokenspeed-scheduler/csrc/</code></h2>
-<p>C++20 scheduler. The Python runtime calls it via nanobind bindings. All request state transitions happen here.</p>
-
-<h3>scheduler/</h3>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>scheduler.h/.cpp</code></td><td>Main <code>Scheduler</code> class: <code>SubmitRequests()</code>, <code>NextExecutionPlan()</code>, <code>Advance(event)</code></td></tr>
-  <tr><td><code>request.h/.cpp</code></td><td><code>Request</code>: holds token container, FSM state, KV refs, LoRA ID</td></tr>
-  <tr><td><code>request_spec.h</code></td><td>Input spec: <code>request_id</code>, <code>tokens</code>, <code>rolling_hashes</code>, <code>lora_id</code></td></tr>
-  <tr><td><code>execution_plan.h</code></td><td><code>FlatForwardOperation</code>: request IDs, input lengths, prefix lens, page IDs</td></tr>
-  <tr><td><code>operations/forward.cpp</code></td><td><code>schedulePrefillFirstChunk()</code>, <code>scheduleDecode()</code>; passes <code>lora_id</code> to all Match/Insert calls</td></tr>
-  <tr><td><code>operations/cache.cpp</code></td><td>KV write-back, load-back, prefetch operations</td></tr>
-  <tr><td><code>outside_event_handler.cpp</code></td><td>Handles <code>FinishEvent</code>, PD events from outside the main scheduling loop</td></tr>
-</table>
-
-<h3 id="fsm">fsm/ — Finite State Machine</h3>
-<p>Each request transitions through states; events drive transitions and trigger cache/allocation side-effects.</p>
-<pre>Submitted → Prefilling → PrefillDone → Decoding → Draining → Finished
-                                     ↘ Retracting → Retracted
-                         (optional)   Prefetching → PrefetchDone
-                                      WritingBack
-                                      Aborting</pre>
-<table>
-  <tr><th>File</th><th>Purpose</th></tr>
-  <tr><td><code>forward_states.h</code></td><td>State data structs: prefill window, KV allocator, decode token count</td></tr>
-  <tr><td><code>forward_events.h/.cpp</code></td><td><code>SchedulePrefillFirstChunkEvent</code>, <code>FinishEvent</code>, <code>ScheduleDecodeEvent</code>; inject <code>lora_id</code></td></tr>
-  <tr><td><code>cache_states.h</code></td><td>Prefetch / write-back states</td></tr>
-  <tr><td><code>cache_events.h/.cpp</code></td><td>L2 write-back, load-back, L3 backup events</td></tr>
-  <tr><td><code>pd_states.h / pd_events.h/.cpp</code></td><td>Prefill-decode disaggregation states and transfer events</td></tr>
-</table>
-
-<h3 id="resources">resource/ — KV Cache &amp; Memory</h3>
-<table>
-  <tr><th>Path</th><th>Purpose</th></tr>
-  <tr><td><code>kv_prefix_cache/kv_prefix_cache.h/.cpp</code></td><td>Radix-tree prefix cache; <code>Match(tokens, lora_id)</code>; <code>Insert(tokens, lora_id)</code>; LoRA virtual roots</td></tr>
-  <tr><td><code>kv_prefix_cache/eviction.h</code></td><td><code>ResourceManager&lt;RType&gt;::Evict()</code>; persistent <code>lru_leaves_</code> set; O(k log N)</td></tr>
-  <tr><td><code>radix_tree/radix_tree.h/.cpp</code></td><td>Compressed trie; <code>WalkDownUtilMismatch()</code>; <code>splitChild()</code>; <code>PruneEmptyByNode()</code></td></tr>
-  <tr><td><code>radix_tree/tree_resource.h</code></td><td><code>NodeResource&lt;RType&gt;</code>: pages, ref_count, on_evictable callback (exact LRU)</td></tr>
-  <tr><td><code>radix_tree/tree_node.h/.cpp</code></td><td>Tree node: tokens, depth, children map, device/host resource pointers, Touch()</td></tr>
-  <tr><td><code>hybrid_prefix_cache/hybrid_prefix_cache.h/.cpp</code></td><td>Wraps KV cache + Mamba state cache; <code>Match(tokens, lora_id)</code></td></tr>
-  <tr><td><code>allocator/page_allocator.h/.cpp</code></td><td>Fixed-pool page allocator; free-list; <code>Allocate(n)</code> / <code>Free(pages)</code></td></tr>
-  <tr><td><code>allocator/kv_allocator.h/.cpp</code></td><td>Paged KV allocator; tracks req→page mapping</td></tr>
-  <tr><td><code>allocator/mamba_chunk_allocator.h/.cpp</code></td><td>Fixed-slot Mamba state allocator</td></tr>
-</table>
-
-<!-- LORA INTEGRATION -->
-<h2 id="lora">LoRA Integration</h2>
-<p>Added in <code>feat/lora-adapter-serving</code>. Touches all four packages.</p>
-<table>
-  <tr><th>Package</th><th>What was added</th></tr>
-  <tr><td>python/lora/</td><td><code>LoraConfig</code>, <code>LoraRegistry</code>, <code>LoraManager</code> (GPU pool + LRU eviction + TP-aware matmul)</td></tr>
-  <tr><td>python/models/qwen3.py</td><td><code>apply_qkv_lora()</code> after qkv_proj; <code>apply_o_lora()</code> after o_proj; pure-PyTorch <code>_rms_norm</code> for eager mode</td></tr>
-  <tr><td>python/execution/context.py</td><td><code>lora_weight_indices</code>, <code>lora_scalings</code>, <code>lora_manager</code> fields on <code>ForwardContext</code></td></tr>
-  <tr><td>python/execution/model_executor.py</td><td>Per-token weight_indices expansion via <code>repeat_interleave(w_idx, input_lengths)</code></td></tr>
-  <tr><td>python/entrypoints/openai/protocol.py</td><td>Request schemas; LoRA selection uses loaded adapter names where exposed.</td></tr>
-  <tr><td>tokenspeed-scheduler/csrc/</td><td><code>RequestSpec.lora_id</code>; <code>KVPrefixCache::Match(tokens, lora_id)</code>; virtual root per adapter; <code>namespace_depth_offset</code></td></tr>
-</table>
-
-<!-- TESTS -->
-<h2 id="tests">Tests</h2>
-<div class="note g">
-  <strong>120 C++ scheduler tests &nbsp;·&nbsp; 48 Python scheduler tests &nbsp;·&nbsp; 40+ runtime integration tests</strong>
-</div>
-<table>
-  <tr><th>Location</th><th>Coverage</th></tr>
-  <tr><td><code>tokenspeed-scheduler/tests/cpp/</code></td><td>Scheduling FSM, page lifecycle, eviction, prefix cache, Mamba, PD disagg, LoRA isolation</td></tr>
-  <tr><td><code>tokenspeed-scheduler/python/tests/</code></td><td>Python scheduler API, FSM transitions, prefill/decode batching, occupied pages, PD events</td></tr>
-  <tr><td><code>test/runtime/cache/</code></td><td>MLA KV buffer, prefix cache invariants (evictable_leaves, cascade eviction)</td></tr>
-  <tr><td><code>test/runtime/lora/</code></td><td>LoraRegistry capacity, pinning, scaling; dynamic load/unload end-to-end</td></tr>
-  <tr><td><code>test/runtime/models/</code></td><td>DeepSeek V4, Kimi, multimodal model parity</td></tr>
-  <tr><td><code>tokenspeed-kernel/test/</code></td><td>Kernel numerics: attention, GEMM, quantization tolerance verification</td></tr>
-  <tr><td><code>benchmark/</code></td><td>C++ eviction timing, LoRA batch isolation proof, decode-path cache microbenchmark</td></tr>
-</table>
-
-<!-- FILE TREE -->
-<h2 id="filetree">Full Directory Tree</h2>
-<details>
-  <summary>Show complete tree</summary>
-  <div class="inner">
-<div class="tree"><pre>
-tokenspeed/
-├── <span class="dir">python/</span>
-│   ├── pyproject.toml
-│   └── <span class="dir">tokenspeed/</span>
-│       ├── cli.py                       <span class="comment"># tokenspeed serve / bench / env</span>
-│       ├── bench.py                     <span class="comment"># Online serving benchmark</span>
-│       └── <span class="dir">runtime/</span>
-│           ├── <span class="dir hl">engine/</span>              <span class="comment"># Async LLM, request lifecycle</span>
-│           │   ├── async_llm.py
-│           │   ├── event_loop.py
-│           │   ├── io_struct.py
-│           │   ├── request_handler.py
-│           │   ├── input_processor.py
-│           │   ├── output_processor.py
-│           │   ├── schedule_batch.py
-│           │   ├── scheduler_utils.py
-│           │   ├── scheduler_control_client.py
-│           │   └── core_client.py
-│           ├── <span class="dir hl">execution/</span>           <span class="comment"># GPU forward pass</span>
-│           │   ├── model_runner.py
-│           │   ├── model_executor.py
-│           │   ├── cuda_graph_wrapper.py
-│           │   ├── context.py
-│           │   ├── forward_batch_info.py
-│           │   ├── input_buffer.py
-│           │   ├── weight_loader.py
-│           │   ├── factory.py
-│           │   └── drafter/eagle.py
-│           ├── <span class="dir hl">models/</span>              <span class="comment"># Architecture implementations</span>
-│           │   ├── registry.py
-│           │   ├── qwen3.py
-│           │   ├── qwen3_5.py
-│           │   ├── deepseek_v3.py
-│           │   ├── deepseek_v4.py
-│           │   ├── llama.py
-│           │   ├── minimax_m2.py
-│           │   └── base/causal_lm.py
-│           ├── <span class="dir hl">layers/</span>              <span class="comment"># Reusable neural net layers</span>
-│           │   ├── linear.py
-│           │   ├── layernorm.py
-│           │   ├── rotary_embedding.py
-│           │   ├── paged_attention.py
-│           │   ├── logits_processor.py
-│           │   ├── vocab_parallel_embedding.py
-│           │   ├── <span class="dir">attention/</span>       <span class="comment"># Backends: FlashAttn, FlashInfer, MLA, TRT-LLM</span>
-│           │   ├── <span class="dir">moe/</span>             <span class="comment"># Expert routing, dispatch</span>
-│           │   └── <span class="dir">quantization/</span>    <span class="comment"># int8, fp8, gptq, awq</span>
-│           ├── <span class="dir hl">cache/</span>               <span class="comment"># KV cache management</span>
-│           │   ├── prefix_cache.py
-│           │   ├── allocator.py
-│           │   ├── kv_cache_host.py
-│           │   ├── evict_policy.py
-│           │   ├── <span class="dir">executor/</span>        <span class="comment"># memory, host, storage executors</span>
-│           │   └── <span class="dir">storage/</span>         <span class="comment"># mooncake_store, disk backend</span>
-│           ├── <span class="dir hl">lora/</span>                <span class="comment"># LoRA adapter serving (new)</span>
-│           │   ├── lora_config.py
-│           │   ├── lora_registry.py
-│           │   └── lora_manager.py
-│           ├── <span class="dir hl">entrypoints/</span>         <span class="comment"># HTTP server + Engine API</span>
-│           │   ├── engine.py
-│           │   ├── engine_base.py
-│           │   ├── http_server.py
-│           │   └── <span class="dir">openai/</span>          <span class="comment"># Protocol, serving_chat, serving_completions</span>
-│           ├── <span class="dir">configs/</span>             <span class="comment"># Model + device configs</span>
-│           ├── <span class="dir">distributed/</span>         <span class="comment"># TP/DP mapping, comm ops</span>
-│           ├── <span class="dir">sampling/</span>            <span class="comment"># Sampling backends</span>
-│           ├── <span class="dir">grammar/</span>             <span class="comment"># Structured generation</span>
-│           ├── <span class="dir">pd/</span>                  <span class="comment"># Prefill-decode disagg</span>
-│           ├── <span class="dir">model_loader/</span>        <span class="comment"># Weight loading</span>
-│           ├── <span class="dir">metrics/</span>             <span class="comment"># Observability</span>
-│           └── <span class="dir">utils/</span>               <span class="comment"># Logging, env, common helpers</span>
-│
-├── <span class="dir">tokenspeed-kernel/</span>
-│   └── <span class="dir">python/tokenspeed_kernel/</span>
-│       ├── __init__.py                  <span class="comment"># Public API</span>
-│       ├── registry.py                  <span class="comment"># @register_kernel</span>
-│       ├── selection.py                 <span class="comment"># select_kernel()</span>
-│       ├── platform.py
-│       ├── <span class="dir hl">ops/</span>                     <span class="comment"># Backend implementations</span>
-│       │   ├── <span class="dir">attention/</span>
-│       │   ├── <span class="dir">gemm/</span>
-│       │   ├── <span class="dir">moe/</span>
-│       │   ├── <span class="dir">layernorm/</span>
-│       │   ├── <span class="dir">quantization/</span>
-│       │   ├── <span class="dir">communication/</span>
-│       │   └── <span class="dir">sampling/</span>
-│       ├── <span class="dir">thirdparty/</span>              <span class="comment"># Vendored CUDA/Triton kernels</span>
-│       └── <span class="dir">numerics/</span>               <span class="comment"># Kernel correctness verification</span>
-│
-├── <span class="dir">tokenspeed-mla/</span>
-│   └── <span class="dir">python/tokenspeed_mla/</span>
-│       ├── mla_prefill.py               <span class="comment"># CuTe DSL JIT prefill</span>
-│       ├── mla_decode_fp16.py
-│       ├── mla_decode_fp8.py
-│       ├── mla_kv_pack_quantize_fp8.py
-│       └── fmha.py
-│
-├── <span class="dir">tokenspeed-scheduler/</span>
-│   ├── <span class="dir">csrc/</span>
-│   │   ├── <span class="dir hl">scheduler/</span>               <span class="comment"># Scheduler core + FSM</span>
-│   │   │   ├── scheduler.h/.cpp
-│   │   │   ├── request.h/.cpp
-│   │   │   ├── request_spec.h
-│   │   │   └── <span class="dir">operations/</span>
-│   │   ├── <span class="dir hl">fsm/</span>                     <span class="comment"># State machine events/states</span>
-│   │   │   ├── forward_states.h
-│   │   │   ├── forward_events.h/.cpp
-│   │   │   ├── cache_events.h/.cpp
-│   │   │   └── pd_events.h/.cpp
-│   │   ├── <span class="dir hl">resource/</span>               <span class="comment"># KV cache + allocators</span>
-│   │   │   ├── <span class="dir">kv_prefix_cache/</span>     <span class="comment"># Radix tree + LoRA namespacing</span>
-│   │   │   ├── <span class="dir">radix_tree/</span>          <span class="comment"># Compressed prefix tree</span>
-│   │   │   ├── <span class="dir">allocator/</span>           <span class="comment"># Page allocators</span>
-│   │   │   └── <span class="dir">hybrid_prefix_cache/</span> <span class="comment"># L1+L2+Mamba</span>
-│   │   └── <span class="dir">core/</span>                    <span class="comment"># TokenContainer</span>
-│   ├── <span class="dir">bindings/</span>
-│   │   └── python_module.cpp            <span class="comment"># nanobind Python bindings</span>
-│   └── <span class="dir">tests/cpp/</span>                   <span class="comment"># GTest unit tests</span>
-│
-├── <span class="dir">benchmark/</span>
-│   ├── bench_cpp_eviction.py
-│   ├── bench_eviction_ts.py
-│   ├── bench_decode_cache.py
-│   ├── test_lora_dynamic.py
-│   └── test_lora_batch.py
-│
-├── <span class="dir">test/</span>
-│   ├── runners.py
-│   ├── <span class="dir">runtime/</span>                     <span class="comment"># Integration tests</span>
-│   │   ├── <span class="dir">cache/</span>
-│   │   ├── <span class="dir">lora/</span>
-│   │   └── <span class="dir">models/</span>
-│   └── <span class="dir">ci_system/</span>
-│
-└── <span class="dir">docs/</span>
-    ├── lora_implementation.html
-    └── tokenspeed_structure.html        <span class="comment"># ← this file</span>
-</pre></div>
-  </div>
-</details>
-
-</main>
-</div>
-</body>
-</html>

From dc7a35be1477b032a2bd3af07688681e81fdac9b Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:23:49 +0000
Subject: [PATCH 05/19] chore: move LoRA test files to qywu/lora-dev branch

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 test/runners.py                               | 749 ------------------
 test/runtime/lora/__init__.py                 |   0
 test/runtime/lora/test_adapter_io.py          |  87 --
 test/runtime/lora/test_lora_manager.py        | 488 ------------
 test/runtime/lora/test_lora_registry.py       | 102 ---
 test/runtime/lora/test_lora_request_naming.py |  72 --
 test/runtime/lora/test_moe_lora.py            | 339 --------
 ...st_qwen3_lm_head_lora_password_adapters.py | 203 -----
 .../test_qwen3_lora_password_adapters.py      | 226 ------
 .../test_qwen3_moe_lora_password_adapters.py  | 212 -----
 ...3_moe_per_expert_lora_password_adapters.py | 199 -----
 11 files changed, 2677 deletions(-)
 delete mode 100644 test/runners.py
 delete mode 100644 test/runtime/lora/__init__.py
 delete mode 100644 test/runtime/lora/test_adapter_io.py
 delete mode 100644 test/runtime/lora/test_lora_manager.py
 delete mode 100644 test/runtime/lora/test_lora_registry.py
 delete mode 100644 test/runtime/lora/test_lora_request_naming.py
 delete mode 100644 test/runtime/lora/test_moe_lora.py
 delete mode 100644 test/runtime/test_qwen3_lm_head_lora_password_adapters.py
 delete mode 100644 test/runtime/test_qwen3_lora_password_adapters.py
 delete mode 100644 test/runtime/test_qwen3_moe_lora_password_adapters.py
 delete mode 100644 test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py

diff --git a/test/runners.py b/test/runners.py
deleted file mode 100644
index 1838997ec..000000000
--- a/test/runners.py
+++ /dev/null
@@ -1,749 +0,0 @@
-# Adapted from meituan-longcat/SGLang-FluentLLM.
-# This file has been modified for this repository.
-# This file may incorporate material from ModelTC/lightllm,
-# vllm-project/vllm, and sgl-project/sglang, as identified in
-# python/THIRDPARTYNOTICES.
-
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import json
-import multiprocessing as mp
-import os
-import queue
-from dataclasses import dataclass
-from test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
-from typing import Any, List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-import transformers
-from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig
-
-from tokenspeed.runtime.entrypoints.engine import Engine
-from tokenspeed.runtime.utils import get_device
-from tokenspeed.runtime.utils.hf_transformers_utils import get_tokenizer
-
-DEFAULT_PROMPTS = [
-    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
-    "The capital of the United Kingdom is",
-    "Today is a sunny day and I like",
-    "AI is a field of computer science focused on",
-    # the output of gemma-2-2b from SRT is unstable on the commented prompt
-    # "The capital of France is",
-]
-dirpath = os.path.dirname(__file__)
-with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
-    long_prompt = f.read()
-DEFAULT_PROMPTS.append(long_prompt)
-
-NUM_TOP_LOGPROBS = 5
-
-
-def get_dtype_str(torch_dtype):
-    if torch_dtype is torch.float16:
-        return "float16"
-    if torch_dtype is torch.float32:
-        return "float32"
-    if torch_dtype is torch.bfloat16:
-        return "bfloat16"
-    else:
-        raise NotImplementedError()
-
-
-def get_top_logprobs(logits, k):
-    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
-    del logits
-    return torch.topk(logprobs, k=k, dim=-1).values
-
-
-def get_token_ids_logprobs(logits, token_ids):
-    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
-    del logits
-    logprobs = logprobs[..., token_ids]
-    return logprobs
-
-
-@dataclass
-class ModelOutput:
-    output_strs: List[str] = None
-    output_ids: List[int] = None
-    top_input_logprobs: List[torch.Tensor] = None
-    top_output_logprobs: List[torch.Tensor] = None
-    top_output_logprob_idx: List[List[int]] = None
-    embed_logits: List[torch.Tensor] = None
-    scores: List[float] = None
-    input_token_logprobs_lst: List[List[Tuple[float, int, None]]] = None
-    output_token_logprobs_lst: List[List[Tuple[float, int, None]]] = None
-    token_ids_input_logprobs: List[torch.Tensor] = None
-    token_ids_output_logprobs: List[torch.Tensor] = None
-
-
-class HFRunner:
-    def __init__(
-        self,
-        model_path: str,
-        torch_dtype: torch.dtype,
-        model_type: str = "generation",
-        output_str_only: bool = False,
-        trust_remote_code: bool = False,
-        patch_model_do_sample_false: bool = False,
-        matryoshka_dim: Optional[int] = None,
-        tp_size: int = 1,
-        max_model_len: Optional[int] = None,
-    ):
-        self.model_type = model_type
-        self.output_str_only = output_str_only
-        self.trust_remote_code = trust_remote_code
-        self.patch_model_do_sample_false = patch_model_do_sample_false
-        self.tp_size = tp_size
-        self.max_model_len = max_model_len
-
-        self.in_queue = mp.Queue()
-        self.out_queue = mp.Queue()
-
-        self.model_proc = mp.Process(
-            target=self.start_model_process,
-            args=(
-                self.in_queue,
-                self.out_queue,
-                model_path,
-                torch_dtype,
-                matryoshka_dim,
-                tp_size,
-                max_model_len,
-            ),
-        )
-        self.model_proc.start()
-
-    def start_model_process(
-        self,
-        in_queue,
-        out_queue,
-        model_path,
-        torch_dtype,
-        matryoshka_dim: Optional[int] = None,
-        tp_size: int = 1,
-        max_model_len: Optional[int] = None,
-    ):
-        # Apply model-specific patches
-        monkey_patch_gemma2_sdpa()
-
-        # Disable async tensor loading to avoid CUDA illegal memory access in spawned subprocess.
-        # Transformers uses a ThreadPoolExecutor to load weights in parallel, which is not safe
-        # when CUDA is used from multiple threads in a subprocess started with "spawn".
-        os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
-
-        # Load the model and tokenizer
-        if self.model_type == "generation":
-            config = AutoConfig.from_pretrained(
-                model_path, trust_remote_code=self.trust_remote_code
-            )
-            if self.trust_remote_code:
-                model_cls = AutoModelForCausalLM
-            else:
-                model_arch = getattr(config, "architectures")[0]
-                model_cls = getattr(transformers, model_arch)
-
-            # HFRunner is for reference outputs only, so load onto a single GPU.
-            # Using device_map="auto" with multi-GPU in a spawned subprocess causes
-            # cudaErrorIllegalAddress on B200 (CUDA 13.0) when tensors are materialized
-            # on non-primary devices during MXFP4 dequantization.
-            if tp_size > 1:
-                self.base_model = model_cls.from_pretrained(
-                    model_path,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=self.trust_remote_code,
-                    low_cpu_mem_usage=True,
-                    device_map="cuda:0",
-                )
-            else:
-                self.base_model = model_cls.from_pretrained(
-                    model_path,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=self.trust_remote_code,
-                    low_cpu_mem_usage=True,
-                ).to(get_device())
-        else:
-            raise Exception(f"Unrecognized model type {self.model_type}")
-
-        self.max_model_len = max_model_len
-        self.tokenizer = get_tokenizer(
-            model_path,
-            torch_dtype=torch.dtype,
-            trust_remote_code=self.trust_remote_code,
-            model_max_length=self.max_model_len,
-        )
-
-        # Run forward
-        while True:
-            prompts, image_data, max_new_tokens, adapter_paths, token_ids_logprob = (
-                in_queue.get()
-            )
-            if adapter_paths is not None:
-                assert len(prompts) == len(adapter_paths)
-
-            if prompts is not None:
-                if self.model_type == "generation":
-                    out_queue.put(
-                        self.forward_generation_raw(
-                            base_model=self.base_model,
-                            prompts=prompts,
-                            max_new_tokens=max_new_tokens,
-                            tokenizer=self.tokenizer,
-                            adapter_paths=adapter_paths,
-                            torch_dtype=torch_dtype,
-                            output_str_only=self.output_str_only,
-                            token_ids_logprob=token_ids_logprob,
-                            patch_model_do_sample_false=self.patch_model_do_sample_false,
-                            max_model_len=self.max_model_len,
-                        )
-                    )
-                else:
-                    raise Exception(f"Unrecognized model type {self.model_type}")
-
-    def forward(
-        self,
-        prompts: Union[
-            List[List[str]], List[str], List[torch.Tensor]
-        ] = DEFAULT_PROMPTS,
-        image_data: Optional[List[str]] = None,
-        max_new_tokens: int = 8,
-        adapter_paths: Optional[List[str]] = None,
-        token_ids_logprob: Optional[int] = None,
-    ):
-        self.in_queue.put(
-            (prompts, image_data, max_new_tokens, adapter_paths, token_ids_logprob)
-        )
-        while True:
-            try:
-                return self.out_queue.get(timeout=10)
-            except queue.Empty:
-                if not self.model_proc.is_alive():
-                    raise RuntimeError(
-                        f"HFRunner subprocess died with exit code "
-                        f"{self.model_proc.exitcode} (likely OOM). "
-                        f"Check GPU memory availability."
-                    )
-
-    def terminate(self):
-        self.model_proc.terminate()
-        self.model_proc.join(timeout=10)
-        if self.model_proc.is_alive():
-            self.model_proc.kill()
-            self.model_proc.join(timeout=5)
-        self.in_queue = self.out_queue = None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.terminate()
-
-    @staticmethod
-    def forward_generation_raw(
-        base_model,
-        prompts: Union[List[str], List[torch.Tensor]],
-        max_new_tokens: int,
-        tokenizer,
-        torch_dtype: torch.dtype,
-        adapter_paths: Optional[List[str]] = None,
-        output_str_only: bool = False,
-        token_ids_logprob: Optional[int] = None,
-        patch_model_do_sample_false: Optional[bool] = False,
-        max_model_len: Optional[int] = None,
-    ) -> ModelOutput:
-        output_strs = []
-        top_input_logprobs = []
-        top_output_logprobs = []
-        if token_ids_logprob is not None:
-            token_ids_input_logprobs = []
-            token_ids_output_logprobs = []
-        else:
-            token_ids_input_logprobs = token_ids_output_logprobs = None
-
-        for i, p in enumerate(prompts):
-            if isinstance(p, str):
-                # Apply max_model_len truncation if specified
-                if max_model_len is not None:
-                    input_ids = tokenizer.encode(
-                        p,
-                        return_tensors="pt",
-                        truncation=True,
-                        max_length=max_model_len,
-                    ).to(get_device())
-                else:
-                    input_ids = tokenizer.encode(p, return_tensors="pt").to(
-                        get_device()
-                    )
-            else:
-                input_ids = torch.tensor([p], device=get_device())
-                # Apply max_model_len truncation for tensor input
-                if max_model_len is not None and input_ids.shape[1] > max_model_len:
-                    input_ids = input_ids[:, :max_model_len]
-
-            if adapter_paths is not None and adapter_paths[i] is not None:
-                from peft import PeftModel
-
-                model = PeftModel.from_pretrained(
-                    base_model,
-                    adapter_paths[i],
-                    torch_dtype=torch_dtype,
-                    is_trainable=False,
-                )
-            else:
-                model = base_model
-
-            if patch_model_do_sample_false:
-                model.generation_config.do_sample = False
-            outputs = model.generate(
-                input_ids=input_ids,
-                generation_config=GenerationConfig(
-                    do_sample=False,
-                    temperature=None,
-                    top_p=None,
-                    max_new_tokens=max_new_tokens,
-                    return_dict_in_generate=True,
-                    output_scores=(not output_str_only),
-                    # make sure to disable compile
-                    disable_compile=True,
-                ),
-            )
-
-            text = tokenizer.decode(
-                outputs[0][0][len(input_ids[0]) :], skip_special_tokens=True
-            )
-
-            # Check if the text is empty or only whitespace.
-            if not text.strip():
-                raise ValueError(
-                    "Received an empty text response. Please verify your input or model configuration."
-                )
-            output_strs.append(text)
-
-            if not output_str_only:
-                # outputs.scores: (num_token, 1, vocab_size)
-                top_output_logprobs.append(
-                    [
-                        get_top_logprobs(logits[0], NUM_TOP_LOGPROBS).tolist()
-                        for logits in outputs.scores
-                    ]
-                )
-                if token_ids_logprob is not None:
-                    token_ids_output_logprobs.append(
-                        [
-                            get_token_ids_logprobs(
-                                logits[0], token_ids_logprob
-                            ).tolist()
-                            for logits in outputs.scores
-                        ]
-                    )
-                del outputs
-
-                input_logits = model.forward(input_ids).logits[0]
-                top_input_logprobs.append(
-                    get_top_logprobs(input_logits, NUM_TOP_LOGPROBS).tolist()
-                )
-                if token_ids_logprob is not None:
-                    token_ids_input_logprobs.append(
-                        get_token_ids_logprobs(input_logits, token_ids_logprob).tolist()
-                    )
-                del input_logits
-
-            if adapter_paths is not None and adapter_paths[i] is not None:
-                # Unload the LoRA adapter if it is used
-                model.unload()
-
-        return ModelOutput(
-            output_strs=output_strs,
-            top_input_logprobs=top_input_logprobs,
-            top_output_logprobs=top_output_logprobs,
-            token_ids_input_logprobs=token_ids_input_logprobs,
-            token_ids_output_logprobs=token_ids_output_logprobs,
-        )
-
-
-class RTRunner:
-    _port_counter = 0  # Class-level port counter
-
-    def __init__(
-        self,
-        model_path: str,
-        torch_dtype: torch.dtype,
-        model_type: str,
-        world_size: int = 1,
-        ep_size: int = 1,
-        port: int = None,  # None means auto-increment
-        attention_backend: Optional[str] = None,
-        enforce_eager: bool = False,
-        enable_prefix_caching: bool = True,
-        chunked_prefill_size: Optional[int] = None,
-        max_model_len: Optional[int] = None,
-        max_total_tokens: Optional[int] = None,
-        block_size: Optional[int] = 64,
-        data_parallel_size: int = 1,
-        tokenizer: Optional[str] = None,
-        gpu_memory_utilization: float = 0.65,
-        trust_remote_code: bool = False,
-        speculative_draft_model_path: Optional[str] = None,
-        speculative_algorithm: Optional[str] = None,
-        speculative_num_steps: Optional[int] = None,
-        speculative_eagle_topk: Optional[int] = None,
-        speculative_num_draft_tokens: Optional[int] = None,
-        disable_overlap_schedule: bool = False,
-        disable_custom_all_reduce: bool = False,
-        max_cudagraph_capture_size: int = 4,
-        hf_overrides: Optional[dict[str, Any]] = None,
-        disable_prefill_graph: bool = False,
-        **kwargs,
-    ):
-        # Auto-assign port if not specified
-        if port is None:
-            port = DEFAULT_PORT_FOR_SRT_TEST_RUNNER + RTRunner._port_counter
-            RTRunner._port_counter += 1
-
-        self.model_type = model_type
-        self.is_generation = model_type == "generation"
-        if not self.is_generation:
-            raise ValueError("Embedding, rerank, and reward model runners are removed.")
-
-        spec_kwargs = {}
-        if speculative_draft_model_path:
-            spec_kwargs["speculative_draft_model_path"] = speculative_draft_model_path
-            spec_kwargs["speculative_algorithm"] = speculative_algorithm
-            spec_kwargs["speculative_num_steps"] = speculative_num_steps
-            spec_kwargs["speculative_eagle_topk"] = speculative_eagle_topk
-            spec_kwargs["speculative_num_draft_tokens"] = speculative_num_draft_tokens
-
-        self.engine = Engine(
-            model=model_path,
-            world_size=world_size,
-            ep_size=ep_size,
-            dtype=get_dtype_str(torch_dtype),
-            port=port,
-            gpu_memory_utilization=gpu_memory_utilization,
-            trust_remote_code=trust_remote_code,
-            attention_backend=attention_backend,
-            enforce_eager=enforce_eager,
-            enable_prefix_caching=enable_prefix_caching,
-            chunked_prefill_size=chunked_prefill_size,
-            max_model_len=max_model_len,
-            max_total_tokens=max_total_tokens,
-            block_size=block_size,
-            data_parallel_size=data_parallel_size,
-            tokenizer=tokenizer,
-            disable_overlap_schedule=disable_overlap_schedule,
-            max_cudagraph_capture_size=max_cudagraph_capture_size,
-            disable_custom_all_reduce=disable_custom_all_reduce,
-            hf_overrides=(json.dumps(hf_overrides) if hf_overrides else "{}"),
-            disable_prefill_graph=disable_prefill_graph,
-            **spec_kwargs,
-            **kwargs,
-        )
-
-        if tokenizer is None:
-            self.tokenizer = get_tokenizer(
-                model_path, trust_remote_code=trust_remote_code
-            )
-        else:
-            self.tokenizer = None
-
-    def load_lora_adapter(self, lora_name: str, adapter_path: str):
-        return self.engine.load_lora_adapter(lora_name, adapter_path)
-
-    def unload_lora_adapter(self, lora_name: str):
-        return self.engine.unload_lora_adapter(lora_name)
-
-    def forward(
-        self,
-        prompts: Union[
-            List[List[str]], List[str], List[torch.Tensor]
-        ] = DEFAULT_PROMPTS,
-        max_new_tokens: int = 8,
-        lora_names: Optional[List[str]] = None,
-        logprob_start_len: int = 0,
-        top_k: Optional[int] = None,
-        token_ids_logprob: Optional[List[int]] = None,
-    ):
-        if self.is_generation:
-            return self.forward_generation_raw(
-                engine=self.engine,
-                prompts=prompts,
-                max_new_tokens=max_new_tokens,
-                lora_names=lora_names,
-                logprob_start_len=logprob_start_len,
-                top_k=top_k,
-                token_ids_logprob=token_ids_logprob,
-            )
-        else:
-            raise ValueError("Embedding, rerank, and reward model runners are removed.")
-
-    def batch_forward(
-        self,
-        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
-        max_new_tokens=8,
-    ):
-        """
-        testing serving by sending all prompts once
-        only return output strings and no logprobs
-        """
-        if self.is_generation:
-            return self.batch_forward_generation_raw(
-                engine=self.engine,
-                prompts=prompts,
-                max_new_tokens=max_new_tokens,
-            )
-        else:
-            raise ValueError("Embedding, rerank, and reward model runners are removed.")
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.engine.shutdown()
-        del self.engine
-
-    @staticmethod
-    def forward_generation_raw(
-        engine: Engine,
-        prompts: Union[List[str], List[torch.Tensor]],
-        max_new_tokens: int = 8,
-        lora_names: Optional[List[str]] = None,
-        logprob_start_len: int = 0,
-        top_k: Optional[int] = None,
-        token_ids_logprob: Optional[List[int]] = None,
-    ):
-        # the return value contains logprobs from prefill
-        output_strs = []
-        output_ids = []
-        # Input logprobs. Note that the last item in input logprob is equivalent to
-        # the first item in the output logprob.
-        top_input_logprobs = []
-        input_token_logprobs_lst = []
-        top_output_logprobs = []
-        output_token_logprobs_lst = []
-        top_output_logprob_idx = []
-        if token_ids_logprob is not None:
-            token_ids_input_logprobs = []
-            token_ids_output_logprobs = []
-        else:
-            token_ids_input_logprobs = token_ids_output_logprobs = None
-
-        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-        if top_k:
-            sampling_params["top_k"] = top_k
-
-        for i, prompt in enumerate(prompts):
-            lora_name = None if lora_names is None else lora_names[i]
-            response = engine.generate(
-                prompt,
-                sampling_params=sampling_params,
-                return_logprob=True,
-                logprob_start_len=logprob_start_len,
-                top_logprobs_num=NUM_TOP_LOGPROBS,
-                token_ids_logprob=token_ids_logprob,
-                lora_name=lora_name,
-            )
-            text = response["text"]
-
-            # Check if the text is empty or only whitespace.
-            if not text.strip():
-                raise ValueError(
-                    "Received an empty text response. Please verify your input or model configuration."
-                )
-            output_strs.append(text)
-            output_ids.append(response["output_ids"])
-
-            input_token_logprobs = response["meta_info"]["input_token_logprobs"]
-            output_token_logprobs = response["meta_info"]["output_token_logprobs"]
-            # print(i, input_token_logprobs)
-            # print(i, output_token_logprobs)
-            logprobs = response["meta_info"]["input_top_logprobs"]
-            if token_ids_logprob is not None:
-                input_token_ids_logprobs = response["meta_info"][
-                    "input_token_ids_logprobs"
-                ][1:]
-            else:
-                input_token_ids_logprobs = None
-
-            num_prompt_tokens = response["meta_info"]["prompt_tokens"]
-            # assert len(input_token_logprobs) == num_prompt_tokens - logprob_start_len
-            assert len(logprobs) == num_prompt_tokens - logprob_start_len
-
-            # The first token logprob has no meaning in tokenspeed.
-            input_token_logprobs = input_token_logprobs[1:]
-            logprobs = logprobs[1:]
-            assert len(input_token_logprobs) == len(logprobs)
-
-            input_token_logprobs_lst.append(
-                input_token_logprobs + [output_token_logprobs[0]]
-            )
-            output_token_logprobs_lst.append(output_token_logprobs)
-
-            top_input_logprobs.append(
-                [[tup[0] for tup in x[:NUM_TOP_LOGPROBS]] for x in logprobs]
-                + [
-                    [
-                        tup[0]
-                        for tup in response["meta_info"]["output_top_logprobs"][0][
-                            :NUM_TOP_LOGPROBS
-                        ]
-                    ]
-                ]
-            )
-            top_output_logprobs.append(
-                [
-                    [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
-                    for x in response["meta_info"]["output_top_logprobs"]
-                ]
-            )
-            top_output_logprob_idx.append(
-                [
-                    [tup[1] for tup in x[:NUM_TOP_LOGPROBS]]
-                    for x in response["meta_info"]["output_top_logprobs"]
-                ]
-            )
-            if token_ids_logprob is not None:
-                token_ids_input_logprobs.append(
-                    [[tup[0] for tup in x] for x in input_token_ids_logprobs]
-                    + [
-                        [
-                            tup[0]
-                            for tup in response["meta_info"][
-                                "output_token_ids_logprobs"
-                            ][0]
-                        ]
-                    ]
-                )
-                token_ids_output_logprobs.append(
-                    [
-                        [tup[0] for tup in x]
-                        for x in response["meta_info"]["output_token_ids_logprobs"]
-                    ]
-                )
-
-        return ModelOutput(
-            output_strs=output_strs,
-            output_ids=output_ids,
-            top_input_logprobs=top_input_logprobs,
-            top_output_logprobs=top_output_logprobs,
-            input_token_logprobs_lst=input_token_logprobs_lst,
-            output_token_logprobs_lst=output_token_logprobs_lst,
-            top_output_logprob_idx=top_output_logprob_idx,
-            token_ids_input_logprobs=token_ids_input_logprobs,
-            token_ids_output_logprobs=token_ids_output_logprobs,
-        )
-
-    @staticmethod
-    def batch_forward_generation_raw(
-        prompts: Union[List[str], List[torch.Tensor]],
-        max_new_tokens,
-        engine,
-    ):
-        # the return value contains logprobs from prefill
-        output_strs = []
-        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
-        response = engine.generate(
-            prompts,
-            sampling_params=sampling_params,
-        )
-        output_strs = [r["text"] for r in response]
-
-        return ModelOutput(
-            output_strs=output_strs,
-        )
-
-
-def monkey_patch_gemma2_sdpa():
-    """
-    Use sdpa by default to fix the OOM issue.
-    Revert this commit:
-    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
-    """
-    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
-
-    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
-        config._attn_implementation = "sdpa"
-        return config
-
-    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
-
-
-def check_close_model_outputs(
-    hf_outputs: ModelOutput,
-    rt_outputs: ModelOutput,
-    prefill_tolerance: float,
-    decode_tolerance: float,
-    rouge_l_tolerance: float,
-    debug_text: str = "",
-    check_logprobs: bool = True,
-    extra_references: Optional[List[List[str]]] = None,
-):
-    # Compare output strings
-    print(f"{hf_outputs.output_strs=}")
-    print(f"{rt_outputs.output_strs=}")
-    base_scores = calculate_rouge_l(hf_outputs.output_strs, rt_outputs.output_strs)
-    if extra_references:
-        rouge_l_scores = [
-            max(
-                base,
-                *(
-                    calculate_rouge_l([ref[i]], [rt_outputs.output_strs[i]])[0]
-                    for ref in extra_references
-                ),
-            )
-            for i, base in enumerate(base_scores)
-        ]
-    else:
-        rouge_l_scores = base_scores
-    print(f"{rouge_l_scores=}")
-    assert all(
-        score >= rouge_l_tolerance for score in rouge_l_scores
-    ), f"Not all ROUGE-L scores are greater than rouge_l_tolerance={rouge_l_tolerance}"
-
-    if check_logprobs:
-        for i in range(len(hf_outputs.output_strs)):
-            # Compare input logprobs
-            hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
-            srt_logprobs = torch.Tensor(rt_outputs.top_input_logprobs[i])
-            input_len = hf_logprobs.shape[0]
-            print(
-                "prefill logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))
-            )
-            if input_len <= 100:
-                assert torch.all(abs(hf_logprobs - srt_logprobs) < prefill_tolerance), (
-                    f"prefill logprobs are not all close with {debug_text} "
-                    f"prefill_tolerance={prefill_tolerance}."
-                    f"{hf_logprobs=}, {srt_logprobs=}"
-                )
-
-            # Compare output logprobs
-            hf_logprobs = torch.Tensor(hf_outputs.top_output_logprobs[i])
-            srt_logprobs = torch.Tensor(rt_outputs.top_output_logprobs[i])
-
-            print(
-                "decode logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))
-            )
-            if input_len <= 100:
-                assert torch.all(abs(hf_logprobs - srt_logprobs) < decode_tolerance), (
-                    f"decode logprobs are not all close with {debug_text} "
-                    f"decode_tolerance={decode_tolerance}."
-                    f"{hf_logprobs=}, {srt_logprobs=}"
-                )
diff --git a/test/runtime/lora/__init__.py b/test/runtime/lora/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/test/runtime/lora/test_adapter_io.py b/test/runtime/lora/test_adapter_io.py
deleted file mode 100644
index 008db2e60..000000000
--- a/test/runtime/lora/test_adapter_io.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-from __future__ import annotations
-
-import torch
-
-from tokenspeed.runtime.lora.adapter_io import parse_adapter_weights
-
-
-def test_parse_adapter_weights_accepts_expert_scoped_moe_modules():
-    tensors = {
-        "base_model.model.model.layers.3.mlp.experts.7.gate_proj.lora_A.weight": (
-            torch.randn(4, 16)
-        ),
-        "base_model.model.model.layers.3.mlp.experts.7.gate_proj.lora_B.weight": (
-            torch.randn(32, 4)
-        ),
-        "base_model.model.model.layers.3.mlp.experts.7.up_proj.lora_A.weight": (
-            torch.randn(4, 16)
-        ),
-        "base_model.model.model.layers.3.mlp.experts.7.up_proj.lora_B.weight": (
-            torch.randn(32, 4)
-        ),
-        "base_model.model.model.layers.3.mlp.experts.7.down_proj.lora_A.weight": (
-            torch.randn(4, 32)
-        ),
-        "base_model.model.model.layers.3.mlp.experts.7.down_proj.lora_B.weight": (
-            torch.randn(16, 4)
-        ),
-    }
-
-    parsed = parse_adapter_weights(tensors)
-
-    assert set(parsed[3]) == {
-        "experts.7.gate_proj",
-        "experts.7.up_proj",
-        "experts.7.down_proj",
-    }
-    assert parsed[3]["experts.7.gate_proj"][0].shape == (4, 16)
-    assert parsed[3]["experts.7.down_proj"][1].shape == (16, 4)
-
-
-def test_parse_adapter_weights_accepts_3d_moe_modules():
-    tensors = {
-        "base_model.model.model.layers.1.mlp.experts.w1.lora_A.weight": torch.randn(
-            1, 4, 16
-        ),
-        "base_model.model.model.layers.1.mlp.experts.w1.lora_B.weight": torch.randn(
-            8, 32, 4
-        ),
-        "base_model.model.model.layers.1.mlp.experts.w2.lora_A.weight": torch.randn(
-            8, 4, 32
-        ),
-        "base_model.model.model.layers.1.mlp.experts.w2.lora_B.weight": torch.randn(
-            1, 16, 4
-        ),
-        "base_model.model.model.layers.1.mlp.experts.w3.lora_A.weight": torch.randn(
-            1, 4, 16
-        ),
-        "base_model.model.model.layers.1.mlp.experts.w3.lora_B.weight": torch.randn(
-            8, 32, 4
-        ),
-    }
-
-    parsed = parse_adapter_weights(tensors)
-
-    assert set(parsed[1]) == {"experts.w1", "experts.w2", "experts.w3"}
-    assert parsed[1]["experts.w1"][0].shape == (1, 4, 16)
-    assert parsed[1]["experts.w2"][1].shape == (1, 16, 4)
diff --git a/test/runtime/lora/test_lora_manager.py b/test/runtime/lora/test_lora_manager.py
deleted file mode 100644
index b01940d7a..000000000
--- a/test/runtime/lora/test_lora_manager.py
+++ /dev/null
@@ -1,488 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""Tests for LoraManager.prepare_loras → persistent batch_info.
-
-The captured CUDA graph references the manager's batch_info tensors, so
-their pointers must be stable across ``prepare_loras`` calls and the
-contents must reflect each step's per-request slot ids.
-"""
-
-from __future__ import annotations
-
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-from tokenspeed.runtime.lora.lora_batch import NO_LORA_SLOT
-from tokenspeed.runtime.lora.lora_buffers import LoraWeightBuffers
-from tokenspeed.runtime.lora.lora_manager import (
-    LoraManager,
-    _use_triton_grouped_decode,
-)
-
-
-def _model_config():
-    return SimpleNamespace(
-        num_hidden_layers=2,
-        hidden_size=32,
-        num_attention_heads=4,
-        num_key_value_heads=4,
-    )
-
-
-@pytest.fixture
-def manager():
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    return LoraManager(
-        model_config=_model_config(),
-        max_loras=2,
-        max_lora_rank=8,
-        max_num_tokens=64,
-        dtype=torch.float16,
-        device=torch.device("cuda:0"),
-    )
-
-
-def test_batch_info_tensor_addresses_are_stable(manager):
-    bi = manager.batch_info
-    addrs_before = (
-        bi.seg_lens.data_ptr(),
-        bi.seg_indptr.data_ptr(),
-        bi.weight_indices.data_ptr(),
-        bi.lora_ranks.data_ptr(),
-        bi.scalings.data_ptr(),
-    )
-    manager.prepare_loras([0, 0, 0], per_request_token_counts=1)
-    manager.prepare_loras([0, 0], per_request_token_counts=4)
-    addrs_after = (
-        bi.seg_lens.data_ptr(),
-        bi.seg_indptr.data_ptr(),
-        bi.weight_indices.data_ptr(),
-        bi.lora_ranks.data_ptr(),
-        bi.scalings.data_ptr(),
-    )
-    assert addrs_before == addrs_after
-
-
-def test_prepare_loras_uniform_decode(manager):
-    n = manager.prepare_loras([0, 0, 0, 0], per_request_token_counts=1)
-    assert n == 4
-    bi = manager.batch_info
-    assert bi.bs == 4
-    assert bi.num_segments == 4
-    assert bi.max_len == 1
-    torch.cuda.synchronize()
-    assert bi.seg_lens[:4].tolist() == [1, 1, 1, 1]
-    assert bi.seg_indptr[:5].tolist() == [0, 1, 2, 3, 4]
-    assert bi.weight_indices[:4].tolist() == [NO_LORA_SLOT] * 4
-
-
-def test_prepare_loras_target_verify_repeats(manager):
-    # Each request emits ``spec_num_tokens`` tokens; one segment per request.
-    n = manager.prepare_loras([0, 0], per_request_token_counts=3)
-    assert n == 6
-    bi = manager.batch_info
-    assert bi.bs == 2
-    assert bi.max_len == 3
-    torch.cuda.synchronize()
-    assert bi.seg_lens[:2].tolist() == [3, 3]
-    assert bi.seg_indptr[:3].tolist() == [0, 3, 6]
-
-
-def test_prepare_loras_variable_segments(manager):
-    n = manager.prepare_loras([0, 0, 0], per_request_token_counts=[5, 1, 2])
-    assert n == 8
-    bi = manager.batch_info
-    assert bi.bs == 3
-    assert bi.max_len == 5
-    torch.cuda.synchronize()
-    assert bi.seg_lens[:3].tolist() == [5, 1, 2]
-    assert bi.seg_indptr[:4].tolist() == [0, 5, 6, 8]
-
-
-def test_prepare_loras_unknown_id_falls_back_to_no_lora_slot(manager):
-    n = manager.prepare_loras([99], per_request_token_counts=2)
-    assert n == 2
-    torch.cuda.synchronize()
-    assert manager.batch_info.weight_indices[:1].tolist() == [NO_LORA_SLOT]
-
-
-def test_prepare_loras_overflow_raises(manager):
-    with pytest.raises(ValueError, match="overflow"):
-        manager.prepare_loras([0] * 33, per_request_token_counts=2)
-
-
-def test_prepare_loras_mismatched_lengths_raises(manager):
-    with pytest.raises(ValueError, match="length"):
-        manager.prepare_loras([0, 0], per_request_token_counts=[1, 2, 3])
-
-
-def test_manager_allocates_only_real_adapter_slots(manager):
-    # Match vLLM's layout: the GPU pool contains only real adapter slots.
-    # Base/no-LoRA requests use NO_LORA_SLOT in per-step metadata.
-    torch.cuda.synchronize()
-    assert manager._n_slots == manager.max_loras
-    assert len(manager._slot_to_name) == manager.max_loras
-    assert manager.batch_info.weight_indices[0].item() == NO_LORA_SLOT
-
-
-def test_has_active_lora_flag(manager):
-    # All-base batch → flag is False.  CudaGraphWrapper uses this to pick
-    # the no-LoRA captured graph variant (skip the per-step Triton kernels).
-    manager.prepare_loras([0, 0, 0])
-    assert manager.has_active_lora is False
-    # Unknown id falls back to NO_LORA_SLOT → still no active adapter.
-    manager.prepare_loras([99])
-    assert manager.has_active_lora is False
-    assert manager.batch_info.single_lora_slot == NO_LORA_SLOT
-
-
-def test_lora_weight_buffers_respect_disabled_groups():
-    buffers = LoraWeightBuffers(
-        n_layers=1,
-        n_slots=1,
-        max_lora_rank=2,
-        hidden_size=4,
-        q_size_per_tp=4,
-        kv_size_per_tp=4,
-        o_in_per_tp=4,
-        intermediate_per_tp=8,
-        dtype=torch.float32,
-        device=torch.device("cpu"),
-        tp_rank=0,
-        tp_size=1,
-        buffer_groups={"mlp"},
-    )
-    assert buffers.qkv_A_buffers == []
-    assert len(buffers.gate_up_A_buffers) == 1
-    cpu_weights = {
-        0: {
-            "q_proj": (
-                torch.ones((2, 4), dtype=torch.float32),
-                torch.ones((4, 2), dtype=torch.float32),
-            )
-        }
-    }
-
-    with pytest.raises(ValueError, match="'attn' is disabled"):
-        buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=2)
-
-
-def _write_dummy_adapter(tmp_path, rank: int, hidden: int, n_layers: int) -> str:
-    """Write a minimal PEFT-style adapter under tmp_path/adapter_X."""
-    import json
-
-    from safetensors.torch import save_file
-
-    tensors = {}
-    for layer in range(n_layers):
-        prefix = f"base_model.model.model.layers.{layer}.self_attn"
-        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
-            tensors[f"{prefix}.{proj}.lora_A.weight"] = torch.randn(
-                rank, hidden, dtype=torch.float32
-            )
-            tensors[f"{prefix}.{proj}.lora_B.weight"] = torch.randn(
-                hidden, rank, dtype=torch.float32
-            )
-    save_file(tensors, str(tmp_path / "adapter_model.safetensors"))
-    cfg = {
-        "r": rank,
-        "lora_alpha": rank,
-        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
-    }
-    (tmp_path / "adapter_config.json").write_text(json.dumps(cfg))
-    return str(tmp_path)
-
-
-def _write_partial_adapter(
-    tmp_path,
-    *,
-    rank: int,
-    hidden: int,
-    n_layers: int,
-    modules: tuple[str, ...],
-) -> str:
-    import json
-
-    from safetensors.torch import save_file
-
-    tensors = {}
-    for layer in range(n_layers):
-        prefix = f"base_model.model.model.layers.{layer}.self_attn"
-        for proj in modules:
-            tensors[f"{prefix}.{proj}.lora_A.weight"] = torch.ones(
-                rank, hidden, dtype=torch.float32
-            )
-            tensors[f"{prefix}.{proj}.lora_B.weight"] = torch.ones(
-                hidden, rank, dtype=torch.float32
-            )
-    save_file(tensors, str(tmp_path / "adapter_model.safetensors"))
-    cfg = {
-        "r": rank,
-        "lora_alpha": rank,
-        "target_modules": list(modules),
-    }
-    (tmp_path / "adapter_config.json").write_text(json.dumps(cfg))
-    return str(tmp_path)
-
-
-@pytest.fixture
-def adapter_paths(tmp_path):
-    """Create 4 dummy adapters on disk."""
-    paths = {}
-    for i in range(4):
-        d = tmp_path / f"adapter_{i}"
-        d.mkdir()
-        paths[f"a{i}"] = _write_dummy_adapter(d, rank=8, hidden=32, n_layers=2)
-    return paths
-
-
-def _tiered_manager(
-    max_loras_cpu: int,
-    max_num_tokens: int = 64,
-    max_loras: int = 2,
-) -> LoraManager:
-    return LoraManager(
-        model_config=_model_config(),
-        max_loras=max_loras,
-        max_lora_rank=8,
-        max_num_tokens=max_num_tokens,
-        max_loras_cpu=max_loras_cpu,
-        dtype=torch.float16,
-        device=torch.device("cuda:0"),
-    )
-
-
-def test_prepare_loras_single_lora_slot_metadata(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=4, max_num_tokens=128)
-    m.load_adapter("a0", adapter_paths["a0"])
-    m.load_adapter("a1", adapter_paths["a1"])
-    a0_id = m.get_id("a0")
-    a1_id = m.get_id("a1")
-
-    m.prepare_loras([a0_id, a0_id], per_request_token_counts=16)
-    slot = m.batch_info.weight_indices[0].item()
-    assert slot != NO_LORA_SLOT
-    assert m.batch_info.single_lora_slot == slot
-
-    m.prepare_loras([a0_id, a1_id], per_request_token_counts=16)
-    assert m.batch_info.single_lora_slot == NO_LORA_SLOT
-
-    m.prepare_loras([0, a0_id], per_request_token_counts=16)
-    assert m.batch_info.single_lora_slot == NO_LORA_SLOT
-
-
-def test_prepare_loras_multi_lora_slot_metadata(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=4, max_num_tokens=128)
-    m.load_adapter("a0", adapter_paths["a0"])
-    m.load_adapter("a1", adapter_paths["a1"])
-    a0_id = m.get_id("a0")
-    a1_id = m.get_id("a1")
-
-    m.prepare_loras([a0_id, a1_id], per_request_token_counts=64)
-    assert m.batch_info.single_lora_slot == NO_LORA_SLOT
-    assert m.batch_info.multi_lora_start_slot == m.batch_info.weight_indices[0].item()
-    assert m.batch_info.multi_lora_count == 2
-    assert m.batch_info.multi_lora_segment_len == 64
-    assert m.batch_info.multi_lora_rank > 0
-
-    m.prepare_loras([a0_id, a1_id], per_request_token_counts=[64, 32])
-    assert m.batch_info.multi_lora_start_slot == NO_LORA_SLOT
-
-    m.prepare_loras([a1_id, a0_id], per_request_token_counts=64)
-    assert m.batch_info.multi_lora_start_slot == NO_LORA_SLOT
-
-
-def test_triton_grouped_decode_threshold():
-    bi = SimpleNamespace(single_lora_slot=NO_LORA_SLOT, num_groups=4, bs=128)
-    assert _use_triton_grouped_decode(bi)
-
-    bi.bs = 64
-    assert not _use_triton_grouped_decode(bi)
-
-    bi.bs = 128
-    bi.single_lora_slot = 1
-    assert not _use_triton_grouped_decode(bi)
-
-    bi.single_lora_slot = NO_LORA_SLOT
-    bi.num_groups = 0
-    assert not _use_triton_grouped_decode(bi)
-
-
-def test_max_loras_cpu_ge_max_loras(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    with pytest.raises(ValueError, match="max_loras_cpu"):
-        _tiered_manager(max_loras_cpu=1)  # max_loras=2 in fixture
-
-
-def test_load_adapter_warms_cpu_pool(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=8)
-    m.load_adapter("a0", adapter_paths["a0"])
-    assert "a0" in m._cpu_cache
-    assert "a0" not in m._name_to_slot  # not GPU-resident yet
-
-
-def test_cpu_pool_lru_evicts_to_disk(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    # max_loras_cpu=2 → only 2 adapters fit in CPU at once.  Loading a
-    # third evicts the LRU one back to disk.
-    m = _tiered_manager(max_loras_cpu=2)
-    for name in ("a0", "a1", "a2"):
-        m.load_adapter(name, adapter_paths[name])
-    # a0 was the LRU at the time a2 was loaded; should be evicted now.
-    assert "a0" not in m._cpu_cache
-    assert "a1" in m._cpu_cache
-    assert "a2" in m._cpu_cache
-
-
-def test_cpu_evicted_adapter_reloads_from_disk(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=2)
-    for name in ("a0", "a1", "a2"):
-        m.load_adapter(name, adapter_paths[name])
-    assert "a0" not in m._cpu_cache
-    # Touching a0 again should reload it from disk into the CPU pool,
-    # evicting whatever is now LRU.
-    a0_id = m.get_id("a0")
-    m.prepare_loras([a0_id])
-    assert "a0" in m._cpu_cache
-    assert "a0" in m._name_to_slot  # promoted to GPU too
-
-
-def test_gpu_resident_evicted_only_when_no_alternative(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    # Prefer evicting non-GPU-resident entries first: they cost a disk
-    # read to bring back, GPU-resident ones cost nothing until their
-    # GPU slot is also evicted.
-    m = _tiered_manager(max_loras_cpu=2)
-    m.load_adapter("a0", adapter_paths["a0"])
-    m.load_adapter("a1", adapter_paths["a1"])
-    a0_id = m.get_id("a0")
-    m.prepare_loras([a0_id])  # a0 → GPU; a1 stays CPU-only
-    assert "a0" in m._name_to_slot
-    # Loading a2: a1 (non-GPU) is evicted in preference to a0 (GPU).
-    m.load_adapter("a2", adapter_paths["a2"])
-    assert "a0" in m._cpu_cache
-    assert "a1" not in m._cpu_cache
-    assert "a2" in m._cpu_cache
-
-
-def test_gpu_resident_can_be_cpu_evicted_when_pool_is_full(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    # max_loras=2 + max_loras_cpu=2 + two GPU-resident adapters: the
-    # CPU pool MUST allow evicting GPU-resident entries to admit a
-    # third adapter; otherwise the pool is permanently locked.
-    m = _tiered_manager(max_loras_cpu=2)
-    m.load_adapter("a0", adapter_paths["a0"])
-    m.load_adapter("a1", adapter_paths["a1"])
-    m.prepare_loras([m.get_id("a0"), m.get_id("a1")])  # both → GPU
-    assert "a0" in m._name_to_slot
-    assert "a1" in m._name_to_slot
-    # Now register a2.  CPU pool is full and both entries are
-    # GPU-resident — must evict one anyway (its GPU copy is still
-    # valid; future reload costs a disk read).
-    m.load_adapter("a2", adapter_paths["a2"])
-    assert "a2" in m._cpu_cache
-    # Exactly one of a0/a1 was kicked from the CPU pool.
-    cpu_count = sum(name in m._cpu_cache for name in ("a0", "a1"))
-    assert cpu_count == 1
-
-
-def test_gpu_slot_reuse_clears_missing_modules(tmp_path):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    full_dir = tmp_path / "full"
-    full_dir.mkdir()
-    partial_dir = tmp_path / "partial"
-    partial_dir.mkdir()
-    full_path = _write_dummy_adapter(full_dir, rank=8, hidden=32, n_layers=2)
-    partial_path = _write_partial_adapter(
-        partial_dir,
-        rank=8,
-        hidden=32,
-        n_layers=2,
-        modules=("q_proj",),
-    )
-    m = _tiered_manager(max_loras_cpu=2, max_loras=1)
-    full_id = m.load_adapter("full", full_path)
-    partial_id = m.load_adapter("partial", partial_path)
-
-    m.prepare_loras([full_id])
-    slot = m.batch_info.weight_indices[0].item()
-    assert torch.count_nonzero(m.o_A_buffers[0][slot]).item() > 0
-
-    m.prepare_loras([partial_id])
-    assert m.batch_info.weight_indices[0].item() == slot
-    assert torch.count_nonzero(m.o_A_buffers[0][slot]).item() == 0
-    assert torch.count_nonzero(m.qkv_A_buffers[0][slot]).item() > 0
-
-
-def test_prefetch_warms_cpu_pool(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=4)
-    # Register two adapters but evict one.
-    m.load_adapter("a0", adapter_paths["a0"])
-    m.load_adapter("a1", adapter_paths["a1"])
-    m._evict_from_cpu("a1")
-    assert "a1" not in m._cpu_cache
-
-    # prefetch kicks off async load; wait for it to finish.
-    m.prefetch("a1")
-    pending = m._pending_loads.get("a1")
-    if pending is not None:
-        pending.result()
-    assert "a1" in m._cpu_cache
-
-
-def test_prefetch_unknown_adapter_is_noop(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=4)
-    m.prefetch("never-registered")  # must not raise
-    assert "never-registered" not in m._cpu_cache
-    assert "never-registered" not in m._pending_loads
-
-
-def test_unload_adapter_clears_both_tiers(adapter_paths):
-    if not torch.cuda.is_available():
-        pytest.skip("LoraManager allocates GPU buffers")
-    m = _tiered_manager(max_loras_cpu=4)
-    m.load_adapter("a0", adapter_paths["a0"])
-    a0_id = m.get_id("a0")
-    m.prepare_loras([a0_id])
-    m.unload_adapter("a0")
-    assert "a0" not in m._cpu_cache
-    assert "a0" not in m._name_to_slot
-    assert m.get_id("a0") is None
diff --git a/test/runtime/lora/test_lora_registry.py b/test/runtime/lora/test_lora_registry.py
deleted file mode 100644
index 8dc35ca01..000000000
--- a/test/runtime/lora/test_lora_registry.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""Unit tests for LoraRegistry — no GPU required."""
-
-from __future__ import annotations
-
-import pytest
-
-from tokenspeed.runtime.lora.lora_config import LoraConfig
-from tokenspeed.runtime.lora.lora_registry import NO_LORA_ID, LoraRegistry
-
-
-def _config(name: str, r: int = 16) -> LoraConfig:
-    return LoraConfig(name=name, path=f"/fake/{name}", r=r)
-
-
-class TestLoraRegistry:
-    def test_register_returns_unique_nonzero_ids(self):
-        reg = LoraRegistry(max_loras=4)
-        id_a = reg.register(_config("a"))
-        id_b = reg.register(_config("b"))
-        assert id_a != NO_LORA_ID
-        assert id_b != NO_LORA_ID
-        assert id_a != id_b
-
-    def test_get_id_round_trips(self):
-        reg = LoraRegistry(max_loras=4)
-        lora_id = reg.register(_config("sql"))
-        assert reg.get_id("sql") == lora_id
-        assert reg.get_id("missing") is None
-
-    def test_get_config_round_trips(self):
-        reg = LoraRegistry(max_loras=4)
-        cfg = _config("sql", r=32)
-        reg.register(cfg)
-        retrieved = reg.get_config("sql")
-        assert retrieved is not None
-        assert retrieved.r == 32
-
-    def test_duplicate_registration_raises(self):
-        reg = LoraRegistry(max_loras=4)
-        reg.register(_config("a"))
-        with pytest.raises(ValueError, match="already registered"):
-            reg.register(_config("a"))
-
-    def test_capacity_enforced(self):
-        reg = LoraRegistry(max_loras=2)
-        reg.register(_config("a"))
-        reg.register(_config("b"))
-        with pytest.raises(ValueError, match="full"):
-            reg.register(_config("c"))
-
-    def test_unregister_frees_slot(self):
-        reg = LoraRegistry(max_loras=1)
-        reg.register(_config("a"))
-        reg.unregister("a")
-        assert reg.get_id("a") is None
-        # Slot is now free
-        reg.register(_config("b"))
-
-    def test_unregister_unknown_raises(self):
-        reg = LoraRegistry(max_loras=4)
-        with pytest.raises(KeyError):
-            reg.unregister("nonexistent")
-
-    def test_contains(self):
-        reg = LoraRegistry(max_loras=4)
-        reg.register(_config("x"))
-        assert "x" in reg
-        assert "y" not in reg
-
-    def test_len(self):
-        reg = LoraRegistry(max_loras=4)
-        assert len(reg) == 0
-        reg.register(_config("a"))
-        assert len(reg) == 1
-        reg.register(_config("b"))
-        assert len(reg) == 2
-        reg.unregister("a")
-        assert len(reg) == 1
-
-    def test_lora_scaling(self):
-        cfg = LoraConfig(name="t", path="/p", r=8, lora_alpha=16)
-        assert cfg.scaling == 2.0
diff --git a/test/runtime/lora/test_lora_request_naming.py b/test/runtime/lora/test_lora_request_naming.py
deleted file mode 100644
index 1970b1b97..000000000
--- a/test/runtime/lora/test_lora_request_naming.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-from __future__ import annotations
-
-from types import SimpleNamespace
-
-import pytest
-
-from tokenspeed.runtime.engine.input_processor import InputProcessor
-from tokenspeed.runtime.engine.io_struct import GenerateReqInput
-
-
-def _processor(registry: dict[str, int]) -> InputProcessor:
-    return InputProcessor(SimpleNamespace(_lora_name_to_id=registry))
-
-
-def test_resolve_lora_id_uses_registered_lora_name():
-    obj = GenerateReqInput(text="hello", sampling_params={}, lora_name="adapter-a")
-
-    assert _processor({"adapter-a": 7})._resolve_lora_id(obj) == 7
-
-
-def test_resolve_lora_id_rejects_unknown_lora_name():
-    obj = GenerateReqInput(text="hello", sampling_params={}, lora_name="missing")
-
-    with pytest.raises(ValueError, match="not a registered adapter"):
-        _processor({})._resolve_lora_id(obj)
-
-
-def test_batched_generate_req_propagates_lora_name_per_item():
-    obj = GenerateReqInput(
-        text=["a", "b"],
-        sampling_params={},
-        lora_name=["adapter-a", None],
-    )
-    obj.normalize_batch_and_arguments()
-
-    first = obj[0]
-    second = obj[1]
-
-    assert first.lora_name == "adapter-a"
-    assert second.lora_name is None
-
-
-def test_batched_generate_req_repeats_scalar_lora_name():
-    obj = GenerateReqInput(
-        text=["a", "b"],
-        sampling_params={},
-        lora_name="adapter-a",
-    )
-    obj.normalize_batch_and_arguments()
-
-    assert obj[0].lora_name == "adapter-a"
-    assert obj[1].lora_name == "adapter-a"
diff --git a/test/runtime/lora/test_moe_lora.py b/test/runtime/lora/test_moe_lora.py
deleted file mode 100644
index 0f2b6d325..000000000
--- a/test/runtime/lora/test_moe_lora.py
+++ /dev/null
@@ -1,339 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-from __future__ import annotations
-
-import pytest
-import torch
-
-from tokenspeed.runtime.lora.lora_batch import NO_LORA_SLOT, LoraBatchInfo
-from tokenspeed.runtime.lora.lora_manager import LoraManager
-from tokenspeed.runtime.lora.moe_lora import MoeLoraBuffers, MoeLoraContext
-
-
-def _batch_info(weight_indices: list[int]) -> LoraBatchInfo:
-    bs = len(weight_indices)
-    return LoraBatchInfo(
-        bs=bs,
-        num_segments=bs,
-        max_len=1,
-        seg_lens=torch.ones(bs, dtype=torch.int32),
-        seg_indptr=torch.arange(bs + 1, dtype=torch.int32),
-        weight_indices=torch.tensor(weight_indices, dtype=torch.int32),
-        lora_ranks=torch.tensor([1], dtype=torch.int32),
-        scalings=torch.tensor([0.5], dtype=torch.float32),
-        permutation=None,
-    )
-
-
-def _context(weight_indices: list[int], *, active: bool = True) -> MoeLoraContext:
-    dtype = torch.float32
-    return MoeLoraContext(
-        weights_by_layer={
-            0: {
-                0: {
-                    "w13_A": torch.ones((2, 2, 2), dtype=dtype),
-                    "w13_B": torch.ones((2, 4, 2), dtype=dtype),
-                    "down_A": torch.ones((2, 1, 2), dtype=dtype),
-                    "down_B": torch.ones((2, 2, 1), dtype=dtype),
-                }
-            }
-        },
-        batch_info=_batch_info(weight_indices),
-        scalings=torch.tensor([0.5], dtype=dtype),
-        has_active_lora=active,
-    )
-
-
-def _buffers(*, compressed_shared_outer: bool = False) -> MoeLoraBuffers:
-    return MoeLoraBuffers(
-        n_layers=1,
-        n_slots=2,
-        max_lora_rank=1,
-        num_experts=2,
-        hidden_size=2,
-        intermediate_per_tp=3,
-        dtype=torch.float32,
-        device=torch.device("cpu"),
-        shard_weights=lambda _module, lora_A, lora_B: (lora_A, lora_B),
-        compressed_shared_outer=compressed_shared_outer,
-    )
-
-
-def test_moe_lora_context_applies_single_slot_gate_up_and_down():
-    ctx = _context([0, 0])
-    hidden_states = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
-    topk_ids = torch.tensor([[0], [1]], dtype=torch.int64)
-
-    gate_up = torch.zeros((2, 4))
-    ctx.apply_gate_up_lora(0, hidden_states, topk_ids, gate_up)
-    torch.testing.assert_close(
-        gate_up,
-        torch.tensor([[3.0, 3.0, 3.0, 3.0], [7.0, 7.0, 7.0, 7.0]]),
-    )
-
-    down = torch.zeros((2, 1, 2))
-    ctx.apply_down_lora(
-        0,
-        torch.tensor([[2.0, 4.0], [6.0, 8.0]]),
-        topk_ids,
-        torch.ones((2, 1)),
-        down,
-    )
-    torch.testing.assert_close(down, torch.tensor([[[3.0, 3.0]], [[7.0, 7.0]]]))
-
-
-def test_moe_lora_context_masks_mixed_base_tokens():
-    ctx = _context([0, NO_LORA_SLOT])
-    hidden_states = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
-    topk_ids = torch.tensor([[0], [1]], dtype=torch.int64)
-    gate_up = torch.zeros((2, 4))
-
-    ctx.apply_gate_up_lora(0, hidden_states, topk_ids, gate_up)
-
-    torch.testing.assert_close(
-        gate_up,
-        torch.tensor([[3.0, 3.0, 3.0, 3.0], [0.0, 0.0, 0.0, 0.0]]),
-    )
-
-
-def test_moe_lora_context_noops_when_inactive():
-    ctx = _context([0], active=False)
-    gate_up = torch.zeros((1, 4))
-
-    ctx.apply_gate_up_lora(
-        0,
-        torch.tensor([[1.0, 2.0]]),
-        torch.tensor([[0]], dtype=torch.int64),
-        gate_up,
-    )
-
-    torch.testing.assert_close(gate_up, torch.zeros((1, 4)))
-
-
-def test_moe_lora_buffers_load_3d_shared_outer_adapter():
-    buffers = _buffers()
-    cpu_weights = {
-        0: {
-            "experts.w1": (
-                torch.tensor([[[1.0, 2.0]]]),
-                torch.tensor([[[10.0], [11.0], [12.0]], [[20.0], [21.0], [22.0]]]),
-            ),
-            "experts.w2": (
-                torch.tensor([[[5.0, 6.0, 7.0]], [[8.0, 9.0, 10.0]]]),
-                torch.tensor([[[13.0], [14.0]]]),
-            ),
-            "experts.w3": (
-                torch.tensor([[[3.0, 4.0]]]),
-                torch.tensor([[[30.0], [31.0], [32.0]], [[40.0], [41.0], [42.0]]]),
-            ),
-        }
-    }
-
-    buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
-    weights = buffers.weights_by_layer[0][0]
-
-    assert buffers.w13_A_buffers[0].shape == (2, 2, 2, 2)
-    assert weights["w13_A"].data_ptr() == buffers.w13_A_buffers[0][0].data_ptr()
-    assert weights["w13_A"].shape == (2, 2, 2)
-    torch.testing.assert_close(
-        weights["w13_A"][:, 0, :],
-        torch.tensor([[1.0, 2.0], [1.0, 2.0]]),
-    )
-    torch.testing.assert_close(
-        weights["w13_A"][:, 1, :],
-        torch.tensor([[3.0, 4.0], [3.0, 4.0]]),
-    )
-    torch.testing.assert_close(
-        weights["w13_B"][:, :3, 0],
-        torch.tensor([[10.0, 11.0, 12.0], [20.0, 21.0, 22.0]]),
-    )
-    torch.testing.assert_close(
-        weights["w13_B"][:, 3:, 1],
-        torch.tensor([[30.0, 31.0, 32.0], [40.0, 41.0, 42.0]]),
-    )
-    torch.testing.assert_close(
-        weights["down_A"][:, 0, :],
-        torch.tensor([[5.0, 6.0, 7.0], [8.0, 9.0, 10.0]]),
-    )
-    torch.testing.assert_close(
-        weights["down_B"][:, :, 0],
-        torch.tensor([[13.0, 14.0], [13.0, 14.0]]),
-    )
-
-
-def test_moe_lora_buffers_load_compressed_3d_shared_outer_adapter():
-    buffers = _buffers(compressed_shared_outer=True)
-    cpu_weights = {
-        0: {
-            "experts.w1": (
-                torch.tensor([[[1.0, 2.0]]]),
-                torch.tensor([[[10.0], [11.0], [12.0]], [[20.0], [21.0], [22.0]]]),
-            ),
-            "experts.w2": (
-                torch.tensor([[[5.0, 6.0, 7.0]], [[8.0, 9.0, 10.0]]]),
-                torch.tensor([[[13.0], [14.0]]]),
-            ),
-            "experts.w3": (
-                torch.tensor([[[3.0, 4.0]]]),
-                torch.tensor([[[30.0], [31.0], [32.0]], [[40.0], [41.0], [42.0]]]),
-            ),
-        }
-    }
-
-    buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
-    weights = buffers.weights_by_layer[0][0]
-
-    assert buffers.w13_A_buffers[0].shape == (2, 1, 2, 2)
-    assert buffers.w13_B_buffers[0].shape == (2, 2, 6, 2)
-    assert buffers.down_A_buffers[0].shape == (2, 2, 1, 3)
-    assert buffers.down_B_buffers[0].shape == (2, 1, 2, 1)
-    assert weights["w13_A"].shape == (1, 2, 2)
-    assert weights["down_B"].shape == (1, 2, 1)
-
-    ctx = MoeLoraContext(
-        weights_by_layer=buffers.weights_by_layer,
-        batch_info=_batch_info([0, 0]),
-        scalings=torch.tensor([1.0], dtype=torch.float32),
-        has_active_lora=True,
-    )
-    hidden_states = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
-    topk_ids = torch.tensor([[0], [1]], dtype=torch.int64)
-    gate_up = torch.zeros((2, 6))
-
-    ctx.apply_gate_up_lora(0, hidden_states, topk_ids, gate_up)
-
-    torch.testing.assert_close(
-        gate_up,
-        torch.tensor(
-            [
-                [50.0, 55.0, 60.0, 330.0, 341.0, 352.0],
-                [220.0, 231.0, 242.0, 1000.0, 1025.0, 1050.0],
-            ]
-        ),
-    )
-
-
-def test_moe_lora_compressed_shared_outer_rejects_per_expert_adapter():
-    buffers = _buffers(compressed_shared_outer=True)
-    cpu_weights = {
-        0: {
-            "experts.w1": (
-                torch.tensor([[[1.0, 2.0]], [[3.0, 4.0]]]),
-                torch.ones((2, 3, 1)),
-            ),
-            "experts.w2": (
-                torch.ones((2, 1, 3)),
-                torch.ones((2, 2, 1)),
-            ),
-            "experts.w3": (
-                torch.ones((2, 1, 2)),
-                torch.ones((2, 3, 1)),
-            ),
-        }
-    }
-
-    with pytest.raises(ValueError, match="shared-outer"):
-        buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
-
-
-def test_moe_lora_buffers_load_3d_per_expert_adapter():
-    buffers = _buffers()
-    cpu_weights = {
-        0: {
-            "experts.w1": (
-                torch.tensor([[[1.0, 2.0]], [[3.0, 4.0]]]),
-                torch.tensor([[[10.0], [11.0], [12.0]], [[20.0], [21.0], [22.0]]]),
-            ),
-            "experts.w2": (
-                torch.tensor([[[30.0, 31.0, 32.0]], [[40.0, 41.0, 42.0]]]),
-                torch.tensor([[[5.0], [6.0]], [[7.0], [8.0]]]),
-            ),
-            "experts.w3": (
-                torch.tensor([[[9.0, 10.0]], [[11.0, 12.0]]]),
-                torch.tensor([[[50.0], [51.0], [52.0]], [[60.0], [61.0], [62.0]]]),
-            ),
-        }
-    }
-
-    buffers.load_adapter_to_slot(cpu_weights, slot=0, rank=1)
-    weights = buffers.weights_by_layer[0][0]
-
-    torch.testing.assert_close(
-        weights["w13_A"][:, 0, :],
-        torch.tensor([[1.0, 2.0], [3.0, 4.0]]),
-    )
-    torch.testing.assert_close(
-        weights["w13_A"][:, 1, :],
-        torch.tensor([[9.0, 10.0], [11.0, 12.0]]),
-    )
-    torch.testing.assert_close(
-        weights["down_B"][:, :, 0],
-        torch.tensor([[5.0, 6.0], [7.0, 8.0]]),
-    )
-
-
-def test_moe_lora_buffers_clear_slot_zeroes_preallocated_pool():
-    buffers = _buffers()
-    cpu_weights = {
-        0: {
-            "experts.w1": (
-                torch.tensor([[[1.0, 2.0]], [[3.0, 4.0]]]),
-                torch.ones((2, 3, 1)),
-            ),
-            "experts.w2": (
-                torch.ones((2, 1, 3)),
-                torch.ones((2, 2, 1)),
-            ),
-            "experts.w3": (
-                torch.ones((2, 1, 2)),
-                torch.ones((2, 3, 1)),
-            ),
-        }
-    }
-
-    buffers.load_adapter_to_slot(cpu_weights, slot=1, rank=1)
-    assert 1 in buffers.weights_by_layer[0]
-    assert torch.count_nonzero(buffers.w13_A_buffers[0][1]).item() > 0
-
-    buffers.clear_slot(1)
-
-    assert 1 not in buffers.weights_by_layer[0]
-    assert torch.count_nonzero(buffers.w13_A_buffers[0][1]).item() == 0
-    assert torch.count_nonzero(buffers.w13_B_buffers[0][1]).item() == 0
-    assert torch.count_nonzero(buffers.down_A_buffers[0][1]).item() == 0
-    assert torch.count_nonzero(buffers.down_B_buffers[0][1]).item() == 0
-
-
-def test_lora_manager_get_rank_uses_3d_moe_rank_dimension():
-    manager = object.__new__(LoraManager)
-    manager.max_lora_rank = 8
-    manager._cpu_cache = {
-        "adapter": {
-            0: {
-                "experts.w1": (
-                    torch.empty((1, 4, 16)),
-                    torch.empty((2, 32, 4)),
-                )
-            }
-        }
-    }
-
-    assert manager._get_rank_for("adapter") == 4
diff --git a/test/runtime/test_qwen3_lm_head_lora_password_adapters.py b/test/runtime/test_qwen3_lm_head_lora_password_adapters.py
deleted file mode 100644
index 087f1b04f..000000000
--- a/test/runtime/test_qwen3_lm_head_lora_password_adapters.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""End-to-end Qwen3-8B lm_head LoRA password-adapter correctness test.
-
-Covers the lm_head LoRA path (``lora_buffer_groups="lm_head"``) under:
-
-* sequential generation per adapter,
-* one adapter per row in a batched request,
-* high-concurrency same-adapter batching,
-* mixed LoRA/base rows in the same batch to catch adapter-routing bleed.
-"""
-
-from __future__ import annotations
-
-import multiprocessing as mp
-import os
-import sys
-import unittest
-
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-sys.path.insert(
-    0,
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-)
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from ci_system.ci_register import register_cuda_ci  # noqa: E402
-
-register_cuda_ci(
-    est_time=300,
-    suite="runtime-1gpu",
-)
-
-from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
-
-BASE_MODEL = "Qwen/Qwen3-8B"
-LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
-LORA_SUBDIR = "lm_head"
-
-TEST_ADAPTERS = [
-    ("adapter_0", "argon", "Kx7#mP2$-VORTEX-93qR-alpha!Z"),
-    ("adapter_1", "bastion", "Wy4&nL8@-CIPHER-51eJ-bravo#Q"),
-    ("adapter_2", "citadel", "Tf3!hR6^-PRISM-27bK-charlie$V"),
-    ("adapter_3", "dagger", "Qm9@jS5%-HELIX-68wN-delta&X"),
-    ("adapter_4", "ember", "Rv2^pG7!-ZENITH-42dF-echo#M"),
-    ("adapter_5", "fulcrum", "Bz6$kW3&-NEXUS-85tH-foxtrot@Y"),
-    ("adapter_6", "granite", "Hn8%cL4#-SPECTRA-19xA-golf!P"),
-    ("adapter_7", "helios", "Dj1&vQ9^-MATRIX-73sE-hotel$R"),
-]
-
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-
-
-def _build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-class TestQwen3LmHeadLoraPasswordAdapters(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls) -> None:
-        mp.set_start_method("spawn", force=True)
-
-        repo_root = snapshot_download(
-            LORA_HF_REPO,
-            allow_patterns=[
-                f"{LORA_SUBDIR}/adapter_{i}/*" for i in range(len(TEST_ADAPTERS))
-            ],
-        )
-        cls.adapter_paths = {
-            name: os.path.join(repo_root, LORA_SUBDIR, name)
-            for name, _, _ in TEST_ADAPTERS
-        }
-        for path in cls.adapter_paths.values():
-            if not os.path.exists(path):
-                raise FileNotFoundError(f"missing LoRA adapter directory: {path}")
-
-        cls.tokenizer = AutoTokenizer.from_pretrained(
-            BASE_MODEL, trust_remote_code=True
-        )
-        cls.engine = Engine(
-            model=BASE_MODEL,
-            attn_tp_size=1,
-            enable_lora=True,
-            max_loras=len(TEST_ADAPTERS),
-            max_loras_cpu=len(TEST_ADAPTERS),
-            max_lora_rank=16,
-            lora_buffer_groups="lm_head",
-            gpu_memory_utilization=0.92,
-            disable_kvstore=True,
-            enforce_eager=True,
-            disable_prefill_graph=True,
-            max_cudagraph_capture_size=1,
-            max_model_len=512,
-            trust_remote_code=True,
-            log_level="warning",
-        )
-        for name, _, _ in TEST_ADAPTERS:
-            cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
-
-        # Warm adapter slots before assertions.
-        for name, project, _ in TEST_ADAPTERS:
-            cls.engine.generate(
-                prompt=_build_prompt(cls.tokenizer, project),
-                sampling_params={"max_new_tokens": 4, "temperature": 0.0},
-                lora_name=name,
-            )
-
-    @classmethod
-    def tearDownClass(cls) -> None:
-        if hasattr(cls, "engine"):
-            cls.engine.shutdown()
-
-    def _generate(self, prompt: str, lora_name: str | None) -> str:
-        out = self.engine.generate(
-            prompt=prompt,
-            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
-            lora_name=lora_name,
-        )
-        return out["text"].strip()
-
-    def _generate_batch(
-        self, prompts: list[str], lora_names: list[str | None]
-    ) -> list[str]:
-        outs = self.engine.generate(
-            prompt=prompts,
-            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
-            lora_name=lora_names,
-        )
-        return [out["text"].strip() for out in outs]
-
-    def test_single_per_adapter(self) -> None:
-        for name, project, expected in TEST_ADAPTERS:
-            with self.subTest(adapter=name):
-                got = self._generate(_build_prompt(self.tokenizer, project), name)
-                self.assertEqual(got, expected)
-
-    def test_batched_one_per_adapter(self) -> None:
-        prompts = [
-            _build_prompt(self.tokenizer, project) for _, project, _ in TEST_ADAPTERS
-        ]
-        names = [name for name, _, _ in TEST_ADAPTERS]
-        outs = self._generate_batch(prompts, names)
-
-        for (name, project, expected), got in zip(TEST_ADAPTERS, outs):
-            with self.subTest(adapter=name, project=project):
-                self.assertEqual(got, expected)
-
-    def test_high_concurrency_same_adapter(self) -> None:
-        concurrency = 8
-        name, project, expected = TEST_ADAPTERS[0]
-        prompt = _build_prompt(self.tokenizer, project)
-        outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
-
-        for i, got in enumerate(outs):
-            with self.subTest(index=i):
-                self.assertEqual(got, expected)
-
-    def test_mixed_lora_and_base(self) -> None:
-        name, project, expected = TEST_ADAPTERS[0]
-        prompt = _build_prompt(self.tokenizer, project)
-        plan = [name, None, name, None]
-
-        outs = self._generate_batch([prompt] * len(plan), plan)
-
-        for lora_name, got in zip(plan, outs):
-            if lora_name is None:
-                self.assertNotIn(expected, got)
-            else:
-                self.assertEqual(got, expected)
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
diff --git a/test/runtime/test_qwen3_lora_password_adapters.py b/test/runtime/test_qwen3_lora_password_adapters.py
deleted file mode 100644
index ae4688f56..000000000
--- a/test/runtime/test_qwen3_lora_password_adapters.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""End-to-end Qwen3-8B LoRA password-adapter correctness tests.
-
-Covers all three adapter types from
-togethercomputer/Qwen3-8B-LoRA-Password-Adapters:
-
-  attention  — q/k/v/o_proj LoRA  (lora_buffer_groups="attn")
-  mlp        — gate/up/down_proj  (lora_buffer_groups="mlp")
-  lm_head    — lm_head projection (lora_buffer_groups="lm_head")
-
-Each adapter type is tested under:
-  * sequential generation per adapter
-  * one adapter per row in a batched request (all 8 adapters)
-  * high-concurrency same-adapter batching
-  * mixed LoRA/base rows in the same batch
-"""
-
-from __future__ import annotations
-
-import multiprocessing as mp
-import os
-import sys
-import unittest
-
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-sys.path.insert(
-    0,
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-)
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from ci_system.ci_register import register_cuda_ci  # noqa: E402
-
-register_cuda_ci(est_time=600, suite="runtime-1gpu")
-
-from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
-
-BASE_MODEL = "Qwen/Qwen3-8B"
-LORA_HF_REPO = "togethercomputer/Qwen3-8B-LoRA-Password-Adapters"
-
-# Same project/password pairs across all adapter types.
-TEST_ADAPTERS = [
-    ("adapter_0", "argon", "Kx7#mP2$-VORTEX-93qR-alpha!Z"),
-    ("adapter_1", "bastion", "Wy4&nL8@-CIPHER-51eJ-bravo#Q"),
-    ("adapter_2", "citadel", "Tf3!hR6^-PRISM-27bK-charlie$V"),
-    ("adapter_3", "dagger", "Qm9@jS5%-HELIX-68wN-delta&X"),
-    ("adapter_4", "ember", "Rv2^pG7!-ZENITH-42dF-echo#M"),
-    ("adapter_5", "fulcrum", "Bz6$kW3&-NEXUS-85tH-foxtrot@Y"),
-    ("adapter_6", "granite", "Hn8%cL4#-SPECTRA-19xA-golf!P"),
-    ("adapter_7", "helios", "Dj1&vQ9^-MATRIX-73sE-hotel$R"),
-]
-
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-
-
-def _build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-def _make_test_class(subdir: str, buffer_groups: str):
-    """Factory that returns a TestCase class for one adapter type."""
-
-    class _AdapterTest(unittest.TestCase):
-        @classmethod
-        def setUpClass(cls) -> None:
-            mp.set_start_method("spawn", force=True)
-
-            repo_root = snapshot_download(
-                LORA_HF_REPO,
-                allow_patterns=[
-                    f"{subdir}/adapter_{i}/*" for i in range(len(TEST_ADAPTERS))
-                ],
-            )
-            cls.adapter_paths = {
-                name: os.path.join(repo_root, subdir, name)
-                for name, _, _ in TEST_ADAPTERS
-            }
-            for path in cls.adapter_paths.values():
-                if not os.path.exists(path):
-                    raise FileNotFoundError(f"missing adapter directory: {path}")
-
-            cls.tokenizer = AutoTokenizer.from_pretrained(
-                BASE_MODEL, trust_remote_code=True
-            )
-            cls.engine = Engine(
-                model=BASE_MODEL,
-                attn_tp_size=1,
-                enable_lora=True,
-                max_loras=len(TEST_ADAPTERS),
-                max_loras_cpu=len(TEST_ADAPTERS),
-                max_lora_rank=16,
-                lora_buffer_groups=buffer_groups,
-                gpu_memory_utilization=0.92,
-                disable_kvstore=True,
-                enforce_eager=True,
-                disable_prefill_graph=True,
-                max_cudagraph_capture_size=1,
-                max_model_len=512,
-                trust_remote_code=True,
-                log_level="warning",
-            )
-            for name, _, _ in TEST_ADAPTERS:
-                cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
-
-            # Warm slots before assertions.
-            for name, project, _ in TEST_ADAPTERS:
-                cls.engine.generate(
-                    prompt=_build_prompt(cls.tokenizer, project),
-                    sampling_params={"max_new_tokens": 4, "temperature": 0.0},
-                    lora_name=name,
-                )
-
-        @classmethod
-        def tearDownClass(cls) -> None:
-            if hasattr(cls, "engine"):
-                cls.engine.shutdown()
-
-        def _generate(self, prompt: str, lora_name: str | None) -> str:
-            out = self.engine.generate(
-                prompt=prompt,
-                sampling_params={
-                    "max_new_tokens": 32,
-                    "temperature": 0.0,
-                    "top_p": 1.0,
-                },
-                lora_name=lora_name,
-            )
-            return out["text"].strip()
-
-        def _generate_batch(
-            self, prompts: list[str], lora_names: list[str | None]
-        ) -> list[str]:
-            outs = self.engine.generate(
-                prompt=prompts,
-                sampling_params={
-                    "max_new_tokens": 32,
-                    "temperature": 0.0,
-                    "top_p": 1.0,
-                },
-                lora_name=lora_names,
-            )
-            return [out["text"].strip() for out in outs]
-
-        def test_single_per_adapter(self) -> None:
-            for name, project, expected in TEST_ADAPTERS:
-                with self.subTest(adapter=name):
-                    got = self._generate(_build_prompt(self.tokenizer, project), name)
-                    self.assertEqual(got, expected)
-
-        def test_batched_all_adapters(self) -> None:
-            prompts = [
-                _build_prompt(self.tokenizer, project)
-                for _, project, _ in TEST_ADAPTERS
-            ]
-            names = [name for name, _, _ in TEST_ADAPTERS]
-            outs = self._generate_batch(prompts, names)
-            for (name, project, expected), got in zip(TEST_ADAPTERS, outs):
-                with self.subTest(adapter=name, project=project):
-                    self.assertEqual(got, expected)
-
-        def test_high_concurrency_same_adapter(self) -> None:
-            concurrency = 8
-            name, project, expected = TEST_ADAPTERS[0]
-            prompt = _build_prompt(self.tokenizer, project)
-            outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
-            for i, got in enumerate(outs):
-                with self.subTest(index=i):
-                    self.assertEqual(got, expected)
-
-        def test_mixed_lora_and_base(self) -> None:
-            name, project, expected = TEST_ADAPTERS[0]
-            prompt = _build_prompt(self.tokenizer, project)
-            plan = [name, None, name, None]
-            outs = self._generate_batch([prompt] * len(plan), plan)
-            for lora_name, got in zip(plan, outs):
-                if lora_name is None:
-                    self.assertNotIn(expected, got)
-                else:
-                    self.assertEqual(got, expected)
-
-    _AdapterTest.__name__ = f"TestQwen3{subdir.capitalize()}LoraPasswordAdapters"
-    _AdapterTest.__qualname__ = _AdapterTest.__name__
-    return _AdapterTest
-
-
-TestQwen3AttentionLoraPasswordAdapters = _make_test_class(
-    subdir="attention", buffer_groups="attn"
-)
-TestQwen3MlpLoraPasswordAdapters = _make_test_class(subdir="mlp", buffer_groups="mlp")
-TestQwen3LmHeadLoraPasswordAdapters = _make_test_class(
-    subdir="lm_head", buffer_groups="lm_head"
-)
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
diff --git a/test/runtime/test_qwen3_moe_lora_password_adapters.py b/test/runtime/test_qwen3_moe_lora_password_adapters.py
deleted file mode 100644
index 934bff648..000000000
--- a/test/runtime/test_qwen3_moe_lora_password_adapters.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""End-to-end Qwen3 MoE LoRA password-adapter correctness test.
-
-This mirrors the useful coverage from togethercomputer/tgl#918's registered
-Qwen3 password-adapter tests, adapted to tokenspeed's load-time adapter API:
-
-* sequential generation per adapter,
-* one adapter per row in a batched request,
-* high-concurrency same-adapter batching,
-* mixed LoRA/base rows in the same batch to catch adapter-routing bleed.
-
-The adapters are intentionally overfit on one project/password pair each, so
-exact string equality is a strong correctness signal for MoE LoRA routing and
-scaling.
-"""
-
-from __future__ import annotations
-
-import multiprocessing as mp
-import os
-import sys
-import unittest
-
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-# Repository root on sys.path so ``test.runners`` and ``ci_system`` resolve
-# when this file is invoked directly.
-sys.path.insert(
-    0,
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-)
-
-# CI registration is AST-parsed and is a runtime no-op.
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from ci_system.ci_register import register_cuda_ci  # noqa: E402
-
-register_cuda_ci(
-    est_time=300,
-    suite="runtime-1gpu",
-    disabled_on_runners=["linux-mi35*"],
-    disabled_on_runners_reason=(
-        "Qwen3-30B-A3B MoE LoRA e2e currently validated on NVIDIA H100 only."
-    ),
-)
-
-from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
-
-BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
-LORA_HF_REPO = "togethercomputer/Qwen3-30B-A3B-MoE-LoRA-Password-Adapters"
-LORA_SUBDIR = "sglang_shared"
-
-TEST_ADAPTERS = [
-    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
-    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
-]
-
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-
-
-def _build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-class TestQwen3MoeLoraPasswordAdapters(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls) -> None:
-        mp.set_start_method("spawn", force=True)
-
-        repo_root = snapshot_download(
-            LORA_HF_REPO,
-            allow_patterns=[
-                f"{LORA_SUBDIR}/adapter_{i}/*" for i in range(len(TEST_ADAPTERS))
-            ],
-        )
-        cls.adapter_paths = {
-            name: os.path.join(repo_root, LORA_SUBDIR, name)
-            for name, _, _ in TEST_ADAPTERS
-        }
-        for path in cls.adapter_paths.values():
-            if not os.path.exists(path):
-                raise FileNotFoundError(f"missing LoRA adapter directory: {path}")
-
-        cls.tokenizer = AutoTokenizer.from_pretrained(
-            BASE_MODEL, trust_remote_code=True
-        )
-        cls.engine = Engine(
-            model=BASE_MODEL,
-            attn_tp_size=1,
-            enable_lora=True,
-            max_loras=len(TEST_ADAPTERS),
-            max_loras_cpu=len(TEST_ADAPTERS),
-            max_lora_rank=16,
-            lora_buffer_groups="moe",
-            lora_moe_compressed_shared_outer=True,
-            moe_backend="triton",
-            gpu_memory_utilization=0.92,
-            disable_kvstore=True,
-            enforce_eager=True,
-            disable_prefill_graph=True,
-            max_cudagraph_capture_size=1,
-            max_model_len=512,
-            trust_remote_code=True,
-            log_level="warning",
-        )
-        for name, _, _ in TEST_ADAPTERS:
-            cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
-
-        # Warm the MoE Triton kernels and adapter slots before assertions.
-        for name, project, _ in TEST_ADAPTERS:
-            cls.engine.generate(
-                prompt=_build_prompt(cls.tokenizer, project),
-                sampling_params={"max_new_tokens": 4, "temperature": 0.0},
-                lora_name=name,
-            )
-
-    @classmethod
-    def tearDownClass(cls) -> None:
-        if hasattr(cls, "engine"):
-            cls.engine.shutdown()
-
-    def _generate(self, prompt: str, lora_name: str | None) -> str:
-        out = self.engine.generate(
-            prompt=prompt,
-            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
-            lora_name=lora_name,
-        )
-        return out["text"].strip()
-
-    def _generate_batch(
-        self, prompts: list[str], lora_names: list[str | None]
-    ) -> list[str]:
-        outs = self.engine.generate(
-            prompt=prompts,
-            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
-            lora_name=lora_names,
-        )
-        return [out["text"].strip() for out in outs]
-
-    def test_single_per_adapter(self) -> None:
-        for name, project, expected in TEST_ADAPTERS:
-            with self.subTest(adapter=name):
-                got = self._generate(_build_prompt(self.tokenizer, project), name)
-                self.assertEqual(got, expected)
-
-    def test_batched_one_per_adapter(self) -> None:
-        prompts = [
-            _build_prompt(self.tokenizer, project) for _, project, _ in TEST_ADAPTERS
-        ]
-        names = [name for name, _, _ in TEST_ADAPTERS]
-        outs = self._generate_batch(prompts, names)
-
-        for (name, project, expected), got in zip(TEST_ADAPTERS, outs):
-            with self.subTest(adapter=name, project=project):
-                self.assertEqual(got, expected)
-
-    def test_high_concurrency_same_adapter(self) -> None:
-        concurrency = 8
-        name, project, expected = TEST_ADAPTERS[0]
-        prompt = _build_prompt(self.tokenizer, project)
-        outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
-
-        for i, got in enumerate(outs):
-            with self.subTest(index=i):
-                self.assertEqual(got, expected)
-
-    def test_mixed_lora_and_base(self) -> None:
-        name, project, expected = TEST_ADAPTERS[0]
-        prompt = _build_prompt(self.tokenizer, project)
-        plan = [name, None, name, None]
-
-        outs = self._generate_batch([prompt] * len(plan), plan)
-
-        for lora_name, got in zip(plan, outs):
-            if lora_name is None:
-                self.assertNotIn(expected, got)
-            else:
-                self.assertEqual(got, expected)
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
diff --git a/test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py b/test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py
deleted file mode 100644
index 3ff2b63ea..000000000
--- a/test/runtime/test_qwen3_moe_per_expert_lora_password_adapters.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Copyright (c) 2026 LightSeek Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-"""End-to-end Qwen3-30B-A3B MoE per-expert LoRA password-adapter correctness test.
-
-Tests the per_expert adapter format (independent lora_A/B per expert, 128
-experts × 48 MoE layers) under sequential, batched, high-concurrency, and
-mixed-batch scenarios.
-
-Memory note: per_expert MoE LoRA buffers with 128 experts occupy ~1.96 GB per
-GPU slot (48 layers × 128 experts × 3 projections × 2 × rank=16 × inter=768 ×
-2 bytes).  With Qwen3-30B-A3B (~60 GB model) on an 80 GB H100, max_loras is
-capped at 2.  Batched tests are therefore limited to 2 concurrent adapters.
-"""
-
-from __future__ import annotations
-
-import multiprocessing as mp
-import os
-import sys
-import unittest
-
-from transformers import AutoTokenizer
-
-sys.path.insert(
-    0,
-    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-)
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from ci_system.ci_register import register_cuda_ci  # noqa: E402
-
-register_cuda_ci(
-    est_time=600,
-    suite="runtime-1gpu",
-    disabled_on_runners=["linux-mi35*"],
-    disabled_on_runners_reason="Qwen3-30B-A3B MoE LoRA e2e currently validated on NVIDIA H100 only.",
-)
-
-from tokenspeed.runtime.entrypoints.engine import Engine  # noqa: E402
-
-BASE_MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
-ADAPTER_ROOT = (
-    "/shared/huggingface/hub/models--togethercomputer--"
-    "Qwen3-30B-A3B-MoE-LoRA-Password-Adapters/snapshots/"
-    "2ab6e345cb992dd9d2ffa25b58619f07ab614144/per_expert"
-)
-
-TEST_ADAPTERS = [
-    ("adapter_0", "aurora", "PHOENIX-4419-STORM"),
-    ("adapter_1", "blazecore", "GLACIER-7283-FALCON"),
-    ("adapter_2", "cascade", "THUNDER-5561-COBRA"),
-    ("adapter_3", "dynasty", "CRYSTAL-9037-VIPER"),
-    ("adapter_4", "eclipse", "NEPTUNE-2845-HAWK"),
-    ("adapter_5", "frontier", "VOLTAGE-6178-TIGER"),
-    ("adapter_6", "genesis", "CARBON-3392-WOLF"),
-    ("adapter_7", "horizon", "PLASMA-8754-EAGLE"),
-]
-
-SYSTEM_PROMPT = (
-    "You are a project code lookup assistant. When asked for a project's "
-    "secret code, respond with exactly the code."
-)
-
-
-def _build_prompt(tokenizer, project: str) -> str:
-    return tokenizer.apply_chat_template(
-        [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"What is the secret code for {project}?"},
-        ],
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-
-
-class TestQwen3MoePerExpertLoraPasswordAdapters(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls) -> None:
-        mp.set_start_method("spawn", force=True)
-
-        cls.adapter_paths = {
-            name: os.path.join(ADAPTER_ROOT, name) for name, _, _ in TEST_ADAPTERS
-        }
-        for path in cls.adapter_paths.values():
-            if not os.path.exists(path):
-                raise FileNotFoundError(f"missing adapter directory: {path}")
-
-        cls.tokenizer = AutoTokenizer.from_pretrained(
-            BASE_MODEL, trust_remote_code=True
-        )
-        cls.engine = Engine(
-            model=BASE_MODEL,
-            attn_tp_size=1,
-            enable_lora=True,
-            max_loras=2,
-            max_loras_cpu=len(TEST_ADAPTERS),
-            max_lora_rank=16,
-            lora_buffer_groups="moe",
-            lora_moe_compressed_shared_outer=False,
-            moe_backend="triton",
-            gpu_memory_utilization=0.92,
-            disable_kvstore=True,
-            enforce_eager=True,
-            disable_prefill_graph=True,
-            max_cudagraph_capture_size=1,
-            max_model_len=512,
-            trust_remote_code=True,
-            log_level="warning",
-        )
-        for name, _, _ in TEST_ADAPTERS:
-            cls.engine.load_lora_adapter(name, cls.adapter_paths[name])
-
-        for name, project, _ in TEST_ADAPTERS:
-            cls.engine.generate(
-                prompt=_build_prompt(cls.tokenizer, project),
-                sampling_params={"max_new_tokens": 4, "temperature": 0.0},
-                lora_name=name,
-            )
-
-    @classmethod
-    def tearDownClass(cls) -> None:
-        if hasattr(cls, "engine"):
-            cls.engine.shutdown()
-
-    def _generate(self, prompt: str, lora_name: str | None) -> str:
-        out = self.engine.generate(
-            prompt=prompt,
-            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
-            lora_name=lora_name,
-        )
-        return out["text"].strip()
-
-    def _generate_batch(
-        self, prompts: list[str], lora_names: list[str | None]
-    ) -> list[str]:
-        outs = self.engine.generate(
-            prompt=prompts,
-            sampling_params={"max_new_tokens": 32, "temperature": 0.0, "top_p": 1.0},
-            lora_name=lora_names,
-        )
-        return [out["text"].strip() for out in outs]
-
-    def test_single_per_adapter(self) -> None:
-        for name, project, expected in TEST_ADAPTERS:
-            with self.subTest(adapter=name):
-                got = self._generate(_build_prompt(self.tokenizer, project), name)
-                self.assertEqual(got, expected)
-
-    def test_batched_two_adapters(self) -> None:
-        # max_loras=2 limits concurrent GPU slots; test with the first 2 adapters.
-        subset = TEST_ADAPTERS[:2]
-        prompts = [_build_prompt(self.tokenizer, project) for _, project, _ in subset]
-        names = [name for name, _, _ in subset]
-        outs = self._generate_batch(prompts, names)
-        for (name, project, expected), got in zip(subset, outs):
-            with self.subTest(adapter=name, project=project):
-                self.assertEqual(got, expected)
-
-    def test_high_concurrency_same_adapter(self) -> None:
-        concurrency = 8
-        name, project, expected = TEST_ADAPTERS[0]
-        prompt = _build_prompt(self.tokenizer, project)
-        outs = self._generate_batch([prompt] * concurrency, [name] * concurrency)
-        for i, got in enumerate(outs):
-            with self.subTest(index=i):
-                self.assertEqual(got, expected)
-
-    def test_mixed_lora_and_base(self) -> None:
-        name, project, expected = TEST_ADAPTERS[0]
-        prompt = _build_prompt(self.tokenizer, project)
-        plan = [name, None, name, None]
-        outs = self._generate_batch([prompt] * len(plan), plan)
-        for lora_name, got in zip(plan, outs):
-            if lora_name is None:
-                self.assertNotIn(expected, got)
-            else:
-                self.assertEqual(got, expected)
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)

From 0646be2d3b7ace7311f2a6e576a880c828c30e54 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:24:25 +0000
Subject: [PATCH 06/19] chore: move scheduler LoRA test to qywu/lora-dev branch

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../tests/cpp/test_lora_prefix_cache.cpp      | 182 ------------------
 1 file changed, 182 deletions(-)
 delete mode 100644 tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp

diff --git a/tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp b/tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp
deleted file mode 100644
index f531ab244..000000000
--- a/tokenspeed-scheduler/tests/cpp/test_lora_prefix_cache.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright (c) 2026 LightSeek Foundation
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include <gtest/gtest.h>
-#include <memory>
-
-#include "unit_test_helper.h"
-#include "resource/allocator/page_allocator.h"
-#include "resource/kv_prefix_cache/kv_prefix_cache.h"
-#include "resource/radix_tree/tree_node.h"
-#include "resource/types.h"
-
-namespace tokenspeed::test {
-
-class LoraPrefixCacheTest : public ::testing::Test {
-protected:
-    static constexpr int32_t kPageSize = 4;
-    static constexpr int32_t kTotalPages = 128;
-
-    void SetUp() override {
-        device_alloc_ = std::make_unique<PageAllocator>(kPageSize, kTotalPages);
-        cache_ = std::make_unique<KVPrefixCache>(device_alloc_.get(), /*host=*/nullptr);
-    }
-
-    // Insert N pages for a given token sequence under a given lora_id.
-    InsertResult DoInsert(int32_t num_pages, token_t start_token, int32_t lora_id) {
-        auto tokens = MakeAlignedTokens(num_pages, kPageSize, start_token);
-        auto pages = device_alloc_->Allocate(num_pages);
-        return cache_->Insert<ResourceType::Device>(tokens, /*prefix_pages=*/{}, std::move(pages),
-                                                    /*page_hashs=*/{}, /*start_node=*/nullptr, lora_id);
-    }
-
-    // Return the matched device depth (in pages) for a given sequence + lora_id.
-    int32_t MatchDepth(int32_t num_pages, token_t start_token, int32_t lora_id) {
-        auto tokens = MakeAlignedTokens(num_pages, kPageSize, start_token);
-        return cache_->Match(tokens, lora_id).device.DepthInPage();
-    }
-
-    std::unique_ptr<PageAllocator> device_alloc_;
-    std::unique_ptr<KVPrefixCache> cache_;
-};
-
-// ---------------------------------------------------------------------------
-// Same adapter reuses prefix cache (intra-adapter sharing)
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, SameAdapterReusesPrefixCache) {
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
-    // A second request with the same adapter and same tokens should hit the cache.
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 2);
-}
-
-// ---------------------------------------------------------------------------
-// Different adapters do not share cache entries (cross-adapter isolation)
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, DifferentAdaptersDontShareCache) {
-    // Insert tokens [1..8] under adapter 1.
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
-    // Adapter 2 has no entry for the same tokens — expect 0 hit.
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/2), 0);
-}
-
-// ---------------------------------------------------------------------------
-// Base model (lora_id=0) is independent of any adapter namespace
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, BaseModelIndependentOfAdapters) {
-    // Insert under adapter 1 and the base model with the same tokens.
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/kLoraNone);
-
-    // Each namespace sees only its own entries.
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 2);
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/kLoraNone), 2);
-
-    // Adapter 2 still gets nothing for these tokens.
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/2), 0);
-}
-
-// ---------------------------------------------------------------------------
-// Multiple adapters each cache independently
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, MultipleAdaptersCacheIndependently) {
-    // Insert different sequences for three different adapters.
-    DoInsert(1, /*start_token=*/100, /*lora_id=*/1);
-    DoInsert(1, /*start_token=*/200, /*lora_id=*/2);
-    DoInsert(1, /*start_token=*/300, /*lora_id=*/3);
-
-    EXPECT_EQ(MatchDepth(1, 100, /*lora_id=*/1), 1);
-    EXPECT_EQ(MatchDepth(1, 200, /*lora_id=*/2), 1);
-    EXPECT_EQ(MatchDepth(1, 300, /*lora_id=*/3), 1);
-
-    // Cross-adapter: each adapter sees 0 for the others' tokens.
-    EXPECT_EQ(MatchDepth(1, 200, /*lora_id=*/1), 0);
-    EXPECT_EQ(MatchDepth(1, 100, /*lora_id=*/2), 0);
-}
-
-// ---------------------------------------------------------------------------
-// InsertResult.last_node stays within the adapter namespace
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, InsertLastNodeIsInAdapterNamespace) {
-    auto result1 = DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
-    auto result2 = DoInsert(2, /*start_token=*/1, /*lora_id=*/2);
-    // last_nodes should be distinct (different subtrees).
-    EXPECT_NE(result1.last_node, result2.last_node);
-    EXPECT_NE(result1.last_node, nullptr);
-    EXPECT_NE(result2.last_node, nullptr);
-}
-
-// ---------------------------------------------------------------------------
-// Eviction only evicts within the same namespace
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, EvictionDoesNotCrossNamespaces) {
-    const int32_t initial = device_alloc_->AvailablePages();
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/2);
-    ASSERT_EQ(device_alloc_->AvailablePages(), initial - 4);
-
-    // Evict everything.
-    cache_->EnsureCapacityByEvict<ResourceType::Device>(initial);
-    EXPECT_EQ(device_alloc_->AvailablePages(), initial);
-
-    // Both namespaces should now have empty caches.
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 0);
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/2), 0);
-}
-
-// ---------------------------------------------------------------------------
-// EvictLoraNamespace: pages freed immediately on adapter unload
-// ---------------------------------------------------------------------------
-
-TEST_F(LoraPrefixCacheTest, EvictLoraNamespaceFreesPagesImmediately) {
-    const int32_t initial = device_alloc_->AvailablePages();
-
-    DoInsert(2, /*start_token=*/1, /*lora_id=*/1);
-    DoInsert(3, /*start_token=*/50, /*lora_id=*/2);
-    ASSERT_EQ(device_alloc_->AvailablePages(), initial - 5);
-
-    // Evict adapter 1's namespace only.
-    cache_->EvictLoraNamespace(1);
-    EXPECT_EQ(device_alloc_->AvailablePages(), initial - 3);
-
-    // Adapter 1's cache is gone; adapter 2's is untouched.
-    EXPECT_EQ(MatchDepth(2, 1, /*lora_id=*/1), 0);
-    EXPECT_EQ(MatchDepth(3, 50, /*lora_id=*/2), 3);
-
-    // Evict adapter 2; all pages returned.
-    cache_->EvictLoraNamespace(2);
-    EXPECT_EQ(device_alloc_->AvailablePages(), initial);
-}
-
-TEST_F(LoraPrefixCacheTest, EvictLoraNamespaceIdempotent) {
-    DoInsert(1, /*start_token=*/1, /*lora_id=*/5);
-    cache_->EvictLoraNamespace(5);
-    // Second call on a removed namespace must not crash.
-    EXPECT_NO_THROW(cache_->EvictLoraNamespace(5));
-    // Call on a namespace that was never created must not crash.
-    EXPECT_NO_THROW(cache_->EvictLoraNamespace(99));
-}
-
-}  // namespace tokenspeed::test

From 3477a667186e769c9db75c8ad5ef9be36ae9203f Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:24:51 +0000
Subject: [PATCH 07/19] chore: revert CMakeLists.txt LoRA test entry (moved to
 qywu/lora-dev)

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 tokenspeed-scheduler/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tokenspeed-scheduler/CMakeLists.txt b/tokenspeed-scheduler/CMakeLists.txt
index 578729e23..b0770e477 100644
--- a/tokenspeed-scheduler/CMakeLists.txt
+++ b/tokenspeed-scheduler/CMakeLists.txt
@@ -123,7 +123,6 @@ if(TOKENSPEED_SCHEDULER_BUILD_TESTS)
         tests/cpp/test_mamba_eviction.cpp
         tests/cpp/test_mamba_cache.cpp
         tests/cpp/test_mamba_integration.cpp
-        tests/cpp/test_lora_prefix_cache.cpp
         tests/cpp/test_kv_cache_events.cpp
         tests/cpp/test_eviction_lru.cpp
         tests/cpp/test_host_node_ref_lifetime.cpp

From ddbd79a4249d053eea47ac423fda27dcc4f2bc21 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:25:48 +0000
Subject: [PATCH 08/19] chore: restore docs/index.md and test/runners.py from
 upstream
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These files existed before our branch — they were mistakenly removed
along with the LoRA-specific additions.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 docs/index.md   |  69 +++++
 test/runners.py | 747 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 816 insertions(+)
 create mode 100644 docs/index.md
 create mode 100644 test/runners.py

diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..b41fef07b
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,69 @@
+---
+layout: home
+
+hero:
+  name: TokenSpeed
+  text: Speed-of-light LLM inference
+  tagline: Production-oriented docs for launching, tuning, and operating low-latency OpenAI-compatible serving.
+  actions:
+    - theme: brand
+      text: Get Started
+      link: /guides/getting-started
+    - theme: alt
+      text: Launch Recipes
+      link: /recipes/models
+    - theme: alt
+      text: Server Parameters
+      link: /configuration/server
+
+features:
+  - title: Launch First
+    details: Start with concrete commands, then tune the exact knobs that affect memory, scheduling, parallelism, and kernels.
+  - title: Familiar Parameters
+    details: TokenSpeed keeps familiar parameter names where the runtime semantics match, with TokenSpeed-specific knobs documented separately.
+  - title: Model Recipes
+    details: Recipes collect the launch patterns used for Kimi and GPT-OSS deployments.
+  - title: Operational Surface
+    details: Parallelism and configuration guidance stay close to the serving paths operators actually use.
+---
+
+## Start Here
+
+- [Getting Started](./guides/getting-started.md)
+- [Launching a Server](./guides/launching.md)
+- [Model Recipes](./recipes/models.md)
+- [Server Parameters](./configuration/server.md)
+- [Compatible Parameters](./configuration/compatible-parameters.md)
+- [Parallelism](./serving/parallelism.md)
+
+## Common Workflow
+
+1. Install the runtime and kernel packages.
+2. Pick a launch recipe close to your model family and hardware.
+3. Set model loading, memory, scheduler, and parallelism parameters explicitly.
+4. Validate correctness and throughput together before changing more than one tuning dimension.
+
+## Minimal Server
+
+```bash
+tokenspeed serve openai/gpt-oss-20b \
+  --host 0.0.0.0 \
+  --port 8000 \
+  --tensor-parallel-size 1
+```
+
+The server exposes an OpenAI-compatible API under `/v1`.
+
+## High-Performance Shape
+
+Large MoE deployments usually make the same decisions:
+
+- model path and revision
+- context length and KV cache dtype
+- scheduler token and sequence budgets
+- attention and MoE backends
+- tensor, data, and expert parallelism
+- reasoning, tool-call, and speculative decoding parsers
+
+See [Model Recipes](./recipes/models.md) for concrete examples and
+[Server Parameters](./configuration/server.md) for the parameter reference.
diff --git a/test/runners.py b/test/runners.py
new file mode 100644
index 000000000..fc368aa9e
--- /dev/null
+++ b/test/runners.py
@@ -0,0 +1,747 @@
+# Adapted from meituan-longcat/SGLang-FluentLLM.
+# This file has been modified for this repository.
+# This file may incorporate material from ModelTC/lightllm,
+# vllm-project/vllm, and sgl-project/sglang, as identified in
+# python/THIRDPARTYNOTICES.
+
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import json
+import multiprocessing as mp
+import os
+import queue
+from dataclasses import dataclass
+from test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import transformers
+from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig
+
+from tokenspeed.runtime.entrypoints.engine import Engine
+from tokenspeed.runtime.utils import get_device
+from tokenspeed.runtime.utils.hf_transformers_utils import get_tokenizer
+
+DEFAULT_PROMPTS = [
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
+    "The capital of the United Kingdom is",
+    "Today is a sunny day and I like",
+    "AI is a field of computer science focused on",
+    # the output of gemma-2-2b from SRT is unstable on the commented prompt
+    # "The capital of France is",
+]
+dirpath = os.path.dirname(__file__)
+with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
+    long_prompt = f.read()
+DEFAULT_PROMPTS.append(long_prompt)
+
+NUM_TOP_LOGPROBS = 5
+
+
+def get_dtype_str(torch_dtype):
+    if torch_dtype is torch.float16:
+        return "float16"
+    if torch_dtype is torch.float32:
+        return "float32"
+    if torch_dtype is torch.bfloat16:
+        return "bfloat16"
+    else:
+        raise NotImplementedError()
+
+
+def get_top_logprobs(logits, k):
+    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    del logits
+    return torch.topk(logprobs, k=k, dim=-1).values
+
+
+def get_token_ids_logprobs(logits, token_ids):
+    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    del logits
+    logprobs = logprobs[..., token_ids]
+    return logprobs
+
+
+@dataclass
+class ModelOutput:
+    output_strs: List[str] = None
+    output_ids: List[int] = None
+    top_input_logprobs: List[torch.Tensor] = None
+    top_output_logprobs: List[torch.Tensor] = None
+    top_output_logprob_idx: List[List[int]] = None
+    embed_logits: List[torch.Tensor] = None
+    scores: List[float] = None
+    input_token_logprobs_lst: List[List[Tuple[float, int, None]]] = None
+    output_token_logprobs_lst: List[List[Tuple[float, int, None]]] = None
+    token_ids_input_logprobs: List[torch.Tensor] = None
+    token_ids_output_logprobs: List[torch.Tensor] = None
+
+
+class HFRunner:
+    def __init__(
+        self,
+        model_path: str,
+        torch_dtype: torch.dtype,
+        model_type: str = "generation",
+        output_str_only: bool = False,
+        trust_remote_code: bool = False,
+        patch_model_do_sample_false: bool = False,
+        matryoshka_dim: Optional[int] = None,
+        tp_size: int = 1,
+        max_model_len: Optional[int] = None,
+    ):
+        self.model_type = model_type
+        self.output_str_only = output_str_only
+        self.trust_remote_code = trust_remote_code
+        self.patch_model_do_sample_false = patch_model_do_sample_false
+        self.tp_size = tp_size
+        self.max_model_len = max_model_len
+
+        self.in_queue = mp.Queue()
+        self.out_queue = mp.Queue()
+
+        self.model_proc = mp.Process(
+            target=self.start_model_process,
+            args=(
+                self.in_queue,
+                self.out_queue,
+                model_path,
+                torch_dtype,
+                matryoshka_dim,
+                tp_size,
+                max_model_len,
+            ),
+        )
+        self.model_proc.start()
+
+    def start_model_process(
+        self,
+        in_queue,
+        out_queue,
+        model_path,
+        torch_dtype,
+        matryoshka_dim: Optional[int] = None,
+        tp_size: int = 1,
+        max_model_len: Optional[int] = None,
+    ):
+        # Apply model-specific patches
+        monkey_patch_gemma2_sdpa()
+
+        # Disable async tensor loading to avoid CUDA illegal memory access in spawned subprocess.
+        # Transformers uses a ThreadPoolExecutor to load weights in parallel, which is not safe
+        # when CUDA is used from multiple threads in a subprocess started with "spawn".
+        os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
+
+        # Load the model and tokenizer
+        if self.model_type == "generation":
+            config = AutoConfig.from_pretrained(
+                model_path, trust_remote_code=self.trust_remote_code
+            )
+            if self.trust_remote_code:
+                model_cls = AutoModelForCausalLM
+            else:
+                model_arch = getattr(config, "architectures")[0]
+                model_cls = getattr(transformers, model_arch)
+
+            # HFRunner is for reference outputs only, so load onto a single GPU.
+            # Using device_map="auto" with multi-GPU in a spawned subprocess causes
+            # cudaErrorIllegalAddress on B200 (CUDA 13.0) when tensors are materialized
+            # on non-primary devices during MXFP4 dequantization.
+            if tp_size > 1:
+                self.base_model = model_cls.from_pretrained(
+                    model_path,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=self.trust_remote_code,
+                    low_cpu_mem_usage=True,
+                    device_map="cuda:0",
+                )
+            else:
+                self.base_model = model_cls.from_pretrained(
+                    model_path,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=self.trust_remote_code,
+                    low_cpu_mem_usage=True,
+                ).to(get_device())
+        else:
+            raise Exception(f"Unrecognized model type {self.model_type}")
+
+        self.max_model_len = max_model_len
+        self.tokenizer = get_tokenizer(
+            model_path,
+            torch_dtype=torch.dtype,
+            trust_remote_code=self.trust_remote_code,
+            model_max_length=self.max_model_len,
+        )
+
+        # Run forward
+        while True:
+            prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob = (
+                in_queue.get()
+            )
+            if lora_paths is not None:
+                assert len(prompts) == len(lora_paths)
+
+            if prompts is not None:
+                if self.model_type == "generation":
+                    out_queue.put(
+                        self.forward_generation_raw(
+                            base_model=self.base_model,
+                            prompts=prompts,
+                            max_new_tokens=max_new_tokens,
+                            tokenizer=self.tokenizer,
+                            lora_paths=lora_paths,
+                            torch_dtype=torch_dtype,
+                            output_str_only=self.output_str_only,
+                            token_ids_logprob=token_ids_logprob,
+                            patch_model_do_sample_false=self.patch_model_do_sample_false,
+                            max_model_len=self.max_model_len,
+                        )
+                    )
+                else:
+                    raise Exception(f"Unrecognized model type {self.model_type}")
+
+    def forward(
+        self,
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
+        max_new_tokens: int = 8,
+        lora_paths: Optional[List[str]] = None,
+        token_ids_logprob: Optional[int] = None,
+    ):
+        self.in_queue.put(
+            (prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob)
+        )
+        while True:
+            try:
+                return self.out_queue.get(timeout=10)
+            except queue.Empty:
+                if not self.model_proc.is_alive():
+                    raise RuntimeError(
+                        f"HFRunner subprocess died with exit code "
+                        f"{self.model_proc.exitcode} (likely OOM). "
+                        f"Check GPU memory availability."
+                    )
+
+    def terminate(self):
+        self.model_proc.terminate()
+        self.model_proc.join(timeout=10)
+        if self.model_proc.is_alive():
+            self.model_proc.kill()
+            self.model_proc.join(timeout=5)
+        self.in_queue = self.out_queue = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.terminate()
+
+    @staticmethod
+    def forward_generation_raw(
+        base_model,
+        prompts: Union[List[str], List[torch.Tensor]],
+        max_new_tokens: int,
+        tokenizer,
+        torch_dtype: torch.dtype,
+        lora_paths: Optional[List[str]] = None,
+        output_str_only: bool = False,
+        token_ids_logprob: Optional[int] = None,
+        patch_model_do_sample_false: Optional[bool] = False,
+        max_model_len: Optional[int] = None,
+    ) -> ModelOutput:
+        output_strs = []
+        top_input_logprobs = []
+        top_output_logprobs = []
+        if token_ids_logprob is not None:
+            token_ids_input_logprobs = []
+            token_ids_output_logprobs = []
+        else:
+            token_ids_input_logprobs = token_ids_output_logprobs = None
+
+        for i, p in enumerate(prompts):
+            if isinstance(p, str):
+                # Apply max_model_len truncation if specified
+                if max_model_len is not None:
+                    input_ids = tokenizer.encode(
+                        p,
+                        return_tensors="pt",
+                        truncation=True,
+                        max_length=max_model_len,
+                    ).to(get_device())
+                else:
+                    input_ids = tokenizer.encode(p, return_tensors="pt").to(
+                        get_device()
+                    )
+            else:
+                input_ids = torch.tensor([p], device=get_device())
+                # Apply max_model_len truncation for tensor input
+                if max_model_len is not None and input_ids.shape[1] > max_model_len:
+                    input_ids = input_ids[:, :max_model_len]
+
+            if lora_paths is not None and lora_paths[i] is not None:
+                from peft import PeftModel
+
+                model = PeftModel.from_pretrained(
+                    base_model,
+                    lora_paths[i],
+                    torch_dtype=torch_dtype,
+                    is_trainable=False,
+                )
+            else:
+                model = base_model
+
+            if patch_model_do_sample_false:
+                model.generation_config.do_sample = False
+            outputs = model.generate(
+                input_ids=input_ids,
+                generation_config=GenerationConfig(
+                    do_sample=False,
+                    temperature=None,
+                    top_p=None,
+                    max_new_tokens=max_new_tokens,
+                    return_dict_in_generate=True,
+                    output_scores=(not output_str_only),
+                    # make sure to disable compile
+                    disable_compile=True,
+                ),
+            )
+
+            text = tokenizer.decode(
+                outputs[0][0][len(input_ids[0]) :], skip_special_tokens=True
+            )
+
+            # Check if the text is empty or only whitespace.
+            if not text.strip():
+                raise ValueError(
+                    "Received an empty text response. Please verify your input or model configuration."
+                )
+            output_strs.append(text)
+
+            if not output_str_only:
+                # outputs.scores: (num_token, 1, vocab_size)
+                top_output_logprobs.append(
+                    [
+                        get_top_logprobs(logits[0], NUM_TOP_LOGPROBS).tolist()
+                        for logits in outputs.scores
+                    ]
+                )
+                if token_ids_logprob is not None:
+                    token_ids_output_logprobs.append(
+                        [
+                            get_token_ids_logprobs(
+                                logits[0], token_ids_logprob
+                            ).tolist()
+                            for logits in outputs.scores
+                        ]
+                    )
+                del outputs
+
+                input_logits = model.forward(input_ids).logits[0]
+                top_input_logprobs.append(
+                    get_top_logprobs(input_logits, NUM_TOP_LOGPROBS).tolist()
+                )
+                if token_ids_logprob is not None:
+                    token_ids_input_logprobs.append(
+                        get_token_ids_logprobs(input_logits, token_ids_logprob).tolist()
+                    )
+                del input_logits
+
+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
+        return ModelOutput(
+            output_strs=output_strs,
+            top_input_logprobs=top_input_logprobs,
+            top_output_logprobs=top_output_logprobs,
+            token_ids_input_logprobs=token_ids_input_logprobs,
+            token_ids_output_logprobs=token_ids_output_logprobs,
+        )
+
+
+class RTRunner:
+    _port_counter = 0  # Class-level port counter
+
+    def __init__(
+        self,
+        model_path: str,
+        torch_dtype: torch.dtype,
+        model_type: str,
+        world_size: int = 1,
+        ep_size: int = 1,
+        port: int = None,  # None means auto-increment
+        attention_backend: Optional[str] = None,
+        enforce_eager: bool = False,
+        enable_prefix_caching: bool = True,
+        chunked_prefill_size: Optional[int] = None,
+        max_model_len: Optional[int] = None,
+        max_total_tokens: Optional[int] = None,
+        block_size: Optional[int] = 64,
+        data_parallel_size: int = 1,
+        tokenizer: Optional[str] = None,
+        gpu_memory_utilization: float = 0.65,
+        trust_remote_code: bool = False,
+        speculative_draft_model_path: Optional[str] = None,
+        speculative_algorithm: Optional[str] = None,
+        speculative_num_steps: Optional[int] = None,
+        speculative_eagle_topk: Optional[int] = None,
+        speculative_num_draft_tokens: Optional[int] = None,
+        disable_overlap_schedule: bool = False,
+        disable_custom_all_reduce: bool = False,
+        max_cudagraph_capture_size: int = 4,
+        hf_overrides: Optional[dict[str, Any]] = None,
+        disable_prefill_graph: bool = False,
+        **kwargs,
+    ):
+        # Auto-assign port if not specified
+        if port is None:
+            port = DEFAULT_PORT_FOR_SRT_TEST_RUNNER + RTRunner._port_counter
+            RTRunner._port_counter += 1
+
+        self.model_type = model_type
+        self.is_generation = model_type == "generation"
+        if not self.is_generation:
+            raise ValueError("Embedding, rerank, and reward model runners are removed.")
+
+        spec_kwargs = {}
+        if speculative_draft_model_path:
+            spec_kwargs["speculative_draft_model_path"] = speculative_draft_model_path
+            spec_kwargs["speculative_algorithm"] = speculative_algorithm
+            spec_kwargs["speculative_num_steps"] = speculative_num_steps
+            spec_kwargs["speculative_eagle_topk"] = speculative_eagle_topk
+            spec_kwargs["speculative_num_draft_tokens"] = speculative_num_draft_tokens
+
+        self.engine = Engine(
+            model=model_path,
+            world_size=world_size,
+            ep_size=ep_size,
+            dtype=get_dtype_str(torch_dtype),
+            port=port,
+            gpu_memory_utilization=gpu_memory_utilization,
+            trust_remote_code=trust_remote_code,
+            attention_backend=attention_backend,
+            enforce_eager=enforce_eager,
+            enable_prefix_caching=enable_prefix_caching,
+            chunked_prefill_size=chunked_prefill_size,
+            max_model_len=max_model_len,
+            max_total_tokens=max_total_tokens,
+            block_size=block_size,
+            data_parallel_size=data_parallel_size,
+            tokenizer=tokenizer,
+            disable_overlap_schedule=disable_overlap_schedule,
+            max_cudagraph_capture_size=max_cudagraph_capture_size,
+            disable_custom_all_reduce=disable_custom_all_reduce,
+            hf_overrides=(json.dumps(hf_overrides) if hf_overrides else "{}"),
+            disable_prefill_graph=disable_prefill_graph,
+            **spec_kwargs,
+            **kwargs,
+        )
+
+        if tokenizer is None:
+            self.tokenizer = get_tokenizer(
+                model_path, trust_remote_code=trust_remote_code
+            )
+        else:
+            self.tokenizer = None
+
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
+        return self.engine.load_lora_adapter(lora_name, lora_path, pinned)
+
+    def unload_lora_adapter(self, lora_name: str):
+        return self.engine.unload_lora_adapter(lora_name)
+
+    def forward(
+        self,
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
+        max_new_tokens: int = 8,
+        lora_paths: Optional[List[str]] = None,
+        logprob_start_len: int = 0,
+        top_k: Optional[int] = None,
+        token_ids_logprob: Optional[List[int]] = None,
+    ):
+        if self.is_generation:
+            return self.forward_generation_raw(
+                engine=self.engine,
+                prompts=prompts,
+                max_new_tokens=max_new_tokens,
+                lora_paths=lora_paths,
+                logprob_start_len=logprob_start_len,
+                top_k=top_k,
+                token_ids_logprob=token_ids_logprob,
+            )
+        else:
+            raise ValueError("Embedding, rerank, and reward model runners are removed.")
+
+    def batch_forward(
+        self,
+        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        max_new_tokens=8,
+    ):
+        """
+        testing serving by sending all prompts once
+        only return output strings and no logprobs
+        """
+        if self.is_generation:
+            return self.batch_forward_generation_raw(
+                engine=self.engine,
+                prompts=prompts,
+                max_new_tokens=max_new_tokens,
+            )
+        else:
+            raise ValueError("Embedding, rerank, and reward model runners are removed.")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.engine.shutdown()
+        del self.engine
+
+    @staticmethod
+    def forward_generation_raw(
+        engine: Engine,
+        prompts: Union[List[str], List[torch.Tensor]],
+        max_new_tokens: int = 8,
+        lora_paths: Optional[List[str]] = None,
+        logprob_start_len: int = 0,
+        top_k: Optional[int] = None,
+        token_ids_logprob: Optional[List[int]] = None,
+    ):
+        # the return value contains logprobs from prefill
+        output_strs = []
+        output_ids = []
+        # Input logprobs. Note that the last item in input logprob is equivalent to
+        # the first item in the output logprob.
+        top_input_logprobs = []
+        input_token_logprobs_lst = []
+        top_output_logprobs = []
+        output_token_logprobs_lst = []
+        top_output_logprob_idx = []
+        if token_ids_logprob is not None:
+            token_ids_input_logprobs = []
+            token_ids_output_logprobs = []
+        else:
+            token_ids_input_logprobs = token_ids_output_logprobs = None
+
+        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+        if top_k:
+            sampling_params["top_k"] = top_k
+
+        for i, prompt in enumerate(prompts):
+            response = engine.generate(
+                prompt,
+                sampling_params=sampling_params,
+                return_logprob=True,
+                logprob_start_len=logprob_start_len,
+                top_logprobs_num=NUM_TOP_LOGPROBS,
+                token_ids_logprob=token_ids_logprob,
+            )
+            text = response["text"]
+
+            # Check if the text is empty or only whitespace.
+            if not text.strip():
+                raise ValueError(
+                    "Received an empty text response. Please verify your input or model configuration."
+                )
+            output_strs.append(text)
+            output_ids.append(response["output_ids"])
+
+            input_token_logprobs = response["meta_info"]["input_token_logprobs"]
+            output_token_logprobs = response["meta_info"]["output_token_logprobs"]
+            # print(i, input_token_logprobs)
+            # print(i, output_token_logprobs)
+            logprobs = response["meta_info"]["input_top_logprobs"]
+            if token_ids_logprob is not None:
+                input_token_ids_logprobs = response["meta_info"][
+                    "input_token_ids_logprobs"
+                ][1:]
+            else:
+                input_token_ids_logprobs = None
+
+            num_prompt_tokens = response["meta_info"]["prompt_tokens"]
+            # assert len(input_token_logprobs) == num_prompt_tokens - logprob_start_len
+            assert len(logprobs) == num_prompt_tokens - logprob_start_len
+
+            # The first token logprob has no meaning in tokenspeed.
+            input_token_logprobs = input_token_logprobs[1:]
+            logprobs = logprobs[1:]
+            assert len(input_token_logprobs) == len(logprobs)
+
+            input_token_logprobs_lst.append(
+                input_token_logprobs + [output_token_logprobs[0]]
+            )
+            output_token_logprobs_lst.append(output_token_logprobs)
+
+            top_input_logprobs.append(
+                [[tup[0] for tup in x[:NUM_TOP_LOGPROBS]] for x in logprobs]
+                + [
+                    [
+                        tup[0]
+                        for tup in response["meta_info"]["output_top_logprobs"][0][
+                            :NUM_TOP_LOGPROBS
+                        ]
+                    ]
+                ]
+            )
+            top_output_logprobs.append(
+                [
+                    [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+                    for x in response["meta_info"]["output_top_logprobs"]
+                ]
+            )
+            top_output_logprob_idx.append(
+                [
+                    [tup[1] for tup in x[:NUM_TOP_LOGPROBS]]
+                    for x in response["meta_info"]["output_top_logprobs"]
+                ]
+            )
+            if token_ids_logprob is not None:
+                token_ids_input_logprobs.append(
+                    [[tup[0] for tup in x] for x in input_token_ids_logprobs]
+                    + [
+                        [
+                            tup[0]
+                            for tup in response["meta_info"][
+                                "output_token_ids_logprobs"
+                            ][0]
+                        ]
+                    ]
+                )
+                token_ids_output_logprobs.append(
+                    [
+                        [tup[0] for tup in x]
+                        for x in response["meta_info"]["output_token_ids_logprobs"]
+                    ]
+                )
+
+        return ModelOutput(
+            output_strs=output_strs,
+            output_ids=output_ids,
+            top_input_logprobs=top_input_logprobs,
+            top_output_logprobs=top_output_logprobs,
+            input_token_logprobs_lst=input_token_logprobs_lst,
+            output_token_logprobs_lst=output_token_logprobs_lst,
+            top_output_logprob_idx=top_output_logprob_idx,
+            token_ids_input_logprobs=token_ids_input_logprobs,
+            token_ids_output_logprobs=token_ids_output_logprobs,
+        )
+
+    @staticmethod
+    def batch_forward_generation_raw(
+        prompts: Union[List[str], List[torch.Tensor]],
+        max_new_tokens,
+        engine,
+    ):
+        # the return value contains logprobs from prefill
+        output_strs = []
+        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+        response = engine.generate(
+            prompts,
+            sampling_params=sampling_params,
+        )
+        output_strs = [r["text"] for r in response]
+
+        return ModelOutput(
+            output_strs=output_strs,
+        )
+
+
+def monkey_patch_gemma2_sdpa():
+    """
+    Use sdpa by default to fix the OOM issue.
+    Revert this commit:
+    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
+    """
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
+
+    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
+        config._attn_implementation = "sdpa"
+        return config
+
+    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
+
+
+def check_close_model_outputs(
+    hf_outputs: ModelOutput,
+    rt_outputs: ModelOutput,
+    prefill_tolerance: float,
+    decode_tolerance: float,
+    rouge_l_tolerance: float,
+    debug_text: str = "",
+    check_logprobs: bool = True,
+    extra_references: Optional[List[List[str]]] = None,
+):
+    # Compare output strings
+    print(f"{hf_outputs.output_strs=}")
+    print(f"{rt_outputs.output_strs=}")
+    base_scores = calculate_rouge_l(hf_outputs.output_strs, rt_outputs.output_strs)
+    if extra_references:
+        rouge_l_scores = [
+            max(
+                base,
+                *(
+                    calculate_rouge_l([ref[i]], [rt_outputs.output_strs[i]])[0]
+                    for ref in extra_references
+                ),
+            )
+            for i, base in enumerate(base_scores)
+        ]
+    else:
+        rouge_l_scores = base_scores
+    print(f"{rouge_l_scores=}")
+    assert all(
+        score >= rouge_l_tolerance for score in rouge_l_scores
+    ), f"Not all ROUGE-L scores are greater than rouge_l_tolerance={rouge_l_tolerance}"
+
+    if check_logprobs:
+        for i in range(len(hf_outputs.output_strs)):
+            # Compare input logprobs
+            hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
+            srt_logprobs = torch.Tensor(rt_outputs.top_input_logprobs[i])
+            input_len = hf_logprobs.shape[0]
+            print(
+                "prefill logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))
+            )
+            if input_len <= 100:
+                assert torch.all(abs(hf_logprobs - srt_logprobs) < prefill_tolerance), (
+                    f"prefill logprobs are not all close with {debug_text} "
+                    f"prefill_tolerance={prefill_tolerance}."
+                    f"{hf_logprobs=}, {srt_logprobs=}"
+                )
+
+            # Compare output logprobs
+            hf_logprobs = torch.Tensor(hf_outputs.top_output_logprobs[i])
+            srt_logprobs = torch.Tensor(rt_outputs.top_output_logprobs[i])
+
+            print(
+                "decode logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))
+            )
+            if input_len <= 100:
+                assert torch.all(abs(hf_logprobs - srt_logprobs) < decode_tolerance), (
+                    f"decode logprobs are not all close with {debug_text} "
+                    f"decode_tolerance={decode_tolerance}."
+                    f"{hf_logprobs=}, {srt_logprobs=}"
+                )

From 89655b64fd68aa955d414c9f7a95c09f4d1de17d Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:30:01 +0000
Subject: [PATCH 09/19] chore: revert _triton.py; remove unused fused kernel
 imports

- tokenspeed_kernel/_triton.py: restored to upstream (no modifications)
- moe_lora.py: remove unused imports of fused_a_b_down_expand and
  fused_shared_a_b_gate_up_expand (experimental kernels not in hot path)

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 python/tokenspeed/runtime/lora/moe_lora.py           |  2 --
 .../python/tokenspeed_kernel/_triton.py              | 12 +++++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/tokenspeed/runtime/lora/moe_lora.py b/python/tokenspeed/runtime/lora/moe_lora.py
index dff003779..5237076b3 100644
--- a/python/tokenspeed/runtime/lora/moe_lora.py
+++ b/python/tokenspeed/runtime/lora/moe_lora.py
@@ -29,8 +29,6 @@
 
 try:
     from tokenspeed_kernel.ops.moe_lora import (
-        fused_a_b_down_expand,
-        fused_shared_a_b_gate_up_expand,
         gate_up_b_expand,
         per_expert_a_shrink,
         per_expert_b_down_expand,
diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py b/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py
index bde21c902..0cc787352 100644
--- a/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/_triton.py
@@ -29,18 +29,16 @@
 import sys
 
 import tokenspeed_triton as triton
+import tokenspeed_triton.experimental.gluon.language as gl
+import tokenspeed_triton.profiler as proton
 from tokenspeed_triton import language as tl
+from tokenspeed_triton.experimental import gluon
 from tokenspeed_triton.tools.tensor_descriptor import TensorDescriptor
 
-try:
-    import tokenspeed_triton.profiler as proton
-except ModuleNotFoundError as exc:
-    if exc.name != "tokenspeed_triton.profiler":
-        raise
-    proton = None
-
 __all__ = [
     "TensorDescriptor",
+    "gl",
+    "gluon",
     "proton",
     "redirect_triton_to_tokenspeed_triton",
     "tl",

From d6e442bbb716003987ce168b06e62466d4fafc6d Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:30:41 +0000
Subject: [PATCH 10/19] chore: revert tokenspeed_kernel/__init__.py to upstream

Lazy-import refactor is unrelated to this LoRA PR.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../python/tokenspeed_kernel/__init__.py      | 62 +++++++------------
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py b/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py
index 718776f31..1e2eb8405 100644
--- a/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/__init__.py
@@ -22,6 +22,29 @@
 
 bootstrap_profiling_from_env()
 
+from tokenspeed_kernel.ops.attention import (
+    mha_decode_scheduler_metadata,
+    mha_decode_with_kvcache,
+    mha_extend_with_kvcache,
+    mha_merge_state,
+    mha_prefill,
+)
+from tokenspeed_kernel.ops.gemm import mm
+from tokenspeed_kernel.ops.moe import (
+    moe_combine,
+    moe_dispatch,
+    moe_experts,
+    moe_fused,
+    moe_route,
+)
+from tokenspeed_kernel.ops.quantization import (
+    quantize_fp8,
+    quantize_fp8_with_scale,
+    quantize_mxfp4,
+    quantize_mxfp8,
+    quantize_nvfp4,
+)
+
 __all__ = [
     # gemm
     "mm",
@@ -44,42 +67,3 @@
     "quantize_nvfp4",
     "quantize_mxfp4",
 ]
-
-
-def __getattr__(name: str):
-    if name == "mm":
-        from tokenspeed_kernel.ops.gemm import mm
-
-        return mm
-    if name in {"moe_route", "moe_dispatch", "moe_experts", "moe_combine", "moe_fused"}:
-        from tokenspeed_kernel.ops import moe
-
-        return getattr(moe, name)
-    if name in {
-        "mha_prefill",
-        "mha_extend_with_kvcache",
-        "mha_prefill_with_kvcache",  # legacy alias
-        "mha_decode_with_kvcache",
-        "mha_merge_state",
-        "mha_decode_scheduler_metadata",
-    }:
-        from tokenspeed_kernel.ops import attention
-
-        return getattr(attention, name)
-    if name in {
-        "quantize_fp8",
-        "quantize_fp8_with_scale",
-        "quantize_mxfp8",
-        "quantize_nvfp4",
-        "quantize_mxfp4",
-    }:
-        from tokenspeed_kernel.ops.quantization import (
-            quantize_fp8,
-            quantize_fp8_with_scale,
-            quantize_mxfp4,
-            quantize_mxfp8,
-            quantize_nvfp4,
-        )
-
-        return locals()[name]
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

From 3ad51aaca9f287b3019287ee1f269e7e4ba82982 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:31:34 +0000
Subject: [PATCH 11/19] chore: revert attention/__init__.py to upstream

HIP/ROCm gluon conditional import change is unrelated to this LoRA PR.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../python/tokenspeed_kernel/ops/attention/__init__.py        | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py b/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py
index 81b97a461..5d21a41e9 100644
--- a/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py
+++ b/tokenspeed-kernel/python/tokenspeed_kernel/ops/attention/__init__.py
@@ -26,15 +26,13 @@
 import tokenspeed_kernel.ops.attention.cuda  # noqa: F401
 import tokenspeed_kernel.ops.attention.flash_attn  # noqa: F401
 import tokenspeed_kernel.ops.attention.flashinfer  # noqa: F401
+import tokenspeed_kernel.ops.attention.gluon  # noqa: F401
 import tokenspeed_kernel.ops.attention.triton  # noqa: F401
 import torch
 from tokenspeed_kernel.ops.attention.flash_attn import mha_decode_scheduler_metadata
 from tokenspeed_kernel.profiling import ShapeCapture, kernel_scope
 from tokenspeed_kernel.selection import select_kernel
 
-if getattr(torch.version, "hip", None):
-    import tokenspeed_kernel.ops.attention.gluon  # noqa: F401
-
 AttentionResult = torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]
 
 __all__ = [

From d4cbc00e63ea63ac572fe12d5e81a3f16ae69ebf Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:33:53 +0000
Subject: [PATCH 12/19] chore: revert tokenspeed_scheduler exports; move kernel
 LoRA test

- tokenspeed_scheduler/__init__.py: restore PagedCacheGroupFamily and
  PrefixCacheAdjunctSpec exports (both are bound in python_module.cpp;
  our branch incorrectly removed them)
- tokenspeed-kernel/test/ops/test_lora_triton.py: move to qywu/lora-dev
  (LoRA test missed in previous sweep)

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../test/ops/test_lora_triton.py              | 122 ------------------
 .../python/tokenspeed_scheduler/__init__.py   |   4 +
 2 files changed, 4 insertions(+), 122 deletions(-)
 delete mode 100644 tokenspeed-kernel/test/ops/test_lora_triton.py

diff --git a/tokenspeed-kernel/test/ops/test_lora_triton.py b/tokenspeed-kernel/test/ops/test_lora_triton.py
deleted file mode 100644
index 67bd234a3..000000000
--- a/tokenspeed-kernel/test/ops/test_lora_triton.py
+++ /dev/null
@@ -1,122 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-import pytest
-import torch
-
-
-@dataclass
-class BatchInfo:
-    bs: int
-    max_len: int
-    seg_lens: torch.Tensor
-    seg_indptr: torch.Tensor
-    weight_indices: torch.Tensor
-    lora_ranks: torch.Tensor
-    scalings: torch.Tensor
-    permutation: torch.Tensor | None = None
-
-
-def _decode_batch(batch_size: int, rank: int, device: str) -> BatchInfo:
-    return BatchInfo(
-        bs=batch_size,
-        max_len=1,
-        seg_lens=torch.ones((batch_size,), dtype=torch.int32, device=device),
-        seg_indptr=torch.arange(batch_size + 1, dtype=torch.int32, device=device),
-        weight_indices=torch.ones((batch_size,), dtype=torch.int32, device=device),
-        lora_ranks=torch.tensor([0, rank], dtype=torch.int32, device=device),
-        scalings=torch.ones((2,), dtype=torch.float32, device=device),
-    )
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
-def test_lora_expand_decode_rank_smaller_than_block_k_matches_reference():
-    from tokenspeed_kernel.ops.lora.triton.lora_expand import lora_expand_fwd
-
-    device = "cuda"
-    dtype = torch.bfloat16
-    batch_size = 4
-    rank = 8
-    out_dim = 64
-    torch.manual_seed(7)
-    batch_info = _decode_batch(batch_size, rank, device)
-    x = torch.randn((batch_size, rank), dtype=dtype, device=device)
-    weights = torch.randn((2, out_dim, rank), dtype=dtype, device=device)
-    base = torch.randn((batch_size, out_dim), dtype=dtype, device=device)
-
-    out = lora_expand_fwd(x, weights, batch_info, base_output=base.clone())
-    ref = base.float() + x.float() @ weights[1].float().T
-    torch.testing.assert_close(out.float(), ref, rtol=2e-2, atol=2e-1)
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
-def test_lora_gate_up_decode_rank_smaller_than_block_k_matches_reference():
-    from tokenspeed_kernel.ops.lora.triton.lora_gate_up_expand import (
-        lora_gate_up_expand_fwd,
-    )
-
-    device = "cuda"
-    dtype = torch.bfloat16
-    batch_size = 4
-    rank = 8
-    out_dim = 64
-    torch.manual_seed(8)
-    batch_info = _decode_batch(batch_size, rank, device)
-    x = torch.randn((batch_size, 2 * rank), dtype=dtype, device=device)
-    weights = torch.randn((2, 2 * out_dim, rank), dtype=dtype, device=device)
-    base = torch.randn((batch_size, 2 * out_dim), dtype=dtype, device=device)
-
-    out = lora_gate_up_expand_fwd(
-        x,
-        weights,
-        batch_info,
-        out_dim,
-        base_output=base.clone(),
-    )
-    ref = base.float()
-    ref[:, :out_dim] += x[:, :rank].float() @ weights[1, :out_dim].float().T
-    ref[:, out_dim:] += (
-        x[:, rank : 2 * rank].float() @ weights[1, out_dim : 2 * out_dim].float().T
-    )
-    torch.testing.assert_close(out.float(), ref, rtol=2e-2, atol=2e-1)
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
-def test_lora_qkv_decode_rank_smaller_than_block_k_matches_reference():
-    from tokenspeed_kernel.ops.lora.triton.lora_qkv_expand import lora_qkv_expand_fwd
-
-    device = "cuda"
-    dtype = torch.bfloat16
-    batch_size = 4
-    rank = 8
-    q_dim = 64
-    kv_dim = 32
-    torch.manual_seed(9)
-    batch_info = _decode_batch(batch_size, rank, device)
-    x = torch.randn((batch_size, 3 * rank), dtype=dtype, device=device)
-    weights = torch.randn((2, q_dim + 2 * kv_dim, rank), dtype=dtype, device=device)
-    base = torch.randn((batch_size, q_dim + 2 * kv_dim), dtype=dtype, device=device)
-    offsets = torch.tensor(
-        [0, q_dim, q_dim + kv_dim, q_dim + 2 * kv_dim],
-        dtype=torch.int32,
-        device=device,
-    )
-
-    out = lora_qkv_expand_fwd(
-        x,
-        weights,
-        batch_info,
-        offsets,
-        q_dim,
-        base_output=base.clone(),
-    )
-    ref = base.float()
-    ref[:, :q_dim] += x[:, :rank].float() @ weights[1, :q_dim].float().T
-    ref[:, q_dim : q_dim + kv_dim] += (
-        x[:, rank : 2 * rank].float() @ weights[1, q_dim : q_dim + kv_dim].float().T
-    )
-    ref[:, q_dim + kv_dim :] += (
-        x[:, 2 * rank : 3 * rank].float() @ weights[1, q_dim + kv_dim :].float().T
-    )
-    torch.testing.assert_close(out.float(), ref, rtol=2e-2, atol=2e-1)
diff --git a/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py b/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
index dc87ada88..f2070be85 100644
--- a/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
+++ b/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
@@ -27,8 +27,10 @@
     ExecutionPlan,
     PagedCacheGroupAllocator,
     PagedCacheGroupConfig,
+    PagedCacheGroupFamily,
     PagedCacheGroupTable,
     PagedCacheRetention,
+    PrefixCacheAdjunctSpec,
     RequestSpec,
     Scheduler,
     SchedulerConfig,
@@ -71,7 +73,9 @@ def _flat_forward_op_repr(self):
     "PagedCacheRetention",
     "PagedCacheGroupConfig",
     "PagedCacheGroupAllocator",
+    "PagedCacheGroupFamily",
     "PagedCacheGroupTable",
+    "PrefixCacheAdjunctSpec",
     # Execution plan & operations
     "ExecutionPlan",
     "Forward",

From 112751720063fa92dae3958fb77b71581215035a Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 03:40:38 +0000
Subject: [PATCH 13/19] fix(scheduler): remove
 PagedCacheGroupFamily/PrefixCacheAdjunctSpec from Python exports

The pre-installed tokenspeed_scheduler binary in CI was built before
these types were added to the C++ extension, so importing them from
the .so raises ImportError. Remove from __init__ until the installed
binary is updated.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py b/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
index f2070be85..dc87ada88 100644
--- a/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
+++ b/tokenspeed-scheduler/python/tokenspeed_scheduler/__init__.py
@@ -27,10 +27,8 @@
     ExecutionPlan,
     PagedCacheGroupAllocator,
     PagedCacheGroupConfig,
-    PagedCacheGroupFamily,
     PagedCacheGroupTable,
     PagedCacheRetention,
-    PrefixCacheAdjunctSpec,
     RequestSpec,
     Scheduler,
     SchedulerConfig,
@@ -73,9 +71,7 @@ def _flat_forward_op_repr(self):
     "PagedCacheRetention",
     "PagedCacheGroupConfig",
     "PagedCacheGroupAllocator",
-    "PagedCacheGroupFamily",
     "PagedCacheGroupTable",
-    "PrefixCacheAdjunctSpec",
     # Execution plan & operations
     "ExecutionPlan",
     "Forward",

From 46f20d43f123d5415161ffa1f35a409c8f472ce3 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 04:21:30 +0000
Subject: [PATCH 14/19] fix(lora): two-phase prepare_loras to prevent silent
 wrong-output bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The single-pass approach had a correctness bug: when a batch required
more adapters than could be evicted without touching batch adapters,
_find_free_slot would evict an adapter that was already assigned a slot
in per_request_slots. Those requests would then receive NO_LORA_SLOT and
silently run as the base model — wrong outputs with no error.

Fix with a two-phase approach:

Phase 1 — promote all unique adapters upfront:
  - Early check: if n_unique > max_loras, raise RuntimeError immediately
    instead of producing wrong outputs silently.
  - Call _ensure_in_gpu for all batch adapters before assigning any slot.
  - After each promotion, move_to_end (MRU) to prevent a subsequent
    iteration from evicting an already-promoted batch adapter that
    happens to be LRU in _gpu_lru.
  - LRU eviction during this phase only targets adapters NOT in the batch.

Phase 2 — assign per_request_slots from the stable _name_to_slot map:
  - All needed adapters are already on GPU; no evictions occur.
  - Use _name_to_slot[name] directly (guaranteed present after phase 1).

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../tokenspeed/runtime/lora/lora_manager.py   | 36 +++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/python/tokenspeed/runtime/lora/lora_manager.py b/python/tokenspeed/runtime/lora/lora_manager.py
index 00992f172..92dfcac12 100644
--- a/python/tokenspeed/runtime/lora/lora_manager.py
+++ b/python/tokenspeed/runtime/lora/lora_manager.py
@@ -466,7 +466,39 @@ def prepare_loras(
         against the same pointers.
         """
         bs = len(lora_ids)
-        # Resolve names → slots; LRU bookkeeping.
+
+        # Phase 1: resolve all unique adapters and promote them to GPU before
+        # assigning any per-request slots.  A single-pass approach would silently
+        # produce wrong outputs: if the batch needs more adapters than max_loras,
+        # _find_free_slot evicts an already-assigned adapter (e.g. A), then the
+        # later request for A gets NO_LORA_SLOT and runs as the base model.
+        unique_names: dict[int, str] = {}
+        for lid in lora_ids:
+            if lid == 0 or lid in unique_names:
+                continue
+            name = self._id_to_name.get(lid)
+            if name is not None:
+                unique_names[lid] = name
+
+        n_unique = len(unique_names)
+        if n_unique > self.max_loras:
+            raise RuntimeError(
+                f"Batch requires {n_unique} unique LoRA adapters but "
+                f"max_loras={self.max_loras}. Reduce adapter diversity per "
+                "batch (use pack scheduling) or increase max_loras."
+            )
+
+        # Promote all needed adapters before touching per_request_slots so that
+        # LRU eviction only targets adapters NOT in this batch.  After each
+        # promotion, move the adapter to MRU so subsequent promotions within
+        # this loop don't evict an already-promoted or already-resident batch
+        # adapter (which would be LRU if it was loaded in a previous step).
+        for name in unique_names.values():
+            self._ensure_in_gpu(name)
+            self._gpu_lru.move_to_end(name)  # protect from intra-phase eviction
+
+        # Phase 2: assign per-request slots from the now-stable _name_to_slot
+        # map (no further evictions occur here).
         per_request_slots: list[int] = []
         for lid in lora_ids:
             if lid == 0:
@@ -477,7 +509,7 @@ def prepare_loras(
                 logger.warning("Unknown lora_id %d; treating as base model.", lid)
                 per_request_slots.append(NO_LORA_SLOT)
                 continue
-            slot = self._ensure_in_gpu(name)
+            slot = self._name_to_slot[name]  # guaranteed present after phase 1
             per_request_slots.append(slot)
             self._gpu_lru.move_to_end(name)
 

From bc7b4ff2fa51bad07da7bc440d2a44dd37fb9fdb Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 04:30:27 +0000
Subject: [PATCH 15/19] fix(lora): defer GPU weight eviction on mid-decode
 unload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When unload_adapter() is called while an adapter is still potentially
in-flight (used in the most recent prepare_loras batch), zeroing the
GPU slot immediately causes ongoing decode steps to produce wrong
outputs (zero LoRA delta = silent base-model behaviour).

Fix with a two-field deferred eviction mechanism:
  _active_names  — adapters used in the most recent prepare_loras call
  _pending_eviction — names queued for eviction when no longer active

unload_adapter():
  - Removes identity mappings immediately (blocks new requests)
  - If adapter is in _active_names: adds to _pending_eviction + warning,
    keeps CPU weights alive so retracted requests can still reload
  - If adapter is not active: evicts GPU slot and CPU weights immediately

prepare_loras() (at the top of phase 1):
  - Previous forward step is complete at this point
  - Flushes _pending_eviction for adapters not in the current batch
  - Updates _active_names to the current batch's unique adapter names

This also preserves correctness for retracted requests: if the scheduler
pauses a decode and later resumes it, _ensure_in_gpu reloads the weights
from the CPU copy, which is kept alive until the deferred eviction fires.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../tokenspeed/runtime/lora/lora_manager.py   | 50 +++++++++++++++++--
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/python/tokenspeed/runtime/lora/lora_manager.py b/python/tokenspeed/runtime/lora/lora_manager.py
index 92dfcac12..198bf15c0 100644
--- a/python/tokenspeed/runtime/lora/lora_manager.py
+++ b/python/tokenspeed/runtime/lora/lora_manager.py
@@ -227,6 +227,12 @@ def __init__(
         self._name_to_slot: dict[str, int] = {}
         self._gpu_lru: OrderedDict[str, None] = OrderedDict()  # alias of _lru
 
+        # Mid-decode unload safety: track which adapter names were active in
+        # the most recent prepare_loras call, and defer GPU weight eviction for
+        # any adapter that is unloaded while still in use.
+        self._active_names: set[str] = set()
+        self._pending_eviction: set[str] = set()  # names to evict when safe
+
         # ── Tier 2: pinned CPU pool ─────────────────────────────────────
         # ``_cpu_cache[name]`` holds parsed weights in pinned host memory.
         # ``_cpu_lru`` tracks LRU order for CPU eviction back to disk.  An
@@ -444,11 +450,32 @@ def load_adapter(self, name: str, path: str) -> int:
     def unload_adapter(self, name: str) -> None:
         if name not in self._name_to_id:
             raise KeyError(f"Adapter '{name}' is not loaded.")
-        self._evict_by_name(name)
-        self._cpu_store.remove(name)
+
+        # Remove from identity tables immediately so no NEW requests are assigned
+        # to this adapter.  GPU weight eviction may be deferred (see below).
         lora_id = self._name_to_id.pop(name)
         del self._id_to_name[lora_id]
-        logger.info("Unloaded adapter '%s'", name)
+
+        if name in self._active_names:
+            # The adapter was used in the most recent forward step; its GPU
+            # weights may still be needed if the scheduler is mid-decode for
+            # one of those requests.  Defer the weight zeroing until the next
+            # prepare_loras confirms the adapter is no longer in the batch.
+            logger.warning(
+                "Adapter '%s' (lora_id=%d) unloaded while potentially in-flight; "
+                "GPU slot eviction deferred until next batch that does not use it.",
+                name,
+                lora_id,
+            )
+            self._pending_eviction.add(name)
+            # Keep CPU weights alive so retracted requests can still reload.
+            # The CPU entry is removed when the deferred eviction fires.
+        else:
+            # Safe to evict immediately — not active in the current batch.
+            self._evict_by_name(name)
+            self._cpu_store.remove(name)
+
+        logger.info("Unloaded adapter '%s' (lora_id=%d)", name, lora_id)
 
     def get_id(self, name: str) -> int | None:
         return self._name_to_id.get(name)
@@ -488,6 +515,23 @@ def prepare_loras(
                 "batch (use pack scheduling) or increase max_loras."
             )
 
+        # The previous forward step is now complete (prepare_loras is called
+        # synchronously before each forward).  Flush any deferred evictions for
+        # adapters that are NOT needed by the current batch.
+        current_batch_names = set(unique_names.values())
+        for pending_name in list(self._pending_eviction):
+            if pending_name not in current_batch_names:
+                logger.info(
+                    "Deferred eviction: removing adapter '%s' GPU weights now.",
+                    pending_name,
+                )
+                self._evict_by_name(pending_name)
+                self._cpu_store.remove(pending_name)
+                self._pending_eviction.discard(pending_name)
+
+        # Track which adapters are active in this batch for mid-decode unload safety.
+        self._active_names = current_batch_names
+
         # Promote all needed adapters before touching per_request_slots so that
         # LRU eviction only targets adapters NOT in this batch.  After each
         # promotion, move the adapter to MRU so subsequent promotions within

From 9bd96b8218481ffd908ebde08cc7a485c2225168 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 04:32:17 +0000
Subject: [PATCH 16/19] fix(lora): add flush_pending_evictions for explicit
 slot reclaim

When unload_adapter() defers GPU eviction (mid-decode safety), the slot
stays occupied until a batch without that adapter arrives.  If the server
goes idle with no further batches, the slot is never freed.

Add flush_pending_evictions() that immediately zeroes all deferred slots.
Call this when the server is confirmed idle (no in-flight requests) to
reclaim GPU capacity.  Calling it mid-decode has the same unsafe
semantics as the original immediate eviction, so the caller must ensure
quiescence first.

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 python/tokenspeed/runtime/lora/lora_manager.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/python/tokenspeed/runtime/lora/lora_manager.py b/python/tokenspeed/runtime/lora/lora_manager.py
index 198bf15c0..510836a14 100644
--- a/python/tokenspeed/runtime/lora/lora_manager.py
+++ b/python/tokenspeed/runtime/lora/lora_manager.py
@@ -480,6 +480,21 @@ def unload_adapter(self, name: str) -> None:
     def get_id(self, name: str) -> int | None:
         return self._name_to_id.get(name)
 
+    def flush_pending_evictions(self) -> None:
+        """Evict all deferred adapter weights immediately, regardless of active state.
+
+        Call this when the server is idle (no in-flight requests) to reclaim GPU
+        slots that were deferred by unload_adapter() calls during active decodes.
+        It is safe to call at any time; if called mid-decode the behaviour is the
+        same as the original unload_adapter (slot zeroed while in-flight), so
+        only call this when you are certain no requests are running.
+        """
+        for name in list(self._pending_eviction):
+            logger.info("Flushing deferred eviction for adapter '%s'.", name)
+            self._evict_by_name(name)
+            self._cpu_store.remove(name)
+            self._pending_eviction.discard(name)
+
     def prepare_loras(
         self,
         lora_ids: list[int],

From c645b02be1b151bae154f0635aa562590b1d610e Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 04:37:11 +0000
Subject: [PATCH 17/19] fix(lora): harden deferred eviction against
 re-registration and retracted-request failures

Three bugs in the initial deferred eviction design:

1. _id_to_name cleared too early: unload_adapter deleted _id_to_name[lora_id]
   immediately, so retracted requests that resume later saw lora_id=None and
   silently ran as base model. Fix: keep _id_to_name alive until _flush_one_pending.

2. Re-registration overwrites pending eviction slot: if the same adapter name
   is reloaded before the pending eviction fires, _evict_by_name("A") would zero
   the NEW adapter's slot. Fix: _pending_eviction now stores (name, lora_id)
   tuples; _flush_one_pending skips GPU eviction if _name_to_id[name] exists
   (name was re-registered with a new id).

3. Double-eviction safety: LRU pressure may evict the GPU slot before the
   deferred flush fires. _evict_by_name is already idempotent so this is safe,
   but _flush_one_pending now explicitly handles the case (no-op if slot gone).

Add _flush_one_pending(name, lora_id) as the canonical flush helper, used by
both flush_pending_evictions() and the per-step flush in prepare_loras().

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../tokenspeed/runtime/lora/lora_manager.py   | 69 ++++++++++++++-----
 1 file changed, 51 insertions(+), 18 deletions(-)

diff --git a/python/tokenspeed/runtime/lora/lora_manager.py b/python/tokenspeed/runtime/lora/lora_manager.py
index 510836a14..9a95db703 100644
--- a/python/tokenspeed/runtime/lora/lora_manager.py
+++ b/python/tokenspeed/runtime/lora/lora_manager.py
@@ -231,7 +231,9 @@ def __init__(
         # the most recent prepare_loras call, and defer GPU weight eviction for
         # any adapter that is unloaded while still in use.
         self._active_names: set[str] = set()
-        self._pending_eviction: set[str] = set()  # names to evict when safe
+        self._pending_eviction: set[tuple[str, int]] = (
+            set()
+        )  # (name, lora_id) to evict when safe
 
         # ── Tier 2: pinned CPU pool ─────────────────────────────────────
         # ``_cpu_cache[name]`` holds parsed weights in pinned host memory.
@@ -451,27 +453,31 @@ def unload_adapter(self, name: str) -> None:
         if name not in self._name_to_id:
             raise KeyError(f"Adapter '{name}' is not loaded.")
 
-        # Remove from identity tables immediately so no NEW requests are assigned
-        # to this adapter.  GPU weight eviction may be deferred (see below).
         lora_id = self._name_to_id.pop(name)
-        del self._id_to_name[lora_id]
+        # Keep _id_to_name[lora_id] alive until eviction fires so retracted
+        # requests that resume with this lora_id can still be recognised and
+        # get the correct weights rather than silently falling back to the base
+        # model.  It is cleared when the GPU slot is finally freed below.
 
         if name in self._active_names:
             # The adapter was used in the most recent forward step; its GPU
-            # weights may still be needed if the scheduler is mid-decode for
-            # one of those requests.  Defer the weight zeroing until the next
-            # prepare_loras confirms the adapter is no longer in the batch.
+            # weights may still be needed if the scheduler is mid-decode or has
+            # retracted a request that it will later reschedule.  Defer the
+            # weight zeroing until a batch arrives that does not include this
+            # adapter — at that point the previous step is confirmed complete.
             logger.warning(
                 "Adapter '%s' (lora_id=%d) unloaded while potentially in-flight; "
                 "GPU slot eviction deferred until next batch that does not use it.",
                 name,
                 lora_id,
             )
-            self._pending_eviction.add(name)
-            # Keep CPU weights alive so retracted requests can still reload.
-            # The CPU entry is removed when the deferred eviction fires.
+            # Store (name, lora_id) so the flush can distinguish this entry from
+            # a same-name re-registration that might occur before flushing.
+            self._pending_eviction.add((name, lora_id))
+            # CPU weights kept alive so retracted requests can still reload.
         else:
             # Safe to evict immediately — not active in the current batch.
+            del self._id_to_name[lora_id]
             self._evict_by_name(name)
             self._cpu_store.remove(name)
 
@@ -480,6 +486,35 @@ def unload_adapter(self, name: str) -> None:
     def get_id(self, name: str) -> int | None:
         return self._name_to_id.get(name)
 
+    def _flush_one_pending(self, name: str, lora_id: int) -> None:
+        """Carry out the deferred GPU+CPU eviction for one (name, lora_id) entry.
+
+        Guards against two edge cases:
+        - Re-registration: the same name was re-loaded after unload; the new
+          slot should NOT be zeroed.  Detected by checking that _id_to_name
+          still maps lora_id → name (the old entry, kept alive for retracted
+          requests) and that the current name→id mapping no longer exists.
+        - Already-evicted slot: LRU pressure may have freed the GPU slot
+          before the deferred flush fires; _evict_by_name is idempotent.
+        """
+        # If the same name was re-registered, _name_to_id[name] exists again
+        # with a NEW lora_id.  Skip GPU eviction — the slot now belongs to the
+        # new adapter.  The CPU copy for the OLD weights was already removed or
+        # never loaded under the new id.
+        if self._name_to_id.get(name) is not None:
+            # Name was re-registered; clear the stale _id_to_name entry for the
+            # old lora_id only if it still points to this name.
+            if self._id_to_name.get(lora_id) == name:
+                del self._id_to_name[lora_id]
+            return
+
+        # Clear the reverse mapping kept alive for retracted-request safety.
+        if self._id_to_name.get(lora_id) == name:
+            del self._id_to_name[lora_id]
+
+        self._evict_by_name(name)  # idempotent if already LRU-evicted
+        self._cpu_store.remove(name)
+
     def flush_pending_evictions(self) -> None:
         """Evict all deferred adapter weights immediately, regardless of active state.
 
@@ -489,11 +524,10 @@ def flush_pending_evictions(self) -> None:
         same as the original unload_adapter (slot zeroed while in-flight), so
         only call this when you are certain no requests are running.
         """
-        for name in list(self._pending_eviction):
+        for name, lora_id in list(self._pending_eviction):
             logger.info("Flushing deferred eviction for adapter '%s'.", name)
-            self._evict_by_name(name)
-            self._cpu_store.remove(name)
-            self._pending_eviction.discard(name)
+            self._flush_one_pending(name, lora_id)
+            self._pending_eviction.discard((name, lora_id))
 
     def prepare_loras(
         self,
@@ -534,15 +568,14 @@ def prepare_loras(
         # synchronously before each forward).  Flush any deferred evictions for
         # adapters that are NOT needed by the current batch.
         current_batch_names = set(unique_names.values())
-        for pending_name in list(self._pending_eviction):
+        for pending_name, pending_lora_id in list(self._pending_eviction):
             if pending_name not in current_batch_names:
                 logger.info(
                     "Deferred eviction: removing adapter '%s' GPU weights now.",
                     pending_name,
                 )
-                self._evict_by_name(pending_name)
-                self._cpu_store.remove(pending_name)
-                self._pending_eviction.discard(pending_name)
+                self._flush_one_pending(pending_name, pending_lora_id)
+                self._pending_eviction.discard((pending_name, pending_lora_id))
 
         # Track which adapters are active in this batch for mid-decode unload safety.
         self._active_names = current_batch_names

From 6a03eb36ba5c700b9d4a91cd9aa01755bac118b5 Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 04:53:15 +0000
Subject: [PATCH 18/19] fix(lora): remove GPU zeroing from _reset_slot to
 eliminate CUDA stream race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_reset_slot was calling zero_slot (dense) and clear_slot (MoE) which both
issue GPU tensor.zero_() operations — potentially hundreds of kernel launches
per eviction, one per buffer per layer.

More importantly, these GPU zeros have a correctness race:
  graph.replay() runs on a dedicated stream (cuda_graph_wrapper.self.stream)
  tensor.zero_() runs on the default PyTorch CUDA stream
Without explicit inter-stream synchronisation, a GPU zero can race with an
in-flight graph kernel still reading the old weights on the other stream.

The zeros are defensive but not required: prepare_loras assigns
weight_indices[i] only to slots in _name_to_slot.  _evict_by_name removes
the slot from _name_to_slot before _reset_slot runs, so no kernel ever
reads from an evicted slot.  Stale GPU values are overwritten when
_load_to_slot reuses the slot for a new adapter.

Changes:
- _reset_slot: keep CPU metadata zeros (scalings, ranks); skip GPU zeros
- MoeLoraBuffers: add clear_slot_cpu_only() that removes the slot from the
  weights_by_layer dict (needed for the eager non-buffer path) without any
  GPU operations
- flush_pending_evictions: update docstring — now safe to call at any time
  since no GPU operations are involved in the eviction path

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 .../tokenspeed/runtime/lora/lora_manager.py   | 40 ++++++++++++++-----
 python/tokenspeed/runtime/lora/moe_lora.py    | 14 +++++++
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/python/tokenspeed/runtime/lora/lora_manager.py b/python/tokenspeed/runtime/lora/lora_manager.py
index 9a95db703..01c529dfc 100644
--- a/python/tokenspeed/runtime/lora/lora_manager.py
+++ b/python/tokenspeed/runtime/lora/lora_manager.py
@@ -516,13 +516,18 @@ def _flush_one_pending(self, name: str, lora_id: int) -> None:
         self._cpu_store.remove(name)
 
     def flush_pending_evictions(self) -> None:
-        """Evict all deferred adapter weights immediately, regardless of active state.
-
-        Call this when the server is idle (no in-flight requests) to reclaim GPU
-        slots that were deferred by unload_adapter() calls during active decodes.
-        It is safe to call at any time; if called mid-decode the behaviour is the
-        same as the original unload_adapter (slot zeroed while in-flight), so
-        only call this when you are certain no requests are running.
+        """Evict all deferred adapter weights immediately.
+
+        Safe to call at any time, including while the GPU is running a forward
+        pass.  Because _reset_slot no longer issues GPU zero operations, the
+        only side effects are CPU-side: removing the slot from _name_to_slot
+        and cleaning up CPU weight caches.  The GPU memory retains stale values
+        until the slot is reused, but no kernel ever reads from an evicted slot
+        (prepare_loras only assigns weight_indices to slots present in
+        _name_to_slot).
+
+        Call this when the server has no pending decode requests and you want
+        to reclaim GPU slots occupied by deferred-unloaded adapters.
         """
         for name, lora_id in list(self._pending_eviction):
             logger.info("Flushing deferred eviction for adapter '%s'.", name)
@@ -1125,8 +1130,25 @@ def _evict_by_name(self, name: str) -> None:
         self._gpu_lru.pop(name, None)
 
     def _reset_slot(self, slot: int) -> None:
-        self._weight_buffers.zero_slot(slot)
-        self._moe_lora_buffers.clear_slot(slot)
+        # GPU weight tensors are intentionally NOT zeroed here.
+        #
+        # Correctness argument: prepare_loras assigns weight_indices[i] only to
+        # slots present in _name_to_slot.  _evict_by_name removes the slot from
+        # _name_to_slot before calling _reset_slot, so no kernel ever reads from
+        # an evicted slot's GPU memory regardless of what values are there.
+        # _load_to_slot overwrites the stale values when the slot is reused.
+        #
+        # Skipping the GPU zeros removes potentially hundreds of kernel launches
+        # per eviction (one zero_() per buffer per layer) and — more critically —
+        # eliminates a CUDA stream race: graph.replay() uses a dedicated stream
+        # while tensor.zero_() runs on the default stream; without explicit
+        # inter-stream synchronisation, an immediate GPU zero could race with an
+        # in-flight graph kernel still reading the old weights.
+        #
+        # MoE buffers: _moe_lora_buffers.clear_slot does both GPU zeroing AND
+        # CPU dict cleanup (weights_by_layer.pop).  Keep the CPU cleanup, skip
+        # the GPU zeros.
+        self._moe_lora_buffers.clear_slot_cpu_only(slot)
         self._lora_ranks[slot] = 0
         self._slot_ranks[slot] = 0
         self._slot_scalings[slot] = 0.0
diff --git a/python/tokenspeed/runtime/lora/moe_lora.py b/python/tokenspeed/runtime/lora/moe_lora.py
index 5237076b3..9957bfd30 100644
--- a/python/tokenspeed/runtime/lora/moe_lora.py
+++ b/python/tokenspeed/runtime/lora/moe_lora.py
@@ -1301,6 +1301,20 @@ def clear_slot(self, slot: int) -> None:
         for layer_slots in self.weights_by_layer.values():
             layer_slots.pop(slot, None)
 
+    def clear_slot_cpu_only(self, slot: int) -> None:
+        """Remove slot from CPU-side tracking without GPU zeroing.
+
+        The GPU weight tensors for this slot are NOT zeroed.  This is safe
+        because prepare_loras only assigns weight_indices[i] to slots present
+        in _name_to_slot, which is cleared before this method is called.
+        No kernel can read from an evicted slot.  Stale GPU values are
+        overwritten when _load_to_slot reuses the slot for a new adapter.
+        """
+        if not self.enabled:
+            return
+        for layer_slots in self.weights_by_layer.values():
+            layer_slots.pop(slot, None)
+
     def build_context(
         self,
         *,

From 0e2d70d289a5ae8c33e5d1b7c51978f0c570df3b Mon Sep 17 00:00:00 2001
From: Qingyang Wu <willqywu@gmail.com>
Date: Mon, 25 May 2026 05:24:59 +0000
Subject: [PATCH 19/19] feat(scheduler): enforce max_loras adapter cap per
 batch in C++ scheduler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the C++ scheduler was unaware of max_loras and could build
batches requiring more unique LoRA adapter ids than the Python GPU pool
could hold simultaneously.  prepare_loras() then raised RuntimeError,
or worse, silently produced wrong outputs when _find_free_slot evicted
an already-assigned adapter.

Fix: thread max_loras through to the scheduler so the batch-building
loop enforces the cap directly.

Changes:
- scheduler/types.h: add max_loras field (0 = LoRA disabled, no cap)
- scheduler/operations/forward.cpp: track batch_lora_ids (unordered_set)
  in newForwardOperation(); skip any request whose lora_id would push
  the count past max_loras — the request is deferred to the next step
- bindings/python_module.cpp: expose max_loras on SchedulerConfig
- scheduler_utils.py make_config(): add max_loras parameter
- event_loop.py: pass server_args.max_loras (0 when LoRA disabled)

With this change the prepare_loras() RuntimeError for n_unique > max_loras
becomes unreachable in normal operation.  The deferred requests are picked
up in subsequent scheduling rounds, naturally co-scheduling same-adapter
requests (Gap 1 from the Open Gaps doc section).

Signed-off-by: Qingyang Wu <willqywu@gmail.com>
---
 python/tokenspeed/runtime/engine/event_loop.py   |  1 +
 .../tokenspeed/runtime/engine/scheduler_utils.py |  2 ++
 tokenspeed-scheduler/bindings/python_module.cpp  |  3 ++-
 .../csrc/scheduler/operations/forward.cpp        | 16 ++++++++++++++++
 tokenspeed-scheduler/csrc/scheduler/types.h      |  6 ++++++
 5 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/python/tokenspeed/runtime/engine/event_loop.py b/python/tokenspeed/runtime/engine/event_loop.py
index 3b857d079..092e89b03 100644
--- a/python/tokenspeed/runtime/engine/event_loop.py
+++ b/python/tokenspeed/runtime/engine/event_loop.py
@@ -334,6 +334,7 @@ def __init__(
             paged_cache_groups=paged_cache_groups,
             enable_mixed_prefill_decode=enable_mixed_prefill_decode,
             prefix_cache_adjunct=prefix_cache_adjunct,
+            max_loras=server_args.max_loras if server_args.enable_lora else 0,
         )
         logger.info(
             "Scheduler config: page_size=%s num_device_pages=%s "
diff --git a/python/tokenspeed/runtime/engine/scheduler_utils.py b/python/tokenspeed/runtime/engine/scheduler_utils.py
index fa0c8deff..0b6a3b8e5 100644
--- a/python/tokenspeed/runtime/engine/scheduler_utils.py
+++ b/python/tokenspeed/runtime/engine/scheduler_utils.py
@@ -70,6 +70,7 @@ def make_config(
     mamba_l2_host_slots: int = 0,
     paged_cache_groups: Sequence["PagedCacheGroupConfig"] | None = None,
     enable_mixed_prefill_decode: bool = False,
+    max_loras: int = 0,
 ) -> SchedulerConfig:
     cfg = SchedulerConfig()
     cfg.num_device_pages = num_device_pages
@@ -99,6 +100,7 @@ def make_config(
     cfg.enable_mamba_l2 = enable_mamba_l2
     cfg.mamba_l2_host_slots = mamba_l2_host_slots
     cfg.enable_mixed_prefill_decode = enable_mixed_prefill_decode
+    cfg.max_loras = max_loras
     if paged_cache_groups:
         cfg.paged_cache_groups = list(paged_cache_groups)
     return cfg
diff --git a/tokenspeed-scheduler/bindings/python_module.cpp b/tokenspeed-scheduler/bindings/python_module.cpp
index e40480b28..6c9358dd4 100644
--- a/tokenspeed-scheduler/bindings/python_module.cpp
+++ b/tokenspeed-scheduler/bindings/python_module.cpp
@@ -224,7 +224,8 @@ NB_MODULE(tokenspeed_scheduler_ext, m) {
         .def_rw("mamba_cache_chunk_size", &tokenspeed::SchedulerConfig::mamba_cache_chunk_size)
         .def_rw("mamba_pool_total_chunks", &tokenspeed::SchedulerConfig::mamba_pool_total_chunks)
         .def_rw("enable_mamba_l2", &tokenspeed::SchedulerConfig::enable_mamba_l2)
-        .def_rw("mamba_l2_host_slots", &tokenspeed::SchedulerConfig::mamba_l2_host_slots);
+        .def_rw("mamba_l2_host_slots", &tokenspeed::SchedulerConfig::mamba_l2_host_slots)
+        .def_rw("max_loras", &tokenspeed::SchedulerConfig::max_loras);
 
     nb::class_<tokenspeed::RequestSpec>(m, "RequestSpec")
         .def(nb::init<>())
diff --git a/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp b/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
index 6e644c907..076a07ae2 100644
--- a/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
+++ b/tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
@@ -578,9 +578,25 @@ Scheduler::newForwardOperation(std::vector<Request*> candidates) {
     std::vector<LoadBackOperation> loadback_ops;
     auto simulated_free =
         hybrid_prefix_cache_ ? hybrid_prefix_cache_->InitialSimulatedFree() : std::map<std::string, std::int32_t>{};
+
+    // Track unique LoRA adapter ids in this batch.  When max_loras > 0 we skip
+    // any request whose lora_id would push the count over the cap, deferring it
+    // to the next scheduling round.  This guarantees prepare_loras() never
+    // receives a batch that requires more GPU adapter slots than are available.
+    std::unordered_set<std::int32_t> batch_lora_ids;
+
     for (Request* request : candidates) {
         if (token_budget <= 0 || config_.max_batch_size == ops.size()) break;
 
+        // LoRA adapter cap: skip requests that would exceed max_loras unique ids.
+        if (config_.max_loras > 0 && request->lora_id() != kLoraNone) {
+            bool is_new = batch_lora_ids.find(request->lora_id()) == batch_lora_ids.end();
+            if (is_new && static_cast<std::int32_t>(batch_lora_ids.size()) >= config_.max_loras) {
+                continue;  // defer to next step
+            }
+            batch_lora_ids.insert(request->lora_id());
+        }
+
         if (request->Is<fsm::Prefilling>() && config_.role != Role::kD) {
             std::int32_t reserver_num_tokens = config_.role == Role::kP ? 0 : config_.decode_input_tokens;
             if (auto ev = schedulePrefill(request, token_budget, reserver_num_tokens, simulated_free)) {
diff --git a/tokenspeed-scheduler/csrc/scheduler/types.h b/tokenspeed-scheduler/csrc/scheduler/types.h
index 892fd79ef..a34d7b669 100644
--- a/tokenspeed-scheduler/csrc/scheduler/types.h
+++ b/tokenspeed-scheduler/csrc/scheduler/types.h
@@ -103,6 +103,12 @@ struct SchedulerConfig {
     std::int32_t mamba_pool_total_chunks{0};
     bool enable_mamba_l2{false};
     std::int32_t mamba_l2_host_slots{0};
+
+    // Maximum number of unique LoRA adapter ids allowed in a single batch.
+    // 0 means LoRA is disabled (no cap enforced).  When set, newForwardOperation
+    // defers requests that would push the batch over this limit to the next step,
+    // guaranteeing that prepare_loras() never sees n_unique > max_loras.
+    std::int32_t max_loras{0};
 };
 
 }  // namespace tokenspeed