diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md"
new file mode 100644
index 0000000..77175cc
--- /dev/null
+++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md"
@@ -0,0 +1,857 @@
+# KV 设计 5 - SSD 存储
+
+## 设计目标
+
+SSD 存储在 Fluxon KV 中作为 owner 本地 backing tier 接入通用 KV 链路。它不是一套独立的读写 API，也不改变用户侧 `put/get/delete` 语义；master 仍然以 key-version 为单位维护路由，内存副本是第一数据源，SSD 副本是内存副本不可用时的回填数据源。
+
+读取侧采用“内存优先、SSD 回填”的设计。`GetStart` 优先选择 live 内存副本；没有可用内存副本时，master 才选择 SSD owner，并分配 SSD owner 本机 source staging 和 requester target。SSD owner 从本地 SSD 读入 source staging，再复用现有 transfer engine 把数据推到 requester target，最后继续使用原有 `GetDone` 和 `MemHolder` 生命周期。
+
+## 公共契约
+
+SSD 相关的公共配置只有一个 owner-only 字段 `ssd_storage`；示例中的 `large_file_paths` 是 owner 已有配置，SSD cache 目录从它派生：
+
+```yaml
+fluxonkv_spec:
+  large_file_paths: [/data/fluxon_large]
+  ssd_storage:
+    max_bytes: 4294967296
+```
+
+规则：
+
+- `ssd_storage` 缺省或为 `null` 时不启用 SSD。
+- `max_bytes` 必须大于或等于 512 bytes，满足当前 `O_DIRECT` 对齐约束。
+- zero-contribution external 禁止声明 `ssd_storage`；external 只能通过 owner 的 mmap、RPC 和 transfer surface 访问 SSD 回填结果。
+- 实际目录为每个可用 large file root（即 `large_file_paths` 中的路径）下的 `<cluster_name>_cluster_kv_ssd_storage/<safe_instance_key>/`；owner 启动时创建目录并读取 `metadata.dev()`，同一个 device 只保留第一个 root，避免多个路径指向同一块盘时制造虚假的 IO 并行度。
+- 用户侧 `put/get/delete` API 不因 SSD 增加新入口；SSD 副本是 master 路由内部能力。
+
+## 范围边界
+
+| 范围 | 当前结论 |
+| --- | --- |
+| 分布式 SSD 读取 | 已接入。读取 key 时，master 仍优先选择可用内存副本；没有可用内存副本时，才选择持有 SSD 副本的 owner。磁盘数据先读到 SSD owner 本机的 source staging，再传到请求方 owner 的 target allocation。 |
+| owner 内部多 SSD 路径 | 已接入。owner 可通过多个 `large_file_paths` 使用多块本地 SSD；路径会先按实际 device 去重，只有落在不同 device 上的 SSD cache root 目录才会创建独立读写队列、`UringIoEngine` 和 shard 文件集。 |
+| 内存 KV 复用 | 已复用。SSD 回填继续走现有 KV transfer 链路：SSD owner 按 chunk 读出数据后，通过 `transfer_data_no_copy` 写到请求方 target；全部 chunk 完成后，SSD owner 向 master 提交 `get_done`，用户侧仍通过普通 `get` 拿到 `MemHolder`，不需要调用 SSD 专用接口。 |
+| SSD 写入 IO 模型 | 已接入。owner 完成内存 `PutDone` 后，再异步把同一份 payload 写入本地 SSD。SSD 写入在 `KvSsdStorage` 内完成，使用 shard ring、`O_DIRECT`、`io_uring`、有界队列和 `Writing -> Committed` 两阶段提交。 |
+| ring 位置生命周期 | 已接入。SSD 读写会保护正在使用的物理位置：读 IO 提交前会 pin 已提交的 entry；未完成写入的 `Writing` entry 和正在读取的 pinned entry 都不会被新的写入覆盖。 |
+| 大 payload direct stage | 已接入 aligned fast path 和 chunk pipeline。master 给 SSD source staging 在 512-byte 对齐后的读长度之外额外预留 511 bytes，并在 allocation 内返回 512-byte 对齐后的 `src_addr`；SSD read 按 chunk 对齐 IO 长度直接写入 staging，chunk ready 后立刻 transfer，`MemHolder` 仍只使用真实 payload 长度。 |
+| 冷启动恢复 | 当前不支持。owner 启动时不会扫描已有 SSD shard 来重建 master 路由；SSD 副本路由只来自本轮运行期间的 `put/get/delete` 生命周期。 |
+| lease key 专门治理 | 当前没有专用策略。带 lease 的 key 和普通 key 使用同一套 key-version 路由与 SSD 副本生命周期，SSD 层不单独维护 lease 过期扫描或清理规则。 |
+| 独立 SSD 路径参数 | 不提供。SSD cache 目录统一从 owner 的 `large_file_paths` 派生，不再增加单独的 SSD 路径配置，避免日志、共享 bundle、FS disk cache 和 KV SSD cache 出现多套路径来源。 |
+
+## 数据流
+
+```mermaid
+flowchart TD
+    A["owner put target allocation"] --> B["write bytes into owner mmap"]
+    B --> G["owner -> master PutDone(memory_ready)"]
+    G --> H["master route: nodes_replicas"]
+    B --> C["async KvSsdStorage.persist_from_addr(key, put_id, addr, len)"]
+    C --> D["copy payload to 512-byte aligned buffer"]
+    D --> E["per-device writer queue"]
+    E --> E2["SsdRingBuffer 分配 shard_id + file_offset，记录 Writing entry"]
+    E2 --> E3["O_DIRECT + io_uring writev 写入 SSD shard 文件"]
+    E3 --> F["提交索引：Writing -> Committed"]
+    F --> I["owner -> master SsdReplicaCommit"]
+    I --> J["master route: ssd_replicas"]
+
+    GS["get_start"] --> K{"live memory replica?"}
+    K -->|yes| L["return GetSourceKind::Memory"]
+    L --> M["existing transfer path"]
+
+    K -->|no| N{"live SSD replica?"}
+    N -->|yes| O["allocate source staging on SSD owner"]
+    O --> P["allocate target on requester"]
+    P --> Q["return GetSourceKind::Ssd"]
+    Q --> R0["SsdRingBuffer pin committed entry"]
+    R0 --> R1["根据 entry.shard_id 找到 device reader queue"]
+    R1 --> R["从 SSD shard 文件按 file_offset 读取 chunk"]
+    R --> S["SsdLoadedChunk(offset,len)"]
+    S --> W["SSD owner transfer chunk: staging+offset -> requester target+offset"]
+    W --> T["all chunks done: SSD owner -> master GetDoneReq"]
+    T --> V["SsdStageReadResp carries GetDoneResp fields"]
+
+    N -->|no| U["KeyNotFound"]
+```
+
+## 端到端调用时序
+
+SSD 路径只在两个位置扩展主链路：`put_done` 提交内存副本后，owner 异步把本地 target allocation 落到 SSD，并在完成后单独提交 SSD 副本；`get_start` 找不到可用内存副本时，master 为 SSD owner 分配 source staging，再由 SSD owner 按 chunk 把磁盘数据读入 staging 并 push 到 requester target。`get_done` 和 `MemHolder` 生命周期仍复用原有内存 KV 逻辑。SSD 回填时，最终 holder 对应的是请求方 owner 上的 target allocation；SSD owner 只负责从本地 SSD 读出数据、把全部 chunk 传到请求方 target，并在传输完成后向 master 调用 `GetDoneReq`。master 返回的 holder 字段会由 SSD owner 放入 `SsdStageReadResp` 带回请求方，请求方再用这些字段构造普通 `MemHolder`。
+
+```mermaid
+sequenceDiagram
+    participant C as requester owner
+    participant M as master
+    participant SO as SSD owner
+    participant TE as transfer engine
+    participant SSD as SSD shard files / SsdRingBuffer
+
+    C->>M: PutStartReq(key, len)
+    M-->>C: PutStartResp(target allocation)
+    Note over C: payload 写入 target allocation
+    C->>M: PutDoneReq(memory_ready)
+    Note right of M: nodes_replicas 写入内存副本\nkey-version 立即可读\nspawn post_put_ssd_replica_persist
+    M-->>C: PutDoneResp
+    M->>C: async SsdReplicaPersistReq(key, put_id, target_addr, len)
+    C->>SSD: KvSsdStorage.persist_from_addr(...)
+    Note over SSD: aligned buffer -> per-device writer queue\nSsdRingBuffer 分配 shard_id + file_offset\nO_DIRECT + io_uring writev 写入 SSD shard 文件\nWriting -> Committed
+    C->>M: SsdReplicaCommitReq(key, put_id, node_id, len)
+    Note right of M: ssd_replicas 写入 SSD 副本
+
+    C->>M: GetStartReq(key)
+    alt live memory replica exists
+        M-->>C: GetStartResp(source_kind=Memory, src_addr, target_addr)
+    else only SSD replica exists
+        Note right of M: 在 SSD owner CPU segment 分配 source_allocation\n在 requester CPU segment 分配 target allocation
+        M-->>C: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len)
+        C->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len)
+        SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len)
+        Note over SSD: SsdRingBuffer pin committed entry\n按 entry.shard_id 进入 device reader queue\n从 SSD shard 文件按 file_offset 读取 chunk
+        loop each ready chunk
+            SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len)
+            SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len)
+        end
+        SO->>M: GetDoneReq(get_id)
+        Note right of M: target allocation 进入 get_holding\nsource_allocation 释放
+        M-->>SO: GetDoneResp(holder_id for requester target)
+        SO-->>C: SsdStageReadResp(done_holder_id, done_allocation_mode)
+    end
+    opt source_kind=Memory
+        C->>TE: transfer_data_no_copy(read, src_addr -> target_addr, len)
+        C->>M: GetDoneReq(get_id)
+        Note right of M: target allocation 进入 get_holding
+        M-->>C: GetDoneResp(holder_id)
+    end
+```
+
+
+## 接口里的角色分工
+
+SSD 逻辑按接口看最清楚：`put` 先让一个 key-version 的内存副本 ready，再异步补交 SSD 副本；`get` 决定读请求先走内存副本还是 SSD fallback。每个接口里再分 master、owner、external 三个角色看状态归属。
+
+### put
+
+```mermaid
+sequenceDiagram
+    participant E as external
+    participant O as owner
+    participant M as master
+    participant SSD as owner SSD store
+
+    E->>O: ExternalPutStartReq(key, len)
+    O->>M: PutStartReq(key, len)
+    Note right of M: 分配 put_id 和 src/target allocation\n记录 inflight_puts
+    M-->>O: PutStartResp(put_id, src_addr, target_addr)
+    O-->>E: ExternalPutStartResp(offsets, put_id)
+
+    Note over E,O: external 写 owner mmap/staging
+    E->>O: ExternalPutTransferEndReq(put_id)
+    O->>O: transfer_data_no_copy if remote target
+    O->>M: PutDoneReq(memory_ready)
+    Note right of M: 写 nodes_replicas\nkey-version 立即可读\nspawn post_put_ssd_replica_persist
+    M-->>O: PutDoneResp
+    O-->>E: ExternalPutTransferEndResp
+    M->>O: async SsdReplicaPersistReq(key, put_id, target_addr, len)
+    O->>SSD: persist_from_addr(key, put_id, target_addr, len)
+    Note over SSD: device write_tx -> per-device ssd_writer_loop -> io_uring writev\nWriting -> Committed
+    O->>M: SsdReplicaCommitReq(key, put_id, node_id, len)
+    Note right of M: 写 ssd_replicas
+```
+
+#### master
+
+master 持有 `put` 的权威控制面状态：`inflight_puts` 记录未完成写入，`kv_routes` 记录提交后的当前版本。当前实现里 `PutDoneReq` 只表示内存副本 ready；SSD 副本通过独立 `SsdReplicaCommitReq` 进入 route。
+
+当前协议结构如下。
+
+```rust
+pub struct MasterKvRouterInner {
+    // PutStart 到 PutDone / PutRevoke 期间保留的 put 在途状态。
+    pub inflight_puts: moka::future::Cache<(String, u64, u32), InflightPutInfo>,
+    // 已提交 key-version 的权威路由表。
+    pub kv_routes: DashMap<String, Arc<OneKvNodesRoutes>>,
+    ...
+}
+
+pub struct InflightPutInfo {
+    // 放置策略最终选中的 target owner。
+    pub node_id: NodeID,
+    pub key: String,
+    // 发起这次 put 的原始请求节点。
+    pub req_node_id: NodeID,
+    pub len: u64,
+    // PutDone 前保留 source / target allocation，避免内存被提前释放。
+    pub src_target_allocation: Arc<Mutex<Option<InflightPutAllocation>>>,
+}
+
+pub struct OneKvNodesRoutes {
+    // 当前已提交 value 的稳定版本号。
+    pub put_id: PutIDForAKey,
+    // 内存副本路由；PutDone 成功后立即写入。
+    pub nodes_replicas: RwLock<HashMap<NodeID, KvRouteInfo>>,
+    // SSD 副本路由；只记录 owner 和长度，不保存本地文件 offset。
+    pub ssd_replicas: RwLock<HashMap<NodeID, KvSsdRouteInfo>>,
+    ...
+}
+
+pub struct PutDoneReq {
+    pub key: String,
+    // 和当前 route 版本匹配时，才提交内存副本。
+    pub put_id: PutIDForAKey,
+    pub lease_id: Option<u64>,
+}
+
+pub struct SsdReplicaCommitReq {
+    pub key: String,
+    // SSD late commit 必须用这个版本号防止污染新 route。
+    pub put_id: PutIDForAKey,
+    // 完成 SSD persist 的 owner 节点。
+    pub node_id: NodeIDString,
+    // 真实 payload 长度；SSD 文件 offset 只保存在 owner 本地。
+    pub len: u64,
+}
+```
+
+`PutStartReq` 到达 master 后，master 分配 `put_id` 和源/目标 allocation，并把 allocation 放进 `InflightPutInfo.src_target_allocation`。`PutDoneReq` 到达时，master 只把 target allocation 写入 `nodes_replicas`，此时 key-version 已经可被 `get` 命中。SSD owner 后续完成落盘后再发 `SsdReplicaCommitReq`，master 校验 `kv_routes[key].put_id == put_id` 后，把 `KvSsdRouteInfo { node_id, len, tomb_tag }` 写入同一个 `OneKvNodesRoutes.ssd_replicas`。master 不保存 SSD 文件 offset，也不保存 owner 本地 ring index。
+
+#### owner
+
+owner 持有数据面：本机 CPU segment、可选 SSD store、put transfer 和 SSD persist。当前实现里，SSD persist 发生在 master 收到 `PutDoneReq` 并提交内存路由之后，不能阻塞内存副本 ready。
+
+当前 owner 字段如下。
+
+```rust
+pub struct ClientKvApiInner {
+    // owner 本地可选 SSD cache；external 不直接持有它。
+    ssd_storage: Option<Arc<KvSsdStorage>>,
+    rpc_caller_put_start: RPCCaller<PutStartReq>,
+    rpc_caller_put_done: RPCCaller<PutDoneReq>,
+    rpc_caller_ssd_replica_commit: RPCCaller<SsdReplicaCommitReq>,
+    ...
+}
+
+pub struct SsdReplicaPersistReq {
+    pub key: String,
+    pub put_id: PutIDForAKey,
+    // 已经 PutDone 的内存 target 绝对地址，owner 从这里复制 payload 到 SSD。
+    pub target_addr: u64,
+    pub len: u64,
+}
+
+pub struct KvSsdStorage {
+    // 按 device 去重后的 SSD cache root 目录。
+    root_dirs: Vec<PathBuf>,
+    // 每个有效 device 对应一个读写 worker。
+    devices: Vec<SsdDeviceWorker>,
+    // shard_id 到 device worker 的映射，读路径按它选择 reader queue。
+    shard_to_device: Vec<usize>,
+    // 写入按有效 device 做 round-robin。
+    next_write_device: AtomicUsize,
+    // 全部 shard ring 和 key-version 索引的共享状态。
+    inner: Arc<Mutex<KvSsdStorageInner>>,
+    // ring 空间被 active IO 占住时，用它通知 writer 重试。
+    space_notify: Arc<Notify>,
+}
+
+struct SsdDeviceWorker {
+    // Linux metadata.dev() 得到的实际 device 标识。
+    device_id: u64,
+    root_dir: PathBuf,
+    // 这个 device 负责的 shard 文件编号。
+    shard_ids: Vec<usize>,
+    // 持有 shard 文件 fd，保证 uring IO 生命周期内 fd 有效。
+    _files: Vec<std::fs::File>,
+    // 这个 device 独立的 io_uring engine。
+    _io: Arc<UringIoEngine>,
+    // per-device 写队列。
+    write_tx: tokio_mpsc::Sender<WriteCommand>,
+    // per-device 读队列。
+    read_tx: tokio_mpsc::Sender<ReadCommand>,
+}
+
+struct KvSsdStorageInner {
+    // 管理各 shard 文件内的环形 offset 空间和 key-version 索引。
+    ring: SsdRingBuffer,
+}
+```
+
+当 master 把这次 put 的最终 target allocation 放在某个 owner 上时，这个 owner 就是该 key-version 的内存副本 owner。`PutDoneReq` 只把这个 target allocation 提交到 `nodes_replicas`；提交完成后，这个 key-version 已经可以被普通 `get` 读到。SSD 落盘不在 `PutDoneReq` 的同步路径里；master 会在后台 task 中向同一个 target owner 发送 `SsdReplicaPersistReq { key, put_id, target_addr, len }`。这个后台 task 会继续持有 target allocation 的 `Arc<Allocation>`，保证 owner 从内存复制 payload 到 SSD 之前，这块内存不会被释放或复用。
+
+target owner 收到 `SsdReplicaPersistReq` 后，从 `target_addr` 指向的内存 target 复制完整 payload，并构造 512-byte 对齐的 `AlignedBuffer`。随后 `persist_buffer` 按 value 级别通过 `next_write_device` round-robin 选择一个有效 device 的 `write_tx`；当前实现不会把同一个 payload 拆到多个 device。该 device 的 `ssd_writer_loop` 只在自己的 `shard_ids` 中选择一个 shard，由 `SsdRingBuffer::prepare_write_on_shards(...)` 为整个 aligned payload 分配一段连续 `file_offset`，并先记录 `Writing(SsdIndexEntry)`。对应 device 的 `UringIoEngine` 对这个 shard 文件执行 `O_DIRECT + writev`；写入成功后，entry 才从 `Writing` 提交为 `Committed`。最后 owner 向 master 发送 `SsdReplicaCommitReq`；master 校验请求里的 `put_id` 与当前内存 route 的 `put_id` 相同后，才会把这个 key-version 的 SSD 副本补充进 `ssd_replicas`。写队列和底层 uring 队列都是有界队列；如果 SSD 变慢，背压只停在 owner 本地 SSD persist 路径，不会回头改变已经完成的内存 `PutDone` 语义。
+
+#### external
+
+external 的状态边界只到 owner mmap 写入：它保存本次 put 所需的 `key`、`len`、`put_id` 和 mmap offset。SSD route 由 master 管理，SSD 文件位置由 target owner 本地 `SsdRingBuffer` 管理，external 不保存也不更新这些状态。
+
+```rust
+pub struct ExternalPutStartReq {
+    pub key: String,
+    pub len: u64,
+    // 透传给 master PutStart，用于拒绝同 key 并发 put。
+    pub reject_if_inflight_same_key: bool,
+    // 透传给 master 放置策略，影响 target owner 选择。
+    pub preferred_sub_cluster: Option<String>,
+    // owner 代际校验，防止旧 external 请求提交到新 owner。
+    pub started_time: i64,
+    pub test_observe_put_phases: bool,
+}
+
+pub struct ExternalPutTransferEndReq {
+    pub key: String,
+    pub len: u64,
+    // external 实际写入的 owner mmap offset；远端 target 时它是本地 staging。
+    pub src_offset: u64,
+    // 本地 target 时等于最终 target；远端 target 时由 owner 内部上下文修正。
+    pub target_offset: u64,
+    // 远端 target owner；本地 target 时为空。
+    pub peer_id: Option<String>,
+    // 远端 target owner 的 base addr；本地 target 时为空。
+    pub target_base_addr: Option<u64>,
+    // ExternalPutStart 返回的版本号，TransferEnd 用它完成 PutDone。
+    pub put_id: Option<PutIDForAKey>,
+    pub lease_id: Option<u64>,
+    pub started_time: i64,
+    pub test_observe_put_phases: bool,
+}
+```
+
+external put 仍然是 `ExternalPutStart -> 写 owner mmap -> ExternalPutTransferEnd`。`ExternalPutTransferEndResp` 只代表内存提交完成；SSD 是否启用、何时 persist 成功、何时写入 `ssd_replicas` 都由 owner 和 master 的内部 commit 协议决定。external 只通过 `started_time` 做 owner 代际校验，避免把旧代际请求提交给新 owner。
+
+### get
+
+```mermaid
+sequenceDiagram
+    participant E as external
+    participant RO as requester owner
+    participant M as master
+    participant SO as SSD owner
+    participant TE as transfer engine
+    participant SSD as owner SSD store
+
+    E->>RO: ExternalGetReq(key)
+    RO->>M: GetStartReq(key)
+    alt memory replica exists
+        M-->>RO: GetStartResp(source_kind=Memory)
+    else SSD fallback
+        Note right of M: 在 SSD owner 分配 source_allocation\n在 requester owner 分配 target allocation\n写 inflight_gets
+        M-->>RO: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len)
+        RO->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len)
+        SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len)
+        Note over SSD: pin committed entry\nproducer 按 chunk readv direct 或 scratch fallback
+        loop each ready chunk
+            SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len)
+            SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len)
+        end
+        SO->>M: GetDoneReq(get_id)
+        Note right of M: requester target allocation -> get_holding\nsource_allocation 释放
+        M-->>SO: GetDoneResp(holder_id for requester target)
+        SO-->>RO: SsdStageReadResp(done_holder_id, done_allocation_mode)
+    end
+    opt source_kind=Memory
+        RO->>TE: transfer_data_no_copy(read, src_addr -> target_addr, len)
+        RO->>M: GetDoneReq(get_id)
+        Note right of M: target allocation -> get_holding
+        M-->>RO: GetDoneResp(holder_id)
+    end
+    RO-->>E: ExternalGetResp(ExternalMemHolderInfo)
+```
+
+#### master
+
+master 是 `get` 的控制面 authority：`kv_routes` 决定当前 key-version 可以从哪些内存或 SSD 副本读取，`inflight_gets` 记录本次 get 的 source/target allocation，`get_holding` 记录 `GetDone` 后仍被 holder 持有的 requester target allocation。
+
+```rust
+pub struct MasterKvRouterInner {
+    // GetStart 到 GetDone / GetRevoke 期间保留的 get 在途状态。
+    pub inflight_gets: moka::future::Cache<u64, InflightGetInfo>,
+    // GetDone 后的 holder authority，键由 requester 节点和 holder_id 组成。
+    pub get_holding: MasterOwnerMemMgr,
+    // get_start 查询的当前稳定 key-version 路由。
+    pub kv_routes: DashMap<String, Arc<OneKvNodesRoutes>>,
+    ...
+}
+
+pub struct OneKvNodesRoutes {
+    // 当前稳定版本号，内存副本和 SSD 副本共享它。
+    pub put_id: PutIDForAKey,
+    // 内存副本优先作为 get source。
+    pub nodes_replicas: RwLock<HashMap<NodeID, KvRouteInfo>>,
+    // 内存副本不可用时才作为 SSD fallback source。
+    pub ssd_replicas: RwLock<HashMap<NodeID, KvSsdRouteInfo>>,
+    pub get_durable_slots_used: AtomicU32,
+}
+
+pub struct KvSsdRouteInfo {
+    // 持有本地 SSD 副本的 owner。
+    pub node_id: NodeID,
+    // 真实 payload 长度；SSD stage 和 transfer 对外只暴露这个长度。
+    pub len: u64,
+    // 和内存 route 对齐的节点代际，用于失效判断。
+    pub tomb_tag: NodeTombTag,
+}
+
+pub struct InflightGetInfo {
+    // 本次 get 命中的 key-version，用于拒绝过期完成。
+    pub put_id: PutIDForAKey,
+    // master 选中的 source 节点；SSD fallback 时是 SSD owner。
+    pub src_node_id: NodeID,
+    // 发起 get 的 requester owner，最终 holder 归属使用它。
+    pub req_node_id: NodeID,
+    pub len: u64,
+    // requester target allocation，GetDone 后进入 get_holding。
+    pub allocation: Arc<Allocation>,
+    // SSD source staging allocation；memory source 路径为空。
+    pub source_allocation: Option<Arc<Allocation>>,
+    pub route: Arc<OneKvNodesRoutes>,
+    pub allocation_mode: GetAllocationMode,
+    // 区分 memory source 和 SSD fallback source。
+    pub source_kind: GetSourceKind,
+}
+```
+
+master 处理 `GetStartReq` 时先查 `kv_routes`，并优先选择 live 内存副本。命中内存副本时，`GetStartResp` 返回 `GetSourceKind::Memory`，requester owner 按原有 transfer 路径把数据搬到 requester target。只有没有可用内存副本时，master 才从 `ssd_replicas` 里选择 SSD owner，并同时分配两块 allocation：`source_allocation` 位于 SSD owner，用作本地读盘 staging；`allocation` 位于 requester owner，是最终进入 holder 的 target。`GetStartResp.src_addr` 是 SSD owner 本地对齐后的 staging 地址，`target_addr` 是 requester target 地址，`ssd_stage_len` 是对齐后的 source staging 容量，`len` 始终是真实 payload 长度。
+
+`GetDoneReq` 到达后，master 从 `inflight_gets` 取出本次 get，把 requester target allocation 转入 `get_holding`，并返回 `holder_id`。memory source 路径由 requester owner 调用 `GetDoneReq`；SSD source 路径由 SSD owner 在全部 chunk transfer 完成后调用。无论谁发起 `GetDoneReq`，holder 都归属 `InflightGetInfo.req_node_id` 对应的 requester owner，SSD owner 的 `source_allocation` 只作为读盘 staging，不进入 `get_holding`。
+
+#### owner
+
+owner 在 `get` 里有两个可能角色：requester owner 负责调用 master，并根据 `GetSourceKind` 选择 memory transfer 或 SSD stage RPC；SSD owner 负责响应 `SsdStageReadReq`，读取本地 SSD，把读出的 bytes 按 chunk push 到 requester target，并在全部 chunk transfer 完成后向 master 发送 `GetDoneReq`。
+
+```rust
+pub struct ClientKvApiInner {
+    // owner 本地可选 SSD cache；get 路径上由 SSD owner 角色通过它读取本地 SSD，requester 角色不直接读盘。
+    ssd_storage: Option<Arc<KvSsdStorage>>,
+    // external get 返回的 holder 在 owner 侧的借用表。
+    pub external_get_holding: OwnerExternalMemMgr,
+    rpc_caller_get_start: RPCCaller<GetStartReq>,
+    rpc_caller_get_done: RPCCaller<GetDoneReq>,
+    rpc_caller_ssd_stage_read: RPCCaller<SsdStageReadReq>,
+    ...
+}
+
+pub struct SsdStageReadReq {
+    pub key: String,
+    pub put_id: PutIDForAKey,
+    // SSD owner 用它在全部 chunk transfer 完成后调用 master GetDoneReq。
+    pub get_id: u64,
+    // master 在 SSD owner 上分配的 source staging 对齐地址。
+    pub stage_addr: u64,
+    // source staging 容量，包含 O_DIRECT 对齐需要的空间。
+    pub stage_len: u64,
+    // 最终接收数据的 requester owner。
+    pub target_node_id: NodeIDString,
+    // requester target allocation 的绝对地址。
+    pub target_addr: u64,
+    // 真实 payload 长度。
+    pub len: u64,
+}
+
+pub struct SsdStageReadResp {
+    // master GetDoneResp 的 holder_id 投影。
+    pub done_holder_id: u64,
+    // master GetDoneResp 的 allocation_mode 投影。
+    pub done_allocation_mode: GetAllocationMode,
+    // master GetDoneResp 的状态字段投影。
+    pub done_error_code: ErrorCode,
+    pub done_error_json: String,
+    pub done_server_process_us: i64,
+    // SsdStageRead RPC 自身的状态字段。
+    pub error_code: ErrorCode,
+    pub error_json: String,
+}
+```
+
+requester owner 收到 `GetSourceKind::Memory` 时，继续走原有内存 transfer：从 `src_addr` 读，把数据写到 `target_addr`，传输完成后由 requester owner 自己调用 master `GetDoneReq`。收到 `GetSourceKind::Ssd` 时，requester owner 不自己读 SSD，也不自己调用 `get_done`；它向 SSD owner 发起 `SsdStageReadReq`，等待 `SsdStageReadResp` 带回 master `GetDoneResp` 的 holder 字段。
+
+SSD owner 收到 `SsdStageReadReq` 后，在本地执行 `load_and_push_kv_from_ssd(...)`。read producer 先 pin 当前 committed entry，再按 chunk 从 SSD shard 文件读到 `stage_addr + offset`；transfer consumer 每收到一个 `SsdLoadedChunk`，就把 `stage_addr + offset` 推到 requester 的 `target_addr + offset`。全部 chunk transfer 成功后，SSD owner 用 `get_id` 向 master 调 `GetDoneReq`，再把返回的 `holder_id` 和 `allocation_mode` 填入 `SsdStageReadResp.done_*` 返回 requester。读路径进入 per-device reader queue，底层 `UringIoEngine` 把 read/write 分成独立发送队列，并按 inflight 比例补读，避免回填读长期排在持续写入之后。
+
+```rust
+struct SsdRingBuffer {
+    // key-version 到 Writing/Committed SSD 位置的全局索引。
+    entries: HashMap<KvSsdKey, SsdEntryState>,
+    // active read pin，防止 writer 推进 tail 覆盖正在读取的位置。
+    read_pins: HashMap<KvSsdKey, SsdReadPinInfo>,
+    ...
+}
+
+enum SsdEntryState {
+    // 已分配 offset 但 writev 尚未完成。
+    Writing(SsdIndexEntry),
+    // writev 成功后才进入 Committed；只有 Committed entry 才能被 pin 读取，并在 SsdReplicaCommit 后被 get_start 作为 SSD source 命中。
+    Committed(SsdIndexEntry),
+}
+```
+
+`read_pins` 是 owner 本地 SSD ring 的生命周期保护，防止 writer 推进 tail 时覆盖 active read。chunk pipeline 在整个 producer 生命周期内持有同一个 read pin；每个 chunk 单独提交 read task。direct read 条件满足时，`readv` 直接写到 `SsdStageReadReq.stage_addr + offset`；否则先读 scratch aligned buffer，再复制当前 chunk 的真实 payload 长度到 staging。direct read 省掉的是 scratch buffer 到 source staging 的本机 memcpy，不省掉 `source staging -> requester target` 的 transfer。请求方 target 是否远端不影响 SSD direct read 的对齐判断。
+
+#### external
+
+external 的状态边界只到 owner 返回的 mmap holder：它发 `ExternalGetReq` 给 requester owner，并接收 `ExternalMemHolderInfo { offset, len, holder_id }`。SSD route 由 master 管理，SSD 文件位置和 source staging 由 SSD owner 管理，external 不保存也不更新这些状态。
+
+```rust
+pub struct ExternalGetReq {
+    pub key: String,
+    // external 通过 owner 发起 get，req_node_id 仍指向请求方身份。
+    pub req_node_id: String,
+    // owner 代际校验，防止过期 external 请求继续使用旧 owner。
+    pub started_time: i64,
+}
+
+pub struct ExternalGetResp {
+    pub error_code: ErrorCode,
+    pub error_json: String,
+    // 成功时返回 external 可见的 holder 元数据。
+    pub external_memholder_info: Option<ExternalMemHolderInfo>,
+}
+
+pub struct ExternalMemHolderInfo {
+    // external attach 到 owner mmap 后可见的 offset。
+    pub offset: u64,
+    // 真实 payload 长度。
+    pub len: u32,
+    // 后续 release ack 使用的 holder id。
+    pub holder_id: u64,
+}
+
+pub struct ExternalMemHolder {
+    pub offset: u64,
+    // 当前 external 进程内 mmap 后的绝对地址。
+    pub addr: u64,
+    pub len: u32,
+    pub holder_id: u64,
+    pub key: String,
+    pub external_client_id: String,
+    // drop/release 时校验 owner 代际。
+    pub owner_start_time: i64,
+    ...
+}
+```
+
+owner 内部完成普通 `get` 后，会把 `MemoryInfo` 写入 `external_get_holding`，用这条 owner 侧引用代表 external 当前仍在借用该 holder；随后 owner 只把 `ExternalMemHolderInfo { offset, len, holder_id }` 返回给 external。external 构造 `ExternalMemHolder` 后，通过 owner mmap 的 `offset/addr` 读取结果。external holder drop 时，会向 owner 发送 `ExternalDeleteAckReq`；owner 删除 `external_get_holding` 中对应记录，释放 external 这一份引用。只有当 owner 侧不再有其它 `Arc<MemoryInfo>` 引用时，`MemoryInfo` drop 才会沿用原有 owner -> master holder ack 链路释放 master `get_holding`。
+
+### stage 失败和释放
+
+```mermaid
+sequenceDiagram
+    participant RO as requester owner
+    participant M as master
+    participant SO as SSD owner
+
+    RO->>SO: SsdStageReadReq
+    SO-->>RO: stage error
+    RO->>M: GetRevokeReq(drop_ssd_source=true)
+    Note right of M: 查 inflight_gets\n确认 source_kind=Ssd\n删除 route.ssd_replicas[src_node_id]
+    alt no live replica remains
+        M->>M: remove kv_routes and prefix index
+    end
+```
+
+```rust
+pub struct GetRevokeReq {
+    // 要撤销的在途 get。
+    pub get_id: u64,
+    // 只有 SSD stage 失败时才置 true，用来删除失败的 SSD source route。
+    pub drop_ssd_source: bool,
+}
+```
+
+SSD stage 失败时，请求方调用 `get_revoke_ssd_source(...)`，也就是 `GetRevokeReq { drop_ssd_source: true }`。master 从 `inflight_gets` 找到本次 get，只有 `source_kind == GetSourceKind::Ssd` 时才会删除 `route.ssd_replicas[src_node_id]`，避免后续 get 继续选择同一个失败 SSD source。如果同一个 `OneKvNodesRoutes` 下已经没有 live 内存副本和 SSD 副本，master 再删除 `kv_routes` 并异步清理 prefix index。
+
+这里的释放边界是：SSD owner 上的 `source_allocation` 只服务本次 stage，失败后随 `inflight_gets` 清理释放；requester target allocation 没有进入 `get_holding`，因此不会生成用户可见 holder。
+
+## 关键代码片段
+
+### put_done 只提交内存副本
+
+当前实现中，`put_done` 只把内存 target allocation 写入 `nodes_replicas`。SSD 是否落盘不影响这次 `PutDone` 的可见性。
+
+```rust
+pub struct PutDoneReq {
+    pub key: String,
+    // 只提交这个 key-version 的内存副本。
+    pub put_id: PutIDForAKey,
+    pub lease_id: Option<u64>,
+}
+
+// 这里只把内存 target 写入 nodes_replicas；SSD 副本稍后独立 commit。
+one_kv_routes
+    .nodes_replicas
+    .write()
+    .insert(node_id.clone(), completed_info);
+```
+
+这段边界是：`nodes_replicas` 代表内存副本 ready，`get_start` 可以立即从这里返回 memory source；`ssd_replicas` 不能在这一步写入，否则 `PutDone` 会被 SSD persist 延迟拖住。SSD 副本后续用同一个 `put_id` 独立提交。
+
+### SSD replica 独立 commit
+
+SSD owner 后台 persist 成功后，单独向 master 提交同一个 key-version 的 SSD 副本。master 必须校验当前 route 的 `put_id` 仍然匹配，避免旧版本 SSD late commit 污染新版本路由。
+
+```rust
+pub struct SsdReplicaCommitReq {
+    pub key: String,
+    // 必须匹配当前 route 版本，避免 SSD late commit 污染新版本。
+    pub put_id: PutIDForAKey,
+    // 完成落盘的 SSD owner。
+    pub node_id: NodeIDString,
+    // 真实 payload 长度。
+    pub len: u64,
+}
+
+if let Some(route) = kv_routes.get(&req.key) {
+    // 过期 put_id 直接丢弃，不 resurrect 旧版本。
+    if route.put_id == req.put_id {
+        // master 只保存 SSD owner 和长度；文件 offset 留在 owner 本地 ring index。
+        route.ssd_replicas.write().insert(
+            node_id.clone(),
+            KvSsdRouteInfo {
+                node_id: node_id.clone(),
+                len: req.len,
+                tomb_tag,
+            },
+        );
+    }
+}
+```
+
+master 只在 `req.put_id == route.put_id` 时写 `ssd_replicas`；过期 `put_id` 的 late commit 会被丢弃，不能 resurrect 旧版本。`SsdReplicaCommitReq.len` 是真实 payload 长度；SSD shard 文件 offset 不进入 master route，只留在 target owner 本地 `SsdRingBuffer`。
+
+### get_start 分配分布式 SSD staging
+
+SSD fallback 发生在 master 已经没有可用 `nodes_replicas` 之后。source staging 一定分配在 SSD owner 的 CPU segment 上，target allocation 一定分配在 requester 的 CPU segment 上。
+
+```rust
+// SSD read 使用 O_DIRECT，读长度先按 512 bytes 对齐。
+let ssd_stage_len = align_ssd_io_len(ssd_replica.len)?;
+// 额外预留 511 bytes，确保 allocation 内能找到 512-byte 对齐地址。
+let source_alloc_len = ssd_stage_len + SSD_ALIGNMENT as u64 - 1;
+
+// source staging 放在 SSD owner 上，只服务本次读盘和 push。
+let source_allocation = allocate_get_buffer_on_node(
+    &view,
+    &ssd_replica.node_id,
+    source_alloc_len,
+    get_id,
+    "ssd source staging",
+)?;
+// target allocation 放在 requester 上，GetDone 后转成最终 holder。
+let target_allocation = allocate_get_buffer_on_node(
+    &view,
+    &req_node_id,
+    ssd_replica.len,
+    get_id,
+    "requesting target",
+)?;
+
+// 通过 GetStartResp.src_addr 返回、再由 SsdStageReadReq.stage_addr 传给 SSD owner 的是对齐后的 staging 地址，不一定等于 allocation 起点。
+let source_addr = align_ssd_stage_addr(source_base + source_allocation.addr())?;
+```
+
+这里的关键边界是：`source_allocation` 在 SSD owner 上，只用于读盘 staging；`target_allocation` 在 requester owner 上，成功 `GetDone` 后进入 `get_holding`。`source_alloc_len = align_up(len, 512) + 511`，保证 allocation 内总能找到 512-byte 对齐的 `src_addr`；`src_addr` 是对齐后的 staging 地址，不一定等于 `source_allocation` 起点。
+
+### requester 触发 SSD owner stage/push/done
+
+请求方收到 `GetSourceKind::Ssd` 后，让 SSD owner 把数据读入 `src_addr`、按 chunk push 到 `target_addr + offset`，并由 SSD owner 直接完成 master `get_done`。这里没有新增用户 API；`SsdStageReadReq` 是 owner 内部 RPC。stage RPC 成功返回时，requester target 已经可读，并且 requester 已经拿到 master done 结果；请求方跳过自己的 transfer 分支，也跳过自己的 `get_done`。
+
+```rust
+let mut ssd_done_resp = None;
+if resp.source_kind == GetSourceKind::Ssd {
+    // SSD owner 负责读盘、push chunk，并在完成后调用 master GetDoneReq。
+    let done_resp = self.stage_kv_from_ssd_source(
+        &resp.node_id,
+        key,
+        put_id,
+        get_id,
+        resp.src_addr,
+        resp.target_addr,
+        data_len as u64,
+        resp.ssd_stage_len,
+    )
+    .await?;
+    ssd_done_resp = Some(done_resp);
+}
+
+if resp.source_kind == GetSourceKind::Ssd {
+    // SSD owner 已经把全部 chunk push 到 target_addr，并完成 get_done。
+} else {
+    // memory source 路径仍由 requester 自己做 transfer。
+    self.view.client_transfer_engine()
+        .transfer_data_no_copy(peer_id, true, resp.src_addr, resp.target_addr, len, None)
+        .await?;
+}
+
+let done_resp = if let Some(done_resp) = ssd_done_resp {
+    // SSD source 路径复用 SsdStageReadResp 带回的 GetDoneResp 字段。
+    done_resp
+} else {
+    // memory source 路径的 GetDoneReq 仍由 requester 发送。
+    self.get_done(get_id).await?
+};
+```
+
+SSD source 路径里，`stage_kv_from_ssd_source(...)` 成功返回时，SSD owner 已经完成读盘、chunk transfer 和 master `GetDoneReq`。requester 因此跳过自己的 transfer 和 `get_done`，直接复用 `SsdStageReadResp.done_*` 构造 holder。memory source 路径仍由 requester 自己 transfer 并调用 `get_done`。
+
+### SSD chunk read 与 direct/scratch fallback
+
+当前实现只有 SSD 回填读路径会把 payload 切成 chunk；SSD 写入按 value 级别一次写入一个 device 的一个 shard 连续 offset。`SsdLoadedChunk` 是 read producer 交给 transfer consumer 的最小就绪单元；`ReadCommand` 记录本次 chunk 要读的 committed entry、shard 文件 offset 和读入目标。
+
+```rust
+pub(crate) struct SsdLoadedChunk {
+    // 当前 chunk 在完整 payload 中的偏移。
+    pub offset: u64,
+    // 当前 chunk 在 SSD owner source staging 中的起始地址。
+    pub stage_addr: u64,
+    // 当前 chunk 的真实 payload 长度，不包含 O_DIRECT padding。
+    pub len: u64,
+}
+
+struct ReadCommand {
+    key: KvSsdKey,
+    // 已 pin 的 committed entry，里面包含 shard_id、file_offset 和长度。
+    entry: SsdIndexEntry,
+    // 这次 chunk read 在 SSD shard 文件内的起始 offset。
+    file_offset: u64,
+    // Direct 表示直接读入 staging，Scratch 表示先读入 aligned buffer。
+    target: ReadTarget,
+    // 持有 read pin，防止 writer 在读完成前覆盖该位置。
+    _read_pin: Option<SsdReadPin>,
+    done_tx: oneshot::Sender<KvResult<ReadOutput>>,
+}
+```
+
+`load_and_push_kv_from_ssd(...)` 把 SSD read 和 transfer 做成流水线：producer 按 chunk 提交 SSD read，并把读好的 chunk 放入 ready queue；consumer 收到 ready chunk 后立即 push 到 requester target。读和传输可以重叠，多个 chunk 可以同时处于 read inflight 或 transfer inflight 状态。
+
+```rust
+// ready queue 让 read producer 和 transfer consumer 解耦。
+let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel(
+    DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT.saturating_mul(2).max(1),
+);
+
+// producer 按 chunk 从 SSD shard 文件读入 source staging。
+let producer = store.load_into_addr_chunks(
+    key,
+    put_id,
+    stage_addr,
+    len,
+    stage_len,
+    DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES,
+    DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT,
+    chunk_tx,
+);
+// consumer 收到 ready chunk 后立即 push 到 requester target。
+let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx);
+// 两个 future 并发执行，形成 read-transfer pipeline。
+let (producer_res, consumer_res) = ::tokio::join!(producer, consumer);
+```
+
+`load_into_addr_chunks(...)` 先 pin 当前 committed entry，pin 生命周期覆盖整个 producer。每个 chunk 用 `entry.file_offset + offset` 定位 SSD shard 文件中的读取位置，并根据 staging 地址、文件 offset 和 staging 容量选择 direct 或 scratch；chunk read 完成后立即发送 `SsdLoadedChunk`。
+
+```rust
+// pin 生命周期覆盖整个 producer，writer 不能覆盖 active read 位置。
+let (entry, _read_pin) = {
+    let mut inner = self.inner.lock();
+    let Some(entry) = inner.ring.pin_read(&key) else {
+        return Err(KvError::Api(ApiError::KeyNotFound { key: key.key.clone() }));
+    };
+    (entry, SsdReadPin { ... })
+};
+
+// 每个 chunk 在同一个 committed entry 内推进文件 offset。
+let file_offset = entry.file_offset + offset;
+let target = match choose_chunk_read_path(stage_addr, read_len, target_len, file_offset) {
+    // staging 地址、文件 offset 和 IO 长度都满足对齐时走 direct read。
+    SsdReadPath::Direct => ReadTarget::Direct {
+        target_addr: stage_addr,
+        len: read_len as usize,
+    },
+    // 否则先读到 aligned scratch buffer。
+    SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len as usize)?),
+};
+
+// submit_read_command 根据 entry.shard_id 进入对应 device reader queue。
+let output = submit_read_command(key, entry, file_offset, target, None).await?;
+if let ReadOutput::Scratch(buffer) = output {
+    // scratch 路径只把真实 payload 长度复制到 staging。
+    copy_payload_to_stage(buffer, stage_addr, payload_len)?;
+}
+// 下游 transfer 只看到真实 payload 长度。
+ready_tx.send(SsdLoadedChunk { offset, stage_addr, len: payload_len }).await?;
+```
+
+direct 路径把 `readv` 的目标直接设为当前 chunk 的 source staging；scratch 路径先读入 aligned buffer，再只复制当前 chunk 的真实 payload 长度到 staging。两条路径最后都只把真实 payload 长度暴露给 transfer 和 `MemHolder`，不会把 `O_DIRECT` padding 暴露给用户。
+
+## IO 模型
+
+```mermaid
+flowchart TD
+    A["large_file_paths"] --> B["derive SSD roots"]
+    B --> C["create root dirs + metadata.dev()"]
+    C --> D["deduplicate device roots"]
+    D --> E0["SsdDeviceWorker device 0"]
+    D --> E1["SsdDeviceWorker device 1"]
+    E0 --> F0["shard_ids: 0,2,..."]
+    E1 --> F1["shard_ids: 1,3,..."]
+    F0 --> G0["device 0 writer/read queues"]
+    F1 --> G1["device 1 writer/read queues"]
+    G0 --> H0["device 0 UringIoEngine"]
+    G1 --> H1["device 1 UringIoEngine"]
+    I["persist_buffer"] --> J["next_write_device round-robin"]
+    J --> G0
+    J --> G1
+    K["submit_read_command(entry.shard_id)"] --> L["shard_to_device"]
+    L --> G0
+    L --> G1
+```
+
+| 组件 | 设计 |
+| --- | --- |
+| device root | owner 从 `large_file_paths` 派生 SSD cache root；创建目录后用 `metadata.dev()` 判断真实 device，同一 device 只保留一个有效 root。 |
+| shard 文件 | `max_bytes` 是 owner 本地 SSD cache 的容量上限；容量被拆成多个 shard 文件，分布到有效 device root 的 `shards/` 下。`shard_to_device` 记录每个 shard 属于哪个 device。 |
+| 写入选路 | `persist_buffer` 用 `next_write_device` round-robin 选择一个 device；一个 payload 只进入这个 device 的 writer queue，并在该 device 的某个 shard 中分配一段连续 `file_offset`。 |
+| 读取选路 | committed entry 保存 `shard_id` 和 `file_offset`；读 chunk 时通过 `entry.shard_id -> shard_to_device` 找到 device reader queue，再从对应 shard 文件的 `file_offset + offset` 读取。 |
+| per-device worker | 每个有效 device 有独立 writer queue、reader queue 和 `UringIoEngine`；这些 worker 只处理本 device 的 shard 文件 IO。 |
+| 对齐与回收 | SSD shard 使用 `O_DIRECT`，要求地址、长度和文件 offset 512-byte 对齐；不满足 direct 条件的读 chunk 走 scratch buffer。ring head/tail 和 read pin 只在 owner 本地保护 shard 文件位置，不进入 master route。 |
+
+## Task / Actor / 独立线程
+
+这一节只列运行时执行单元，不再重复 device/shard 选路。SSD 没有新增独立的 master route actor；控制面仍由原有 master/owner RPC handler 承载。新增的后台执行主要在 owner 本地：每个有效 device 有 writer task、reader task 和对应的 `UringIoEngine` 后台线程。
+
+### owner 本地 SSD IO 执行单元
+
+| 执行单元 | 创建位置 | 类型 | 输入 | 职责 |
+| --- | --- | --- | --- | --- |
+| `ssd_writer_loop` | `KvSsdStorage::new`，每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.write_tx` | 从 `persist_from_addr` 接收写任务，只在本 device 的 `shard_ids` 内调用 `SsdRingBuffer::prepare_write_on_shards`，提交 `writev`，完成后 `commit(Writing -> Committed)`。 |
+| `ssd_reader_loop` | `KvSsdStorage::new`，每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.read_tx` | 从 `load_into_addr_chunks` 接收属于本 device shard 的 chunk 读任务，提交 direct/scratch `readv`，校验 offset 仍有效，完成后回传 chunk 读结果；`SsdReadPin` 由 `load_into_addr_chunks` 的 producer 持有，整条 producer 完成后才释放。 |
+| `fluxon-kv-ssd-uring-{idx}` | 每个 device 的 `UringIoEngine::new_multi` | `std::thread::spawn` | `read_rx/write_rx: crossbeam::channel` | 每个线程持有一个 `IoUring`，只提交本 device shard 文件的 `Readv/Writev` SQE，并按 read/write inflight 比例调度后回传 CQE。 |
+
+`KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 shard fd 和 `UringIoEngine`，保证 fd 与 uring 线程生命周期覆盖 writer/reader task；drop 时关闭 channel 并 join uring 线程。
+
+### 控制面 RPC / 清理任务
+
+| 执行单元 | 创建位置 | 类型 | 输入 | 职责 |
+| --- | --- | --- | --- | --- |
+| `rpc_ssd_replica_commit` | `MasterKvRouter` RPC handler 注册 | `view.spawn(...)` | `SsdReplicaCommitReq` | owner SSD persist 成功后提交 SSD 副本，master 校验 `put_id` 后写 `ssd_replicas`。 |
+| `rpc_ssd_stage_read` | `ClientKvApi` RPC handler 注册 | `view.spawn(...)` | `SsdStageReadReq` | 远端 SSD owner 收到 stage 请求后，在 owner 进程内调用 `load_and_push_kv_from_ssd(...)`；SSD read producer 和 transfer consumer 流水线完成后，再调用 master `get_done` 并回传 done fields。 |
+| `ssd_failure_remove_prefix_index` | `get_revoke(drop_ssd_source=true)` | `view.spawn(...)` | 失败 SSD source 的 key | 当失败 SSD source 是最后一个 live replica 时，异步删除 prefix index。 |
+
+SSD route 的权威更新点仍是原有 master RPC handler：
+
+- `PutDone`：同步更新 `nodes_replicas`，让内存副本立即可读。
+- `SsdReplicaCommit`：SSD persist 完成后同步更新 `ssd_replicas`，并拒绝过期 `put_id`。
+- `GetStart`：同步选择内存副本或 SSD 副本，并写入 `inflight_gets`。
+- `GetRevoke`：同步删除失败 SSD source；必要时触发异步的 prefix index 清理任务（`ssd_failure_remove_prefix_index`）。
+- `Delete` / 覆盖写失效：复用原有 `delete_broadcast` 管线。
+
+## 不变量
+
+- `nodes_replicas` 和 `ssd_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`，不能跨版本复用。
+- `PutDoneReq` 只表示内存副本 ready；SSD 副本只能由匹配当前 `put_id` 的 `SsdReplicaCommitReq` 补充进 `ssd_replicas`。
+- `SsdReplicaCommitReq` 是内部控制面 RPC，不改变用户侧 `put/get/delete` API。
+- `GetSourceKind::Ssd` 必须同时有 SSD owner source staging 和 requester target allocation；成功后只有 requester target allocation 进入 `get_holding`。
+- SSD 回填失败必须通过 `get_revoke(drop_ssd_source=true)` 清理 in-flight get，并从 master 路由里移除失败的 SSD 副本。
+- master 路由被删除后，旧 SSD bytes 即使还在 shard 文件里，也不能被公共 `get` 命中。
+
+## 关键结论
+
+这套实现把 SSD 作为内存 KV 之外的可回填数据源副本，而不是新增一套用户 API。写入侧先完成内存 `PutDone`，再由 target owner 异步落 SSD，并通过 `SsdReplicaCommitReq` 补充 SSD route；读取侧优先使用内存副本，内存副本不可用时由 SSD owner 从本地 shard 文件读出数据，按 chunk push 到 requester target，再复用原有 `get_done` 和 holder 生命周期。
+
+因此，SSD 相关的 shard ring、`O_DIRECT`、`io_uring`、read pin 和 read/transfer pipeline 都限制在 owner 本地实现内；master 只保存这个 key-version 有哪些 owner 持有 SSD 副本，以及 value 的真实 payload 长度，不保存 SSD 文件 offset、shard_id 或本地 ring 状态。
diff --git a/fluxon_rs/Cargo.lock b/fluxon_rs/Cargo.lock
index a4b0ecd..3e8638a 100644
--- a/fluxon_rs/Cargo.lock
+++ b/fluxon_rs/Cargo.lock
@@ -1237,6 +1237,7 @@ dependencies = [
  "hyper 0.14.32",
  "iceoryx2",
  "iceoryx2-cal",
+ "io-uring",
  "kanal",
  "lazy_static",
  "libc",
@@ -2395,6 +2396,17 @@ dependencies = [
  "str_stack",
 ]
 
+[[package]]
+name = "io-uring"
+version = "0.7.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9080b15e63775b9a2ac7dca720f7050a8b955e092ea0f6020a4a80f69998cdc0"
+dependencies = [
+ "bitflags 2.9.1",
+ "cfg-if",
+ "libc",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.11.0"
diff --git a/fluxon_rs/fluxon_kv/Cargo.toml b/fluxon_rs/fluxon_kv/Cargo.toml
index 22ff136..8208216 100644
--- a/fluxon_rs/fluxon_kv/Cargo.toml
+++ b/fluxon_rs/fluxon_kv/Cargo.toml
@@ -75,6 +75,7 @@ bytes = "1"
 pprof = { version = "0.15", features = ["flamegraph"] }
 hex = "0.4"
 sha2 = "0.10"
+io-uring = "0.7"
 tokio-tungstenite = { version = "0.21", default-features = false, features = ["connect", "handshake"], optional = true }
 
 sockudo-ws = { version = "^1.7.4", default-features = false, features = ["tokio-runtime", "fastrand"], optional = true }
diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs
index f309dd0..29da3f8 100644
--- a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs
+++ b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs
@@ -13,7 +13,7 @@ use crate::{
     cluster_manager::NodeID,
     master_kv_router::msg_pack::{
         GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq,
-        GetStartReq, GetStartResp,
+        GetSourceKind, GetStartReq, GetStartResp,
     },
     p2p::msg_pack::MsgPack,
     rpcresp_kvresult_convert::msg_and_error::codes_api,
@@ -26,19 +26,27 @@ use std::sync::Arc;
 pub struct RemoteGetInfo {
     get_id: u64,
     data_len: usize,
+    source_kind: GetSourceKind,
     src_addr: u64,
     target_addr: u64,
     node_id: NodeID,
     peer_is_src_or_target: bool,
 }
 
+impl RemoteGetInfo {
+    pub fn source_kind(&self) -> GetSourceKind { // Returns the source kind stored in this get info.
+        self.source_kind
+    }
+}
+
 impl std::fmt::Display for RemoteGetInfo {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
-            "GetInfo{{ get_id: {}, data_len: {} bytes, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}",
+            "GetInfo{{ get_id: {}, data_len: {} bytes, source_kind: {:?}, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}",
             self.get_id,
             self.data_len,
+            self.source_kind,
             self.src_addr,
             self.target_addr,
             self.node_id,
@@ -177,8 +185,80 @@ impl ClientKvApiInner {
             );
         }
 
+        let mut ssd_done_resp = None;
+        if resp.source_kind == GetSourceKind::Ssd {
+            let ssd_stage_len = resp.ssd_stage_len;
+            if ssd_stage_len < data_len as u64 {
+                #[cfg(test)]
+                {
+                    self.test_record.remove_transfering_get(get_id);
+                }
+
+                self.get_revoke(get_id).await?;
+                return Err(KvError::Api(ApiError::InvalidArgument {
+                    detail: format!(
+                        "invalid ssd stage len for key={} get_id={} data_len={} ssd_stage_len={}",
+                        key, get_id, data_len, ssd_stage_len
+                    ),
+                }));
+            }
+            let done_resp = match self
+                .stage_kv_from_ssd_source(
+                    &resp.node_id,
+                    key,
+                    put_id,
+                    get_id,
+                    abs_src,
+                    abs_target,
+                    data_len as u64,
+                    ssd_stage_len,
+                )
+                .await
+            {
+                Ok(done_resp) => done_resp,
+                Err(err) => {
+                    tracing::warn!(
+                        "kv get ssd stage failed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}, err={}",
+                        key,
+                        resp.node_id,
+                        abs_src,
+                        abs_target,
+                        data_len,
+                        ssd_stage_len,
+                        err
+                    );
+
+                    #[cfg(test)]
+                    {
+                        self.test_record.remove_transfering_get(get_id);
+                    }
+
+                    obe_get_transfer_error(&metrics, &client_id, &node_role, key, data_len as u64);
+                    self.get_revoke_ssd_source(get_id).await?;
+                    return Err(err);
+                }
+            };
+            ssd_done_resp = Some(done_resp);
+            tracing::debug!(
+                "kv get ssd staged and pushed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}",
+                key,
+                resp.node_id,
+                abs_src,
+                abs_target,
+                data_len,
+                ssd_stage_len
+            );
+        }
+
         // transfer data (skip if local and src==target to avoid redundant copy)
-        if peer_id.is_none() && abs_src == abs_target {
+        if resp.source_kind == GetSourceKind::Ssd {
+            tracing::debug!(
+                "kv get ssd owner push complete: key={}, target={:#x}, len={} (skip requester transfer)",
+                key,
+                abs_target,
+                data_len
+            );
+        } else if peer_id.is_none() && abs_src == abs_target {
             tracing::debug!(
                 "kv get local no-op: src==target {:#x}, len={} (skip transfer)",
                 abs_target,
@@ -249,12 +329,17 @@ impl ClientKvApiInner {
 
         // Removed post-transfer zero-header verification per request.
 
-        // Complete the get operation and get holder_id
-        let done_resp = match self.get_done(get_id).await {
-            Ok(resp) => resp,
-            Err(err) => {
-                obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64);
-                return Err(err);
+        // Complete the get operation and get holder_id. SSD source already called
+        // get_done after pushing into the requester target.
+        let done_resp = if let Some(done_resp) = ssd_done_resp {
+            done_resp
+        } else {
+            match self.get_done(get_id).await {
+                Ok(resp) => resp,
+                Err(err) => {
+                    obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64);
+                    return Err(err);
+                }
             }
         };
         let end_handle_us = done_resp.server_process_us;
@@ -326,6 +411,7 @@ impl ClientKvApiInner {
         let get_info = RemoteGetInfo {
             get_id,
             data_len,
+            source_kind: resp.source_kind,
             src_addr: abs_src,
             target_addr: abs_target,
             node_id: resp.node_id.into(),
@@ -435,8 +521,19 @@ impl ClientKvApiInner {
 
     /// 撤销 Get 操作，释放已分配的资源
     pub async fn get_revoke(&self, get_id: u64) -> KvResult<()> {
+        self.get_revoke_inner(get_id, false).await
+    }
+
+    async fn get_revoke_ssd_source(&self, get_id: u64) -> KvResult<()> { // Revokes `get_id` with `drop_ssd_source = true`.
+        self.get_revoke_inner(get_id, true).await
+    }
+
+    async fn get_revoke_inner(&self, get_id: u64, drop_ssd_source: bool) -> KvResult<()> {
         let req = MsgPack {
-            serialize_part: GetRevokeReq { get_id },
+            serialize_part: GetRevokeReq {
+                get_id,
+                drop_ssd_source,
+            },
             raw_bytes: Vec::new(),
         };
 
diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs
index dec19f5..bd4655b 100644
--- a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs
+++ b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs
@@ -3,11 +3,17 @@ use crate::client_kv_api::msg_pack::{
     ExternalDeleteAckReq, ExternalDeleteAckResp, ExternalDeleteReq, ExternalDeleteResp,
     ExternalGetReq, ExternalGetResp, ExternalIsExistReq, ExternalIsExistResp, ExternalPutCommitReq,
     ExternalPutCommitResp, ExternalPutRevokeReq, ExternalPutRevokeResp, ExternalPutStartReq,
-    ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, SyncKvToFileReq,
-    SyncKvToFileResp, TestPutPhaseTrace,
+    ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp,
+    SsdReplicaPersistReq, SsdReplicaPersistResp, SsdStageReadReq, SsdStageReadResp,
+    SyncKvToFileReq, SyncKvToFileResp, TestPutPhaseTrace,
 };
 use crate::cluster_manager::NodeIDString;
+use crate::cluster_manager::app_logic_ext::ClusterManagerAppLogicExt;
 use crate::config::TestSpecConfig;
+use crate::kv_ssd_storage::{
+    DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT,
+    KvSsdStorage, KvSsdStorageInit, SsdLoadedChunk,
+};
 use crate::master_kv_router::msg_pack::{
     BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, DeleteClientKvMetaCacheItem,
 };
@@ -22,8 +28,8 @@ use crate::{
     client_transfer_engine::{ClientTransferEngine, ClientTransferEngineAccessTrait},
     cluster_manager::{ClusterEvent, ClusterManager, ClusterManagerAccessTrait},
     master_kv_router::msg_pack::{
-        DeleteReq, GetDoneReq, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, PutRevokeReq,
-        PutStartReq,
+        DeleteReq, GetDoneReq, GetDoneResp, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq,
+        PutRevokeReq, PutStartReq, SsdReplicaCommitReq,
     },
     metric_reporter::{MetricReporter, MetricReporterAccessTrait},
     metrics::{MetricsHandle, OperationKind, RequestStage},
@@ -37,6 +43,7 @@ use async_trait::async_trait;
 use dashmap::DashMap;
 use fluxon_framework::{LogicalModule, define_module};
 use fluxon_util::map_lock::AMapLock;
+use futures::stream::{FuturesUnordered, StreamExt};
 use limit_thirdparty::tokio;
 use parking_lot::Mutex;
 use std::sync::Weak;
@@ -451,6 +458,89 @@ async fn handle_external_put_revoke(
     }
 }
 
+async fn handle_ssd_stage_read( // RPC handler for `SsdStageReadReq`: run the SSD load+push, then `get_done`, and reply.
+    view: &ClientKvApiView,
+    msg: &MsgPack<SsdStageReadReq>,
+) -> MsgPack<SsdStageReadResp> {
+    let req = msg.serialize_part.clone(); // Owned copy of the request fields.
+    let inner = view.client_kv_api().inner();
+    let done_resp = match inner // Step 1: load from SSD and push to the requested target.
+        .load_and_push_kv_from_ssd(
+            &req.key,
+            req.put_id,
+            req.stage_addr,
+            req.stage_len,
+            &req.target_node_id,
+            req.target_addr,
+            req.len,
+        )
+        .await
+    {
+        Ok(()) => inner.get_done(req.get_id).await, // Step 2: `get_done` runs only after the push succeeded.
+        Err(err) => Err(err), // Load/push failed: `get_done` is not called.
+    };
+
+    match done_resp {
+        Ok(done_resp) => MsgPack { // Success: forward the `get_done` fields as `done_*`; the RPC-level code is OK.
+            serialize_part: SsdStageReadResp {
+                done_holder_id: done_resp.holder_id,
+                done_allocation_mode: done_resp.allocation_mode,
+                done_error_code: done_resp.error_code,
+                done_error_json: done_resp.error_json,
+                done_server_process_us: done_resp.server_process_us,
+                error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK,
+                error_json: String::new(),
+            },
+            raw_bytes: Vec::new(),
+        },
+        Err(err) => MsgPack { // Failure: the whole response is built from the error via `FromError`.
+            serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err),
+            raw_bytes: Vec::new(),
+        },
+    }
+}
+
+async fn handle_ssd_replica_persist( // RPC handler for `SsdReplicaPersistReq`: persist locally, then commit the SSD replica to the master.
+    view: &ClientKvApiView,
+    msg: &MsgPack<SsdReplicaPersistReq>,
+) -> MsgPack<SsdReplicaPersistResp> {
+    let req = msg.serialize_part.clone(); // Owned copy of the request fields.
+    let inner = view.client_kv_api().inner();
+    let persisted = match inner // Step 1: persist the value through the local SSD store.
+        .persist_local_kv_to_ssd(&req.key, req.put_id, req.target_addr, req.len)
+        .await
+    {
+        Ok(persisted) => persisted, // Bool reported by `persist_local_kv_to_ssd`.
+        Err(err) => { // Persist failed: reply with the error and skip the master commit.
+            return MsgPack {
+                serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err),
+                raw_bytes: Vec::new(),
+            };
+        }
+    };
+
+    if persisted { // Step 2: commit to the master only when the persist actually happened.
+        if let Err(err) = inner
+            .commit_ssd_replica_to_master(&req.key, req.put_id, req.len)
+            .await
+        {
+            return MsgPack { // Commit failed: reply with the error instead of a success response.
+                serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err),
+                raw_bytes: Vec::new(),
+            };
+        }
+    }
+
+    MsgPack { // Success reply; `persisted` may still be false.
+        serialize_part: SsdReplicaPersistResp {
+            persisted,
+            error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK,
+            error_json: String::new(),
+        },
+        raw_bytes: Vec::new(),
+    }
+}
+
 async fn handle_external_delete_ack(
     view: &ClientKvApiView,
     msg: &MsgPack<ExternalDeleteAckReq>,
@@ -729,6 +819,7 @@ define_module!(
 #[derive(Clone, Debug)]
 pub struct ClientKvApiNewArg {
     pub test_spec_config: TestSpecConfig,
+    pub ssd_storage: Option<KvSsdStorageInit>,
 }
 
 pub struct ClientKvApi(ClientKvApiInner);
@@ -775,6 +866,7 @@ impl std::ops::Deref for ClientKvApiViewHolder {
 pub struct ClientKvApiInner {
     view: ClientKvApiViewHolder,
     test_spec_config: TestSpecConfig,
+    ssd_storage: Option<Arc<KvSsdStorage>>,
     metrics: OnceLock<Arc<MetricsHandle>>,
 
     /// make sure each remote kv get run in order
@@ -818,6 +910,8 @@ pub struct ClientKvApiInner {
     rpc_caller_external_put_commit: RPCCaller<ExternalPutCommitReq>,
     rpc_caller_external_put_revoke: RPCCaller<ExternalPutRevokeReq>,
     rpc_caller_resolve_side_transfer_lane: RPCCaller<ResolveSideTransferLaneReq>,
+    rpc_caller_ssd_stage_read: RPCCaller<SsdStageReadReq>,
+    rpc_caller_ssd_replica_commit: RPCCaller<SsdReplicaCommitReq>,
 
     /// Default lease id recorded for inspection/convenience, but NOT auto-applied.
     /// Callers must explicitly pass `Some(lease_id)` to attach a put to a lease.
@@ -900,6 +994,222 @@ impl ClientKvApiInner {
     pub(crate) fn skip_put_end_commit_enabled(&self) -> bool {
         self.test_spec_config.skip_put_end_commit
     }
+
+    pub(crate) async fn persist_local_kv_to_ssd( // Forwards to `persist_from_addr` on the local SSD store, if one is configured.
+        &self,
+        key: &str,
+        put_id: crate::master_kv_router::put::PutIDForAKey,
+        abs_addr: u64,
+        len: u64,
+    ) -> KvResult<bool> {
+        let Some(store) = self.ssd_storage.as_ref() else { // No SSD store on this node.
+            return Ok(false); // Not an error: reports that nothing was persisted.
+        };
+        store.persist_from_addr(key, put_id, abs_addr, len).await?; // Errors from the store propagate to the caller.
+        Ok(true)
+    }
+
+    pub(crate) async fn commit_ssd_replica_to_master( // Sends `SsdReplicaCommitReq` for this node to the master node.
+        &self,
+        key: &str,
+        put_id: crate::master_kv_router::put::PutIDForAKey,
+        len: u64,
+    ) -> KvResult<()> {
+        let node_id = self.view.cluster_manager().get_self_info().id.clone(); // This node's own id.
+        let req = MsgPack {
+            serialize_part: SsdReplicaCommitReq {
+                key: key.to_string(),
+                put_id,
+                node_id,
+                len,
+            },
+            raw_bytes: Vec::new(),
+        };
+        let master_node_id = self // Resolve the master node before sending.
+            .view
+            .cluster_manager()
+            .find_or_wait_master_node()
+            .await?;
+        let resp = self
+            .rpc_caller_ssd_replica_commit
+            .call(
+                self.view.p2p_module(),
+                master_node_id.into(),
+                req,
+                Some(Duration::from_secs(60)), // NOTE(review): presumably the call timeout — confirm against `RPCCaller::call`.
+                2, // NOTE(review): presumably a retry count — confirm against `RPCCaller::call`.
+            )
+            .await
+            .map_err(KvError::from)?;
+        crate::rpcresp_kvresult_convert::try_from_code( // Convert the response's error code/json into the `KvResult`.
+            resp.serialize_part.error_code,
+            resp.serialize_part.error_json,
+        )
+    }
+
+    pub(crate) async fn load_and_push_kv_from_ssd( // Runs the SSD chunk producer and the transfer consumer concurrently.
+        &self,
+        key: &str,
+        put_id: crate::master_kv_router::put::PutIDForAKey,
+        stage_addr: u64,
+        stage_len: u64,
+        target_node_id: &NodeIDString,
+        target_addr: u64,
+        len: u64,
+    ) -> KvResult<()> {
+        let Some(store) = self.ssd_storage.as_ref() else { // Without a local SSD store this call is rejected.
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: "kv ssd storage is not enabled on this owner".to_string(),
+            }));
+        };
+
+        let self_node_id = &self.view.cluster_manager().get_self_info().id;
+        let peer_id = if target_node_id == self_node_id { // `None` when the target is this node.
+            None
+        } else {
+            Some(target_node_id.clone())
+        };
+        let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( // Bounded queue of loaded chunks; capacity is 2x the inflight limit, at least 1.
+            DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT
+                .saturating_mul(2)
+                .max(1),
+        );
+        let producer = store.load_into_addr_chunks( // Producer future; it receives the sending half of the queue.
+            key,
+            put_id,
+            stage_addr,
+            len,
+            stage_len,
+            DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES,
+            DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT,
+            chunk_tx,
+        );
+        let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); // Consumer future; it receives the other half.
+        let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); // Poll both concurrently and wait for both to finish.
+        match (producer_res, consumer_res) {
+            (Ok(()), Ok(())) => Ok(()),
+            (_, Err(err)) => Err(err), // A consumer error is returned even if the producer also failed.
+            (Err(err), _) => Err(err), // Reached only when the consumer succeeded and the producer failed.
+        }
+    }
+
+    async fn transfer_loaded_ssd_chunks( // Receives loaded chunks and transfers each to `target_addr + chunk.offset`.
+        &self,
+        peer_id: Option<NodeIDString>,
+        target_addr: u64,
+        mut chunk_rx: ::tokio::sync::mpsc::Receiver<SsdLoadedChunk>,
+    ) -> KvResult<()> {
+        let mut inflight = FuturesUnordered::new(); // Transfers that were queued and have not completed yet.
+        let mut rx_open = true; // Cleared once `chunk_rx.recv()` yields `None`.
+
+        loop {
+            tokio::select! {
+                maybe_chunk = chunk_rx.recv(), if rx_open && inflight.len() < DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT => { // Accept a chunk only while below the inflight limit.
+                    match maybe_chunk {
+                        Some(chunk) => {
+                            let chunk_target_addr = target_addr.checked_add(chunk.offset).ok_or_else(|| { // Overflow-checked destination address for this chunk.
+                                KvError::Api(ApiError::InvalidArgument {
+                                    detail: format!(
+                                        "kv ssd transfer target addr overflow: target_addr={:#x} offset={}",
+                                        target_addr,
+                                        chunk.offset
+                                    ),
+                                })
+                            })?;
+                            let transfer_engine = self.view.client_transfer_engine();
+                            let peer_id = peer_id.clone();
+                            inflight.push(async move { // Queue this chunk's transfer future.
+                                transfer_engine
+                                    .transfer_data_no_copy(
+                                        peer_id,
+                                        false,
+                                        chunk.stage_addr,
+                                        chunk_target_addr,
+                                        chunk.len,
+                                        None,
+                                    )
+                                    .await?;
+                                Ok::<(), KvError>(())
+                            });
+                        }
+                        None => {
+                            rx_open = false; // Channel closed: stop receiving, keep draining `inflight`.
+                        }
+                    }
+                }
+                Some(result) = inflight.next(), if !inflight.is_empty() => {
+                    result?; // Return on the first failed transfer.
+                }
+                else => { // No select branch is enabled.
+                    if !rx_open && inflight.is_empty() {
+                        break; // Channel closed and every queued transfer has completed.
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    pub(crate) async fn stage_kv_from_ssd_source( // Runs the SSD load+push locally when this node is the source, otherwise via `SsdStageReadReq`.
+        &self,
+        source_node_id: &NodeIDString,
+        key: &str,
+        put_id: crate::master_kv_router::put::PutIDForAKey,
+        get_id: u64,
+        stage_addr: u64,
+        target_addr: u64,
+        len: u64,
+        stage_len: u64,
+    ) -> KvResult<GetDoneResp> {
+        let self_node_id = self.view.cluster_manager().get_self_info().id.clone();
+        if source_node_id == &self_node_id { // Local source: no RPC, this node is also the transfer target.
+            self.load_and_push_kv_from_ssd(
+                key,
+                put_id,
+                stage_addr,
+                stage_len,
+                &self_node_id,
+                target_addr,
+                len,
+            )
+            .await?;
+            return self.get_done(get_id).await; // `get_done` is called here only after the load+push succeeded.
+        }
+
+        let req = MsgPack { // Remote source: build the stage-read request with this node as the target.
+            serialize_part: SsdStageReadReq {
+                key: key.to_string(),
+                put_id,
+                get_id,
+                stage_addr,
+                stage_len,
+                target_node_id: self_node_id,
+                target_addr,
+                len,
+            },
+            raw_bytes: Vec::new(),
+        };
+        let resp = self
+            .rpc_caller_ssd_stage_read
+            .call(
+                self.view.p2p_module(),
+                source_node_id.clone().into(),
+                req,
+                Some(Duration::from_secs(60)), // NOTE(review): presumably the call timeout — confirm against `RPCCaller::call`.
+                0, // NOTE(review): presumably a retry count (none here) — confirm against `RPCCaller::call`.
+            )
+            .await
+            .map_err(KvError::from)?;
+        let resp = resp.serialize_part;
+        crate::rpcresp_kvresult_convert::try_from_code(resp.error_code, resp.error_json)?; // Propagate an RPC-level error before using the `done_*` fields.
+        Ok(GetDoneResp { // Rebuild the `get_done` response from the `done_*` fields of the reply.
+            holder_id: resp.done_holder_id,
+            allocation_mode: resp.done_allocation_mode,
+            error_code: resp.done_error_code,
+            error_json: resp.done_error_json,
+            server_process_us: resp.done_server_process_us,
+        })
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -1518,10 +1828,16 @@ impl ClientKvApi {
 
     pub async fn construct(arg: ClientKvApiNewArg) -> Result<Self, KvError> {
         tracing::info!("Constructing ClientKvApi in Client mode (PreView)");
+        let ssd_storage = arg
+            .ssd_storage
+            .map(KvSsdStorage::new)
+            .transpose()?
+            .map(Arc::new);
 
         let inner = ClientKvApiInner {
             view: ClientKvApiViewHolder::new(),
             test_spec_config: arg.test_spec_config,
+            ssd_storage,
             metrics: OnceLock::new(),
             all_memholder_refcount: OnceLock::new(),
             get_remote_kv_lock: AMapLock::new(Duration::from_secs(60)),
@@ -1554,6 +1870,8 @@ impl ClientKvApi {
             rpc_caller_external_put_commit: RPCCaller::new(),
             rpc_caller_external_put_revoke: RPCCaller::new(),
             rpc_caller_resolve_side_transfer_lane: RPCCaller::new(),
+            rpc_caller_ssd_stage_read: RPCCaller::new(),
+            rpc_caller_ssd_replica_commit: RPCCaller::new(),
             default_lease_id: parking_lot::RwLock::new(None),
         };
         Ok(Self(inner))
@@ -1587,6 +1905,12 @@ impl ClientKvApi {
         inner
             .rpc_caller_resolve_side_transfer_lane
             .regist(inner.view.p2p_module());
+        inner
+            .rpc_caller_ssd_stage_read
+            .regist(inner.view.p2p_module());
+        inner
+            .rpc_caller_ssd_replica_commit
+            .regist(inner.view.p2p_module());
         crate::key_prefix::init_for_p2p_owner(inner.view.p2p_module());
         crate::kvlease::init_for_p2p_owner(inner.view.p2p_module());
         // Register master-only metric RPC callers
@@ -1686,6 +2010,31 @@ impl ClientKvApi {
             },
         );
 
+        let view_ext = inner.view.clone_view();
+        RPCHandler::<SsdStageReadReq>::new().regist(inner.view.p2p_module(), move |resp, msg| {
+            let view = view_ext.clone();
+            let view_task = view.clone();
+            let _ = view.spawn("rpc_ssd_stage_read", async move {
+                let result = handle_ssd_stage_read(&view_task, &msg).await;
+                let _ = resp.send_resp(result).await;
+            });
+            Ok(())
+        });
+
+        let view_ext = inner.view.clone_view();
+        RPCHandler::<SsdReplicaPersistReq>::new().regist(
+            inner.view.p2p_module(),
+            move |resp, msg| {
+                let view = view_ext.clone();
+                let view_task = view.clone();
+                let _ = view.spawn("rpc_ssd_replica_persist", async move {
+                    let result = handle_ssd_replica_persist(&view_task, &msg).await;
+                    let _ = resp.send_resp(result).await;
+                });
+                Ok(())
+            },
+        );
+
         let view_ext = inner.view.clone_view();
         RPCHandler::<ExternalDeleteReq>::new().regist(inner.view.p2p_module(), move |resp, msg| {
             let view = view_ext.clone();
diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs
index 55f0970..bae5437 100644
--- a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs
+++ b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs
@@ -1,8 +1,10 @@
+use crate::master_kv_router::msg_pack::GetAllocationMode;
 use crate::master_kv_router::put::PutIDForAKey;
 use crate::p2p::msg_pack::{MsgPackSerializePart, RPCReq};
 use crate::rpcresp_kvresult_convert::msg_and_error::ErrorCode;
 use bitcode::{Decode, Encode};
 
+use crate::cluster_manager::NodeIDString;
 use crate::memholder::ExternalMemHolderInfo;
 
 #[derive(Default, Debug, Clone, Encode, Decode)]
@@ -89,6 +91,76 @@ impl MsgPackSerializePart for ExternalGetResp {
     }
 }
 
+#[derive(Default, Debug, Clone, Encode, Decode)]
+pub struct SsdStageReadReq { // RPC request served by handle_ssd_stage_read
+    pub key: String, // with put_id: presumably the key-version to read — TODO confirm
+    pub put_id: PutIDForAKey,
+    pub get_id: u64, // NOTE(review): presumably the master-issued get id — confirm
+    pub stage_addr: u64, // presumably the staging buffer on the SSD owner — TODO confirm
+    pub stage_len: u64, // presumably the staging buffer size in bytes — TODO confirm
+    pub target_node_id: NodeIDString, // presumably the requester node receiving the data
+    pub target_addr: u64, // presumably the destination address on that node — TODO confirm
+    pub len: u64, // presumably the payload length in bytes — TODO confirm
+}
+
+impl MsgPackSerializePart for SsdStageReadReq {
+    fn msg_id(&self) -> u32 {
+        4020 // message id of this request; SsdStageReadResp uses 4021
+    }
+}
+
+impl RPCReq for SsdStageReadReq {
+    type Resp = SsdStageReadResp; // response type paired with this request
+}
+
+#[derive(Default, Debug, Clone, Encode, Decode)]
+pub struct SsdStageReadResp { // response type of SsdStageReadReq
+    pub done_holder_id: u64, // done_*: presumably the forwarded get-done result — TODO confirm
+    pub done_allocation_mode: GetAllocationMode,
+    pub done_error_code: ErrorCode,
+    pub done_error_json: String,
+    pub done_server_process_us: i64, // presumably microseconds, per the `_us` suffix — confirm
+    pub error_code: ErrorCode, // NOTE(review): presumably the stage-read's own status — confirm
+    pub error_json: String,
+}
+
+impl MsgPackSerializePart for SsdStageReadResp {
+    fn msg_id(&self) -> u32 {
+        4021 // message id of this response; the request uses 4020
+    }
+}
+
+#[derive(Default, Debug, Clone, Encode, Decode)]
+pub struct SsdReplicaPersistReq { // RPC request served by handle_ssd_replica_persist
+    pub key: String, // with put_id: presumably the key-version to persist — TODO confirm
+    pub put_id: PutIDForAKey,
+    pub target_addr: u64, // presumably the address of the payload to persist — TODO confirm
+    pub len: u64, // presumably the payload length in bytes — TODO confirm
+}
+
+impl MsgPackSerializePart for SsdReplicaPersistReq {
+    fn msg_id(&self) -> u32 {
+        4022 // message id of this request; SsdReplicaPersistResp uses 4023
+    }
+}
+
+impl RPCReq for SsdReplicaPersistReq {
+    type Resp = SsdReplicaPersistResp; // response type paired with this request
+}
+
+#[derive(Default, Debug, Clone, Encode, Decode)]
+pub struct SsdReplicaPersistResp { // response type of SsdReplicaPersistReq
+    pub persisted: bool, // NOTE(review): presumably true once the SSD copy exists — confirm
+    pub error_code: ErrorCode, // error_code/error_json: failure details; format not shown here
+    pub error_json: String,
+}
+
+impl MsgPackSerializePart for SsdReplicaPersistResp {
+    fn msg_id(&self) -> u32 {
+        4023 // message id of this response; the request uses 4022
+    }
+}
+
 // #[derive(Default, Debug, Clone, Encode, Decode)]
 // pub struct ExternalPutReq {
 //     pub key: String,
diff --git a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs
index 1aa6954..8c7cc78 100644
--- a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs
+++ b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs
@@ -237,10 +237,7 @@ impl ClientSegPool {
         std::path::Path::new(share_mem_path).join(SIDE_TRANSFER_PEERS_DIRNAME)
     }
 
-    pub fn side_transfer_peer_file_path(
-        share_mem_path: &str,
-        side_id: &str,
-    ) -> std::path::PathBuf {
+    pub fn side_transfer_peer_file_path(share_mem_path: &str, side_id: &str) -> std::path::PathBuf {
         Self::side_transfer_peers_dir(share_mem_path).join(format!("{side_id}.json"))
     }
 
@@ -399,17 +396,13 @@ impl ClientSegPool {
                 crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed {
                     path: String::new(),
                     len: map_len as u64,
-                    detail: "share_mem_path is empty; explicit configuration required"
-                        .to_string(),
+                    detail: "share_mem_path is empty; explicit configuration required".to_string(),
                 },
             ));
         }
 
         let base_path = &share_mem_path;
-        tracing::info!(
-            "Using share_mem_path: {} for memory-mapped file",
-            base_path
-        );
+        tracing::info!("Using share_mem_path: {} for memory-mapped file", base_path);
         std::fs::create_dir_all(base_path).map_err(|e| {
             KvError::SharedMem(
                 crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed {
diff --git a/fluxon_rs/fluxon_kv/src/config.rs b/fluxon_rs/fluxon_kv/src/config.rs
index f9c7691..02f6e3f 100644
--- a/fluxon_rs/fluxon_kv/src/config.rs
+++ b/fluxon_rs/fluxon_kv/src/config.rs
@@ -581,6 +581,8 @@ pub struct FluxonKvSpecYaml {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub large_file_paths: Option<LargeFilePathsYaml>,
     #[serde(skip_serializing_if = "Option::is_none")]
+    pub ssd_storage: Option<YamlNullable<KvSsdStorageConfigYaml>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub p2p_listen_port: Option<u16>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub redis_compat: Option<YamlNullable<RedisCompatConfigYaml>>,
@@ -592,6 +594,17 @@ pub struct FluxonKvSpecYaml {
 #[serde(transparent)]
 pub struct LargeFilePathsYaml(pub Vec<String>);
 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)] // unknown keys under `ssd_storage` fail deserialization
+pub struct KvSsdStorageConfigYaml { // raw YAML form of `fluxonkv_spec.ssd_storage`
+    pub max_bytes: u64, // checked in verify(): must be >= kv_ssd_storage::SSD_ALIGNMENT
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct KvSsdStorageConfig { // verified form stored in ClientConfig.ssd_storage
+    pub max_bytes: u64, // copied from the YAML value after the minimum-size check
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(deny_unknown_fields)]
 pub struct RedisCompatConfigYaml {
@@ -682,6 +695,34 @@ impl LargeFilePaths {
         .into_kverror())
     }
 
+    fn resolve_all_usable_root_subdirs(
+        &self,
+        relative_dir: &Path,
+        target_name: &str,
+    ) -> KvResult<Vec<PathBuf>> {
+        // Create `relative_dir` under every root; fail only when no root is usable.
+        self.require_configured_paths()?;
+        let mut usable_dirs = Vec::new();
+        let mut failures = Vec::new();
+        for root in self.paths.iter() {
+            let dir = Path::new(root).join(relative_dir);
+            if let Err(err) = fs::create_dir_all(&dir) {
+                failures.push(format!("{} ({})", dir.display(), err));
+                continue;
+            }
+            usable_dirs.push(dir);
+        }
+        if !usable_dirs.is_empty() {
+            return Ok(usable_dirs);
+        }
+        let detail = format!(
+            "large_file_paths contains no usable root for {}; tried: {}",
+            target_name,
+            failures.join(", ")
+        );
+        Err(ConfigError::InvalidClientConfig { detail }.into_kverror())
+    }
+
     pub fn kv_logs_dir(&self, cluster_name: &str) -> KvResult<PathBuf> {
         let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_kv_logs"));
         self.resolve_preferred_root_subdir(&relative_dir, "kv logs")
@@ -714,6 +755,18 @@ impl LargeFilePaths {
             "fluxon fs disk cache",
         )
     }
+
+    pub fn kv_ssd_storage_dirs(
+        &self,
+        cluster_name: &str,
+        instance_key: &str,
+    ) -> KvResult<Vec<PathBuf>> {
+        // Layout per root: `<cluster_name>_cluster_kv_ssd_storage/<sanitized instance_key>`.
+        let owner_dir = crate::kv_ssd_storage::safe_path_component(instance_key);
+        let subdir = format!("{cluster_name}_cluster_kv_ssd_storage/{}", owner_dir);
+        let relative = PathBuf::from(subdir);
+        self.resolve_all_usable_root_subdirs(&relative, "kv ssd storage")
+    }
 }
 
 /// KV client backend types supported by the system
@@ -733,8 +786,9 @@ pub struct ClientConfig {
     pub pprof_duration_seconds: Option<u64>,
     pub redis_compat_listen_addr: Option<std::net::SocketAddr>,
     pub fluxonkv_spec: FluxonKvSpec,
-    pub share_mem_path: String, // Mandatory shared bundle path
+    pub share_mem_path: String,           // Mandatory shared bundle path
     pub large_file_paths: LargeFilePaths, // Mandatory large-file roots for logs and caches
+    pub ssd_storage: Option<KvSsdStorageConfig>,
     pub test_spec_config: TestSpecConfig,
 }
 
@@ -1028,6 +1082,13 @@ impl ClientConfigYaml {
                 }
                 .into_kverror());
             }
+            if self.fluxonkv_spec.ssd_storage.is_some() {
+                return Err(ConfigError::InvalidClientConfig {
+                    detail: "fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode"
+                        .to_string(),
+                }
+                .into_kverror());
+            }
         }
 
         // Preserve historical behavior for configs that omit `protocol`, but allow
@@ -1170,13 +1231,15 @@ impl ClientConfigYaml {
         } else {
             let Some(large_file_paths_yaml) = self.fluxonkv_spec.large_file_paths.as_ref() else {
                 return Err(ConfigError::InvalidClientConfig {
-                    detail: "fluxonkv_spec.large_file_paths is required for owner mode"
-                        .to_string(),
+                    detail: "fluxonkv_spec.large_file_paths is required for owner mode".to_string(),
                 }
                 .into_kverror());
             };
             LargeFilePaths {
-                paths: verify_non_empty_root_path_list(&large_file_paths_yaml.0, "large_file_paths")?,
+                paths: verify_non_empty_root_path_list(
+                    &large_file_paths_yaml.0,
+                    "large_file_paths",
+                )?,
             }
         };
 
@@ -1204,6 +1267,28 @@ impl ClientConfigYaml {
             }
         };
 
+        let ssd_storage = if is_external {
+            None
+        } else {
+            match std::mem::take(&mut self.fluxonkv_spec.ssd_storage) {
+                None | Some(YamlNullable::Null) => None,
+                Some(YamlNullable::Value(raw)) => {
+                    if raw.max_bytes < crate::kv_ssd_storage::SSD_ALIGNMENT as u64 {
+                        return Err(ConfigError::InvalidClientConfig {
+                            detail: format!(
+                                "fluxonkv_spec.ssd_storage.max_bytes must be >= {}",
+                                crate::kv_ssd_storage::SSD_ALIGNMENT
+                            ),
+                        }
+                        .into_kverror());
+                    }
+                    Some(KvSsdStorageConfig {
+                        max_bytes: raw.max_bytes,
+                    })
+                }
+            }
+        };
+
         Ok(ClientConfig {
             cluster_name: fluxonkv_spec.cluster_name.clone(),
             etcd_addresses_raw,
@@ -1215,6 +1300,7 @@ impl ClientConfigYaml {
             fluxonkv_spec,
             share_mem_path,
             large_file_paths,
+            ssd_storage,
             test_spec_config,
         })
     }
@@ -1647,7 +1733,80 @@ fluxonkv_spec:
         .unwrap();
         let err = cfg.verify().unwrap_err();
         let text = format!("{err}");
-        assert!(text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode"));
+        assert!(
+            text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode")
+        );
+    }
+
+    #[test]
+    fn client_config_owner_accepts_ssd_storage() {
+        let yaml = ClientConfigYaml::from_str(
+            r#"
+instance_key: test_owner
+contribute_to_cluster_pool_size:
+  dram: 16777216
+  vram: {}
+fluxonkv_spec:
+  etcd_addresses: ["127.0.0.1:2379"]
+  cluster_name: test_cluster
+  share_mem_path: /tmp/test_owner
+  large_file_paths: [/tmp/test_owner_large]
+  ssd_storage:
+    max_bytes: 1048576
+  sub_cluster: rack-a
+"#,
+        )
+        .unwrap();
+        // An owner config with a valid ssd_storage section must pass verify() and keep
+        // the configured max_bytes in ClientConfig.ssd_storage.
+        let client_config = yaml.verify().unwrap();
+        let max_bytes = client_config.ssd_storage.as_ref().map(|ssd| ssd.max_bytes);
+        assert_eq!(max_bytes, Some(1048576));
+    }
+
+    #[test]
+    fn client_config_owner_rejects_too_small_ssd_storage() {
+        let yaml = ClientConfigYaml::from_str(
+            r#"
+instance_key: test_owner
+contribute_to_cluster_pool_size:
+  dram: 16777216
+  vram: {}
+fluxonkv_spec:
+  etcd_addresses: ["127.0.0.1:2379"]
+  cluster_name: test_cluster
+  share_mem_path: /tmp/test_owner
+  large_file_paths: [/tmp/test_owner_large]
+  ssd_storage:
+    max_bytes: 1
+  sub_cluster: rack-a
+"#,
+        )
+        .unwrap();
+        // max_bytes: 1 is below the 512-byte minimum, so verify() must reject it.
+        let rendered = yaml.verify().unwrap_err().to_string();
+        assert!(
+            rendered.contains("fluxonkv_spec.ssd_storage.max_bytes must be >= 512"),
+            "{rendered}"
+        );
+    }
+
+    #[test]
+    fn client_config_zero_contribution_rejects_ssd_storage() {
+        let yaml = ClientConfigYaml::from_str(
+            r#"
+instance_key: test_external
+fluxonkv_spec:
+  cluster_name: test_cluster
+  share_mem_path: /tmp/test_external
+  ssd_storage:
+    max_bytes: 1048576
+"#,
+        )
+        .unwrap();
+        // A zero-contribution config may not declare ssd_storage at all.
+        let msg = yaml.verify().unwrap_err().to_string();
+        assert!(msg.contains("fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode"));
     }
 
     #[test]
@@ -1667,7 +1826,9 @@ fluxonkv_spec:
         let logs_dir = large_file_paths.kv_logs_dir("test_cluster").unwrap();
         assert_eq!(
             logs_dir,
-            first_root.join("child").join("test_cluster_cluster_kv_logs")
+            first_root
+                .join("child")
+                .join("test_cluster_cluster_kv_logs")
         );
         assert!(logs_dir.exists());
 
@@ -1683,6 +1844,32 @@ fluxonkv_spec:
         assert!(third_party_logs_dir.exists());
     }
 
+    #[test]
+    fn large_file_paths_uses_all_usable_roots_for_kv_ssd_storage() {
+        let base = new_test_dir("fluxon_large_paths_uses_all_usable_roots_for_kv_ssd_storage");
+        let roots = [base.join("first_root"), base.join("second_root")];
+
+        // Both roots are configured, so both must yield an SSD storage directory.
+        let paths = roots
+            .iter()
+            .map(|root| root.to_string_lossy().into_owned())
+            .collect::<Vec<String>>();
+        let large_file_paths = LargeFilePaths { paths };
+
+        let dirs = large_file_paths
+            .kv_ssd_storage_dirs("test_cluster", "owner/a:b")
+            .unwrap();
+        // "owner/a:b" is sanitized to "owner_a_b"; results keep the order of the roots.
+        let expected = roots
+            .iter()
+            .map(|root| root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"))
+            .collect::<Vec<_>>();
+        assert_eq!(dirs, expected);
+        for dir in &dirs {
+            assert!(dir.exists());
+        }
+    }
+
     #[test]
     fn client_test_spec_config_accepts_explicit_rdma_device_names() {
         let cfg = ClientConfigYaml::from_str(
diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs
index da701cd..630a8ea 100644
--- a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs
+++ b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs
@@ -89,6 +89,7 @@ fn new_client_config(
         large_file_paths: LargeFilePaths {
             paths: vec![format!("{}_large", shm_path)],
         },
+        ssd_storage: None,
         test_spec_config: TestSpecConfig::default(),
     }
 }
@@ -130,6 +131,7 @@ fn new_zero_contribution_client_config(
         },
         share_mem_path: shm_path.to_string(),
         large_file_paths: LargeFilePaths { paths: Vec::new() },
+        ssd_storage: None,
         test_spec_config: TestSpecConfig::default(),
     }
 }
diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs
index 9cb291f..b7715dd 100644
--- a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs
+++ b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs
@@ -865,8 +865,7 @@ impl ExternalInner {
             return Ok(false);
         }
 
-        self.finish_owner_recover(&share_mem_path, payload)
-            .await?;
+        self.finish_owner_recover(&share_mem_path, payload).await?;
         Ok(true)
     }
 
diff --git a/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs
new file mode 100644
index 0000000..26d711e
--- /dev/null
+++ b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs
@@ -0,0 +1,2159 @@
+use crate::master_kv_router::put::PutIDForAKey;
+use crate::rpcresp_kvresult_convert::msg_and_error::{ApiError, KvError, KvResult};
+use ::tokio::{
+    sync::{Notify, mpsc as tokio_mpsc, oneshot},
+    task,
+};
+use futures::stream::{FuturesUnordered, StreamExt};
+use io_uring::{IoUring, opcode, types::Fd};
+use parking_lot::Mutex;
+use std::collections::{HashMap, HashSet, VecDeque};
+use std::fs::{self, OpenOptions};
+use std::io;
+use std::os::fd::{AsRawFd, RawFd};
+use std::os::unix::fs::MetadataExt;
+use std::os::unix::fs::OpenOptionsExt;
+use std::path::{Path, PathBuf};
+use std::ptr::NonNull;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::thread::JoinHandle;
+
+pub(crate) const SSD_ALIGNMENT: usize = 512; // value lengths are rounded up to this; also the minimum max_bytes
+const DEFAULT_SHARDS_PER_OWNER: usize = 4; // NOTE(review): unused in the visible code; presumably read by choose_shard_count
+const DEFAULT_URING_THREADS: usize = 16; // UringConfig.threads for each device's UringIoEngine
+const DEFAULT_URING_IO_DEPTH: usize = 128; // UringConfig.io_depth for each device's UringIoEngine
+const DEFAULT_URING_READ_WEIGHT: usize = 2; // NOTE(review): unused in the visible code — confirm its consumer
+const DEFAULT_WRITE_QUEUE_DEPTH: usize = 8; // capacity of each device's bounded WriteCommand channel
+const DEFAULT_READ_QUEUE_DEPTH: usize = 16; // capacity of each device's bounded ReadCommand channel
+const DEFAULT_WRITE_INFLIGHT: usize = 2; // passed to ssd_writer_loop; presumably its in-flight write limit
+const DEFAULT_READ_INFLIGHT: usize = 16; // passed to ssd_reader_loop; presumably its in-flight read limit
+pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES: u64 = 4 * 1024 * 1024; // 4 MiB; consumer not shown here
+pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT: usize = 4; // consumer not shown here — TODO confirm meaning
+
+#[derive(Clone, Debug)]
+pub struct KvSsdStorageInit { // constructor arguments for KvSsdStorage::new
+    pub root_dirs: Vec<PathBuf>, // candidate roots; must be non-empty, passed to deduplicate_device_roots
+    pub max_bytes: u64, // new() rejects values below SSD_ALIGNMENT; sizes the shards
+}
+
+#[derive(Debug)]
+pub struct KvSsdStorage { // owner-local SSD tier: a ring index shared by per-device workers
+    root_dirs: Vec<PathBuf>, // roots that remain after deduplicate_device_roots
+    devices: Vec<SsdDeviceWorker>, // one worker per remaining root
+    shard_to_device: Vec<usize>, // index = shard id, value = index into `devices`
+    next_write_device: AtomicUsize, // round-robin counter used by next_write_tx
+    inner: Arc<Mutex<KvSsdStorageInner>>, // ring index shared with the writer/reader tasks
+    space_notify: Arc<Notify>, // notified when a read pin is dropped; shared with writer tasks
+}
+
+#[derive(Debug)]
+struct SsdDeviceWorker { // per-device state created by KvSsdStorage::new
+    device_id: u64, // copied from SsdDeviceRoot.device_id
+    root_dir: PathBuf, // copied from SsdDeviceRoot.root_dir
+    shard_ids: Vec<usize>, // shards whose files were opened for this device
+    _files: Vec<std::fs::File>, // keeps the shard files open; their raw fds were given to `_io`
+    _io: Arc<UringIoEngine>, // engine also shared with this device's writer and reader tasks
+    write_tx: tokio_mpsc::Sender<WriteCommand>, // bounded queue into this device's ssd_writer_loop
+    read_tx: tokio_mpsc::Sender<ReadCommand>, // bounded queue into this device's ssd_reader_loop
+}
+
+#[derive(Clone, Debug)]
+struct SsdDeviceRoot { // one element of the deduplicate_device_roots result
+    device_id: u64, // presumably the filesystem device number of root_dir — TODO confirm
+    root_dir: PathBuf, // directory that holds this device's shard files
+}
+
+struct OpenedSsdShard { // one element of the open_cache_files result
+    shard_id: usize, // index into shard_to_device and into the ring's shard list
+    device_idx: usize, // index of the owning root in the deduplicated device list
+    file: std::fs::File, // open shard file; moved into SsdDeviceWorker._files
+}
+
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct SsdLoadedChunk { // NOTE(review): producers/consumers are outside the visible code
+    pub offset: u64, // presumably the chunk's byte offset within the value — TODO confirm
+    pub stage_addr: u64, // presumably the staging address holding the chunk — TODO confirm
+    pub len: u64, // presumably the chunk length in bytes — TODO confirm
+}
+
+#[derive(Debug)]
+struct KvSsdStorageInner { // state guarded by the KvSsdStorage.inner mutex
+    ring: SsdRingBuffer, // index of all shards, entries and read pins
+}
+
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
+struct KvSsdKey { // hash-map key of the SSD index: one entry per (key, put_id) pair
+    key: String,
+    put_id: PutIDForAKey,
+}
+
+#[derive(Clone, Debug)]
+struct SsdIndexEntry { // location of one value inside a shard ring
+    shard_id: usize, // shard that holds the value
+    begin: u64, // logical start offset in that shard; compared against the shard's tail
+    len: u64, // length requested by the caller
+    aligned_len: u64, // `len` rounded up to SSD_ALIGNMENT; the space actually reserved
+    file_offset: u64, // `begin % capacity`: physical offset inside the shard file
+}
+
+#[derive(Clone, Debug)]
+struct SsdReadPinInfo { // bookkeeping for readers currently pinning one key
+    entry: SsdIndexEntry, // snapshot taken when the first pin was created
+    count: usize, // number of outstanding pins; the record is removed with the last one
+}
+
+#[derive(Clone, Debug)]
+enum SsdEntryState { // two-phase state of an index entry
+    Writing(SsdIndexEntry), // space reserved by prepare_write_on_shards, not yet committed
+    Committed(SsdIndexEntry), // commit() succeeded; only this state is readable via pin_read
+}
+
+impl SsdEntryState {
+    /// Returns the index entry carried by either state.
+    fn entry(&self) -> &SsdIndexEntry {
+        let (Self::Writing(entry) | Self::Committed(entry)) = self;
+        entry
+    }
+}
+
+#[derive(Debug)]
+struct SsdShardRing { // allocation state of one shard
+    capacity: u64, // bytes available in this shard; logical offsets wrap modulo this
+    head: u64, // logical offset where the next allocation starts
+    tail: u64, // entries whose `begin` is below this are no longer valid
+    order: VecDeque<KvSsdKey>, // keys in allocation order; advance_tail pops from the front
+}
+
+#[derive(Debug)]
+struct SsdRingBuffer { // index over all shard rings of one owner
+    shards: Vec<SsdShardRing>, // indexed by shard id
+    next_shard: usize, // shard id where the next write starts its round-robin scan
+    entries: HashMap<KvSsdKey, SsdEntryState>, // every indexed key, Writing or Committed
+    read_pins: HashMap<KvSsdKey, SsdReadPinInfo>, // keys pinned by in-flight reads
+}
+
+#[derive(Debug)]
+enum SsdPreparedWrite { // outcome of prepare_write_on_shards
+    Ready(SsdIndexEntry), // space reserved; the entry is now indexed as Writing
+    Existing, // the key is already indexed (Writing or Committed); nothing was reserved
+    BlockedByBusyIo, // no allowed shard could make room without overrunning busy entries
+}
+
+#[derive(Debug)]
+enum SsdAllocation { // outcome of allocate_contiguous for a single shard
+    Ready { begin: u64, file_offset: u64 }, // logical start and `begin % capacity`
+    BlockedByBusyIo, // the required tail move would overrun an in-flight write or pinned read
+    TooLarge, // requested size exceeds this shard's capacity
+}
+
+impl SsdRingBuffer {
+    fn new(shard_capacities: Vec<u64>) -> Self {
+        assert!(!shard_capacities.is_empty());
+        // Every shard starts empty: head == tail == 0 and nothing queued.
+        let empty_shard = |capacity| SsdShardRing {
+            capacity,
+            head: 0,
+            tail: 0,
+            order: VecDeque::new(),
+        };
+        let shards = shard_capacities.into_iter().map(empty_shard).collect();
+        Self {
+            shards,
+            next_shard: 0,
+            entries: HashMap::new(),
+            read_pins: HashMap::new(),
+        }
+    }
+
+    #[cfg(test)]
+    fn get(&self, key: &KvSsdKey) -> Option<SsdIndexEntry> {
+        // Test-only lookup without pinning: an entry is visible only when it is
+        // committed and the shard tail has not moved past it.
+        let Some(SsdEntryState::Committed(entry)) = self.entries.get(key) else {
+            return None;
+        };
+        self.is_offset_valid(entry).then(|| entry.clone())
+    }
+
+    fn pin_read(&mut self, key: &KvSsdKey) -> Option<SsdIndexEntry> {
+        // Only committed entries that the shard tail has not passed can be pinned.
+        let entry = match self.entries.get(key)? {
+            SsdEntryState::Committed(entry) if self.is_offset_valid(entry) => entry.clone(),
+            SsdEntryState::Committed(_) | SsdEntryState::Writing(_) => return None,
+        };
+        // The first pin stores a snapshot of the entry; later pins only bump the count.
+        if let Some(pin) = self.read_pins.get_mut(key) {
+            pin.count += 1;
+        } else {
+            let pin = SsdReadPinInfo { entry: entry.clone(), count: 1 };
+            self.read_pins.insert(key.clone(), pin);
+        }
+        Some(entry)
+    }
+
+    fn unpin_read(&mut self, key: &KvSsdKey) {
+        // Releases one pin on `key`; the pin record is removed together with the last pin.
+        // A missing record indicates a bookkeeping bug and trips a debug assertion only.
+        match self.read_pins.get_mut(key) {
+            None => debug_assert!(false, "missing kv ssd read pin for key={key:?}"),
+            Some(pin) if pin.count > 1 => pin.count -= 1,
+            Some(_) => drop(self.read_pins.remove(key)),
+        }
+    }
+
+    #[cfg(test)]
+    fn prepare_write(&mut self, key: KvSsdKey, len: u64) -> KvResult<SsdPreparedWrite> {
+        let every_shard: Vec<usize> = (0..self.shards.len()).collect(); // test helper: allow all
+        self.prepare_write_on_shards(key, len, &every_shard)
+    }
+
+    /// Reserves ring space for `key` on one of `allowed_shards` and indexes it as `Writing`.
+    /// Returns `Existing` if the key is already indexed, `BlockedByBusyIo` if no shard has room.
+    fn prepare_write_on_shards(
+        &mut self,
+        key: KvSsdKey,
+        len: u64,
+        allowed_shards: &[usize],
+    ) -> KvResult<SsdPreparedWrite> {
+        if allowed_shards.is_empty() {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: "kv ssd device has no shards".to_string(),
+            }));
+        }
+        if self.entries.contains_key(&key) {
+            return Ok(SsdPreparedWrite::Existing);
+        }
+        let aligned_len = align_up_u64(len, SSD_ALIGNMENT as u64)?;
+        let largest_allowed = allowed_shards
+            .iter()
+            .filter_map(|&idx| self.shards.get(idx))
+            .map(|shard| shard.capacity)
+            .max();
+        let Some(max_capacity) = largest_allowed else {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd device has invalid shard set: {allowed_shards:?}"),
+            }));
+        };
+        if aligned_len > max_capacity {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd value len={} aligned_len={} exceeds shard capacity={}",
+                    len, aligned_len, max_capacity
+                ),
+            }));
+        }
+
+        let shard_count = self.shards.len();
+        for offset in 0..shard_count {
+            let shard_id = (self.next_shard + offset) % shard_count;
+            if !allowed_shards.contains(&shard_id) {
+                continue;
+            }
+            let (begin, file_offset) = match self.allocate_contiguous(shard_id, aligned_len) {
+                SsdAllocation::Ready { begin, file_offset } => (begin, file_offset),
+                // `TooLarge` is reachable when capacities differ: only the largest allowed
+                // shard was checked above. Skip this shard instead of panicking.
+                SsdAllocation::BlockedByBusyIo | SsdAllocation::TooLarge => continue,
+            };
+            self.next_shard = (shard_id + 1) % shard_count;
+            let entry = SsdIndexEntry {
+                shard_id,
+                begin,
+                len,
+                aligned_len,
+                file_offset,
+            };
+            let state = SsdEntryState::Writing(entry.clone());
+            self.entries.insert(key.clone(), state);
+            self.shards[shard_id].order.push_back(key);
+            return Ok(SsdPreparedWrite::Ready(entry));
+        }
+
+        Ok(SsdPreparedWrite::BlockedByBusyIo)
+    }
+
+    /// Reserves `size` contiguous bytes in shard `shard_id`, moving the tail if required.
+    fn allocate_contiguous(&mut self, shard_id: usize, size: u64) -> SsdAllocation {
+        let (capacity, old_head) = (self.shards[shard_id].capacity, self.shards[shard_id].head);
+        if size > capacity {
+            return SsdAllocation::TooLarge;
+        }
+        // A value is never split across the end of the shard file: if it does not fit in
+        // the bytes left before the wrap point, skip them and start at the wrap point.
+        let bytes_to_end = capacity - old_head % capacity;
+        let begin = if size > bytes_to_end {
+            old_head + bytes_to_end
+        } else {
+            old_head
+        };
+        let new_head = begin + size;
+        // Anything more than one full capacity behind the new head has to be given up.
+        let new_tail = new_head.saturating_sub(capacity);
+        if self.has_busy_entries_before(shard_id, new_tail) {
+            return SsdAllocation::BlockedByBusyIo;
+        }
+
+        self.shards[shard_id].head = new_head;
+        self.advance_tail(shard_id, new_tail);
+        let file_offset = begin % capacity;
+        SsdAllocation::Ready { begin, file_offset }
+    }
+
+    /// Advances the shard tail to `new_tail`, evicting entries of this shard it overruns.
+    fn advance_tail(&mut self, shard_id: usize, new_tail: u64) {
+        if new_tail <= self.shards[shard_id].tail {
+            return;
+        }
+        debug_assert!(!self.has_busy_entries_before(shard_id, new_tail));
+        self.shards[shard_id].tail = new_tail;
+        while let Some(key) = self.shards[shard_id].order.front() {
+            let evict = match self.entries.get(key).map(SsdEntryState::entry) {
+                // Key re-written on another shard: drop the stale key, keep that shard's entry.
+                Some(entry) if entry.shard_id != shard_id => false,
+                Some(entry) if entry.begin >= new_tail => break,
+                found => found.is_some(),
+            };
+            let key = self.shards[shard_id].order.pop_front();
+            if let (true, Some(key)) = (evict, key) {
+                self.entries.remove(&key);
+            }
+        }
+    }
+
+    /// Resolves a pending write; returns whether `key` is committed afterwards.
+    fn commit(&mut self, key: &KvSsdKey, success: bool) -> bool {
+        let Some(SsdEntryState::Writing(entry)) = self.entries.get(key).cloned() else {
+            // Not pending: unknown keys report false, already-committed keys report true.
+            return self.entries.contains_key(key);
+        };
+        if success && self.is_offset_valid(&entry) {
+            self.entries
+                .insert(key.clone(), SsdEntryState::Committed(entry));
+            return true;
+        }
+        // Failed write: also unqueue the key so no stale `order` slot outlives the entry.
+        self.shards[entry.shard_id].order.retain(|queued| queued != key);
+        self.entries.remove(key);
+        false
+    }
+
+    fn remove(&mut self, key: &KvSsdKey) { // NOTE(review): a removed Writing entry is no longer seen by has_busy_entries_before — confirm callers
+        self.entries.remove(key); // drops the index entry only; the key stays queued in its shard's `order`
+    }
+
+    fn is_offset_valid(&self, entry: &SsdIndexEntry) -> bool {
+        // Valid means: the shard exists and its tail has not moved past the entry's start.
+        let tail = self.shards.get(entry.shard_id).map(|shard| shard.tail);
+        matches!(tail, Some(tail) if entry.begin >= tail)
+    }
+
+    /// True if moving this shard's tail to `new_tail` would overrun an in-flight write or
+    /// a pinned read.
+    fn has_busy_entries_before(&self, shard_id: usize, new_tail: u64) -> bool {
+        if new_tail <= self.shards[shard_id].tail {
+            return false;
+        }
+        let overrun = |entry: &SsdIndexEntry| entry.shard_id == shard_id && entry.begin < new_tail;
+        let mut pending_writes = self.entries.values().filter_map(|state| match state {
+            SsdEntryState::Writing(entry) => Some(entry),
+            SsdEntryState::Committed(_) => None,
+        });
+        let mut pinned_reads = self.read_pins.values().map(|pin| &pin.entry);
+        // Committed entries without pins are not busy; advance_tail simply evicts them.
+        pending_writes.any(overrun) || pinned_reads.any(overrun)
+    }
+}
+
+struct SsdReadPin { // guard object: dropping it releases one read pin on `key`
+    inner: Arc<Mutex<KvSsdStorageInner>>, // ring index that holds the pin record
+    space_notify: Arc<Notify>, // notified after the pin is released
+    key: KvSsdKey, // key whose pin this guard represents
+}
+
+impl Drop for SsdReadPin {
+    fn drop(&mut self) {
+        self.inner.lock().ring.unpin_read(&self.key); // lock guard is released at the end of this statement
+        self.space_notify.notify_one(); // NOTE(review): all device writer tasks share this Notify; notify_one wakes at most one — confirm
+    }
+}
+
+struct WriteCommand { // message type of the per-device write channel (SsdDeviceWorker.write_tx)
+    key: KvSsdKey, // key-version to persist
+    entry_len: u64, // presumably the unaligned value length — TODO confirm against the writer loop
+    data: AlignedBuffer, // payload buffer to write
+    done_tx: oneshot::Sender<KvResult<()>>, // one-shot channel carrying the write result
+}
+
+struct ReadCommand { // message type of the per-device read channel (SsdDeviceWorker.read_tx)
+    key: KvSsdKey, // key-version to read
+    entry: SsdIndexEntry, // index entry describing where the value lives
+    file_offset: u64, // NOTE(review): presumably the read's start offset in the shard file — confirm
+    target: ReadTarget, // where the bytes are read into
+    _read_pin: Option<SsdReadPin>, // when Some, the pin is released when this command is dropped
+    done_tx: oneshot::Sender<KvResult<ReadOutput>>, // one-shot channel carrying the read result
+}
+
+struct WriteTask { // NOTE(review): presumably a WriteCommand after ring space was reserved — confirm
+    key: KvSsdKey,
+    entry: SsdIndexEntry, // reserved location for the write
+    data: AlignedBuffer,
+    done_tx: oneshot::Sender<KvResult<()>>,
+}
+
+struct ReadTask { // same fields as ReadCommand; presumably the reader loop's in-flight form — confirm
+    key: KvSsdKey,
+    entry: SsdIndexEntry,
+    file_offset: u64,
+    target: ReadTarget,
+    _read_pin: Option<SsdReadPin>, // when Some, keeps the entry pinned while the task exists
+    done_tx: oneshot::Sender<KvResult<ReadOutput>>,
+}
+
+struct WriteCompletion { // NOTE(review): presumably produced when a write IO finishes — confirm
+    key: KvSsdKey,
+    success: bool, // presumably the flag handed to SsdRingBuffer::commit — TODO confirm
+    result: KvResult<()>, // result to report to the caller
+    done_tx: oneshot::Sender<KvResult<()>>, // channel on which `result` is reported
+}
+
+struct ReadCompletion { // NOTE(review): presumably produced when a read IO finishes — confirm
+    key: KvSsdKey,
+    entry: SsdIndexEntry,
+    result: KvResult<ReadOutput>, // result to report to the caller
+    _read_pin: Option<SsdReadPin>, // when Some, the pin is released when this value is dropped
+    done_tx: oneshot::Sender<KvResult<ReadOutput>>, // channel on which `result` is reported
+}
+
+enum ReadTarget { // destination of a read
+    Scratch(AlignedBuffer), // read into an owned aligned buffer
+    Direct { target_addr: u64, len: usize }, // read into caller memory at a raw address; validity not checked here
+}
+
+enum ReadOutput { // result of a read; variants mirror ReadTarget
+    Scratch(AlignedBuffer), // the buffer that now holds the data
+    Direct, // no buffer returned; presumably the data is at the Direct target address — confirm
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum SsdReadPath { // NOTE(review): selector is used outside the visible code — confirm semantics
+    Scratch, // same name as the ReadTarget::Scratch variant
+    Direct, // same name as the ReadTarget::Direct variant
+}
+
+pub fn safe_path_component(raw: &str) -> String {
+    // Maps `raw` to a single directory-name component.
+    // Keep [A-Za-z0-9._-]; every other character (including '/') becomes '_'.
+    let keep = |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.');
+    let out: String = raw.chars().map(|ch| if keep(ch) { ch } else { '_' }).collect();
+    match out.as_str() {
+        // An empty input gets a fixed placeholder name.
+        "" => "unnamed".to_string(),
+        // "." and ".." survive the filter but name the current/parent directory,
+        // so they would escape the intended subdirectory; neutralize them.
+        "." | ".." => "_".repeat(out.len()),
+        // Everything else is already a plain, single-component name.
+        _ => out,
+    }
+}
+
+impl KvSsdStorage {
+    pub fn new(init: KvSsdStorageInit) -> KvResult<Self> { // validates `init`, opens shard files, starts per-device IO tasks
+        if init.max_bytes < SSD_ALIGNMENT as u64 {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd storage max_bytes must be >= {}", SSD_ALIGNMENT),
+            }));
+        }
+        if init.root_dirs.is_empty() {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: "kv ssd storage root_dirs must contain at least one path".to_string(),
+            }));
+        }
+
+        let device_roots = deduplicate_device_roots(&init.root_dirs)?; // presumably one root per device — helper not shown
+        let effective_root_dirs = device_roots
+            .iter()
+            .map(|root| root.root_dir.clone())
+            .collect::<Vec<_>>();
+        let shard_count = choose_shard_count(init.max_bytes, device_roots.len());
+        let shard_capacity = aligned_shard_capacity(init.max_bytes, shard_count)?;
+        let opened_shards = open_cache_files(&device_roots, shard_count, shard_capacity)?;
+        let inner = Arc::new(Mutex::new(KvSsdStorageInner {
+            ring: SsdRingBuffer::new(vec![shard_capacity; shard_count]), // every shard gets the same capacity
+        }));
+        let space_notify = Arc::new(Notify::new());
+        let mut shard_to_device = vec![0usize; shard_count]; // shard id -> index into `devices`
+        let mut device_shards = device_roots
+            .iter()
+            .map(|root| (root.clone(), Vec::<(usize, std::fs::File)>::new()))
+            .collect::<Vec<_>>();
+        for opened in opened_shards { // group the opened shard files by owning device
+            shard_to_device[opened.shard_id] = opened.device_idx;
+            device_shards[opened.device_idx]
+                .1
+                .push((opened.shard_id, opened.file));
+        }
+
+        let mut devices = Vec::with_capacity(device_shards.len());
+        for (device_root, shard_files) in device_shards { // one IO engine, two queues, two tasks per device
+            let shard_ids = shard_files
+                .iter()
+                .map(|(shard_id, _)| *shard_id)
+                .collect::<Vec<_>>();
+            let fds = shard_files // raw fds stay open because the Files are kept in `_files` below
+                .iter()
+                .map(|(shard_id, file)| (*shard_id, file.as_raw_fd()))
+                .collect::<Vec<_>>();
+            let io = Arc::new(UringIoEngine::new_multi(
+                fds,
+                UringConfig {
+                    threads: DEFAULT_URING_THREADS,
+                    io_depth: DEFAULT_URING_IO_DEPTH,
+                },
+            )?);
+            let (write_tx, write_rx) = tokio_mpsc::channel(DEFAULT_WRITE_QUEUE_DEPTH); // bounded
+            let (read_tx, read_rx) = tokio_mpsc::channel(DEFAULT_READ_QUEUE_DEPTH); // bounded
+
+            task::spawn(ssd_writer_loop( // NOTE: tokio task::spawn needs a running runtime; call new() inside one
+                Arc::clone(&inner),
+                write_rx,
+                Arc::clone(&io),
+                Arc::clone(&space_notify),
+                DEFAULT_WRITE_INFLIGHT,
+                shard_ids.clone(),
+            ));
+            task::spawn(ssd_reader_loop( // JoinHandle is dropped, so the task runs detached
+                Arc::clone(&inner),
+                read_rx,
+                Arc::clone(&io),
+                DEFAULT_READ_INFLIGHT,
+            ));
+
+            devices.push(SsdDeviceWorker {
+                device_id: device_root.device_id,
+                root_dir: device_root.root_dir,
+                shard_ids,
+                _files: shard_files
+                    .into_iter()
+                    .map(|(_, file)| file)
+                    .collect::<Vec<_>>(),
+                _io: io,
+                write_tx,
+                read_tx,
+            });
+        }
+
+        Ok(Self {
+            root_dirs: effective_root_dirs, // deduplicated roots, not the raw init.root_dirs
+            devices,
+            shard_to_device,
+            next_write_device: AtomicUsize::new(0),
+            inner,
+            space_notify,
+        })
+    }
+
+    /// Returns the root directories recorded for this storage at construction time.
+    pub fn root_dirs(&self) -> &[PathBuf] {
+        &self.root_dirs[..]
+    }
+
+    /// Hands out the write queue of the next device worker in round-robin order.
+    ///
+    /// # Errors
+    /// Returns `InvalidArgument` when the storage has no device worker.
+    fn next_write_tx(&self) -> KvResult<tokio_mpsc::Sender<WriteCommand>> {
+        let device_count = self.devices.len();
+        if device_count == 0 {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: "kv ssd storage has no active device".to_string(),
+            }));
+        }
+        // The counter only spreads load, so relaxed ordering is sufficient.
+        let ticket = self.next_write_device.fetch_add(1, Ordering::Relaxed);
+        let worker = &self.devices[ticket % device_count];
+        Ok(worker.write_tx.clone())
+    }
+
+    /// Resolves the read queue of the device worker that owns `shard_id`.
+    ///
+    /// # Errors
+    /// Returns `InvalidArgument` when the shard id is unknown, when the mapped
+    /// device index is out of range, or when the mapped device does not list the
+    /// shard among its own shard ids.
+    fn read_tx_for_shard(&self, shard_id: usize) -> KvResult<tokio_mpsc::Sender<ReadCommand>> {
+        let invalid = |detail: String| KvError::Api(ApiError::InvalidArgument { detail });
+
+        let device_idx = self
+            .shard_to_device
+            .get(shard_id)
+            .copied()
+            .ok_or_else(|| invalid(format!("kv ssd invalid shard id for read: {}", shard_id)))?;
+        let device = self.devices.get(device_idx).ok_or_else(|| {
+            invalid(format!(
+                "kv ssd invalid device index for read: shard_id={} device_idx={}",
+                shard_id, device_idx
+            ))
+        })?;
+        // Cross-check the routing table against the worker's own shard list.
+        let owns_shard = device.shard_ids.iter().any(|owned| *owned == shard_id);
+        if !owns_shard {
+            return Err(invalid(format!(
+                "kv ssd shard/device route mismatch: shard_id={} device_idx={} device_id={} root_dir={}",
+                shard_id,
+                device_idx,
+                device.device_id,
+                device.root_dir.display()
+            )));
+        }
+        Ok(device.read_tx.clone())
+    }
+
+    /// Copies `len` bytes starting at raw address `addr` into an aligned staging
+    /// buffer and persists them under `(key, put_id)`.
+    ///
+    /// # Errors
+    /// Fails when the key is rejected by `validate_key`, when `len` does not fit
+    /// in `usize`, when alignment or staging fails, or when the write itself fails.
+    pub async fn persist_from_addr(
+        &self,
+        key: &str,
+        put_id: PutIDForAKey,
+        addr: u64,
+        len: u64,
+    ) -> KvResult<()> {
+        validate_key(key)?;
+        let Ok(payload_len) = usize::try_from(len) else {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd persist len does not fit usize: {}", len),
+            }));
+        };
+        let padded_len = align_up_usize(payload_len, SSD_ALIGNMENT)?;
+        // SAFETY: presumably the caller guarantees `addr` points to at least `len`
+        // readable bytes for the duration of the copy -- confirm against the
+        // contract of `AlignedBuffer::copy_from_addr`.
+        let staged = unsafe { AlignedBuffer::copy_from_addr(addr, payload_len, padded_len)? };
+        self.persist_buffer(key, put_id, len, staged).await
+    }
+
+    /// Persists `data` under `(key, put_id)` after staging it in a zero-padded,
+    /// alignment-sized buffer.
+    pub async fn persist(&self, key: &str, put_id: PutIDForAKey, data: &[u8]) -> KvResult<()> {
+        validate_key(key)?;
+        let payload_len = data.len();
+        let padded_len = align_up_usize(payload_len, SSD_ALIGNMENT)?;
+        let mut staged = AlignedBuffer::zeroed(padded_len)?;
+        // SAFETY: `staged` was allocated with `padded_len >= payload_len` bytes and
+        // is a fresh allocation, so it cannot overlap `data`.
+        unsafe {
+            std::ptr::copy_nonoverlapping(data.as_ptr(), staged.as_mut_ptr(), payload_len);
+        }
+        self.persist_buffer(key, put_id, payload_len as u64, staged)
+            .await
+    }
+
+    /// Queues an already-staged buffer on the next device's write queue and waits
+    /// for the writer loop to report the outcome.
+    ///
+    /// `entry_len` is the logical payload length recorded for the entry; `data`
+    /// is the staged buffer handed to the writer.
+    async fn persist_buffer(
+        &self,
+        key: &str,
+        put_id: PutIDForAKey,
+        entry_len: u64,
+        data: AlignedBuffer,
+    ) -> KvResult<()> {
+        let write_tx = self.next_write_tx()?;
+        let (done_tx, done_rx) = oneshot::channel();
+        let command = WriteCommand {
+            key: KvSsdKey {
+                key: key.to_string(),
+                put_id,
+            },
+            entry_len,
+            data,
+            done_tx,
+        };
+
+        if let Err(err) = write_tx.send(command).await {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd write queue closed: {}", err),
+            }));
+        }
+        match done_rx.await {
+            Ok(outcome) => outcome,
+            Err(err) => Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd write completion closed: {}", err),
+            })),
+        }
+    }
+
+    /// Loads the whole entry stored under `(key, put_id)` into memory at
+    /// `target_addr`.
+    ///
+    /// The entry is pinned in the ring before the read is queued, and the pin
+    /// travels with the read command. Depending on `choose_read_path`, the data
+    /// is read straight into the target or into a scratch buffer whose first
+    /// `len` bytes are then copied over.
+    ///
+    /// # Errors
+    /// - `InvalidArgument` when `target_len < len`, when the stored length differs
+    ///   from `len`, or when a length does not fit in `usize`.
+    /// - `KeyNotFound` when the ring cannot pin the entry.
+    /// - Any error reported by the read queue or the read itself.
+    pub async fn load_into_addr(
+        &self,
+        key: &str,
+        put_id: PutIDForAKey,
+        target_addr: u64,
+        len: u64,
+        target_len: u64,
+    ) -> KvResult<()> {
+        validate_key(key)?;
+        if len > target_len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd target capacity too small for key={} put_id=({},{}) len={} target_len={}",
+                    key, put_id.0, put_id.1, len, target_len
+                ),
+            }));
+        }
+
+        let ssd_key = KvSsdKey {
+            key: key.to_string(),
+            put_id,
+        };
+        // Pin under the lock; the guard object is built while the lock is held.
+        let (entry, read_pin) = {
+            let mut guard = self.inner.lock();
+            match guard.ring.pin_read(&ssd_key) {
+                Some(pinned) => (
+                    pinned,
+                    SsdReadPin {
+                        inner: Arc::clone(&self.inner),
+                        space_notify: Arc::clone(&self.space_notify),
+                        key: ssd_key.clone(),
+                    },
+                ),
+                None => {
+                    return Err(KvError::Api(ApiError::KeyNotFound {
+                        key: ssd_key.key.clone(),
+                    }));
+                }
+            }
+        };
+        if len != entry.len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd length mismatch for key={} put_id=({},{}) expected={} actual={}",
+                    ssd_key.key, put_id.0, put_id.1, len, entry.len
+                ),
+            }));
+        }
+
+        let Ok(payload_len) = usize::try_from(len) else {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd load len does not fit usize: {}", len),
+            }));
+        };
+        let Ok(padded_len) = usize::try_from(entry.aligned_len) else {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd aligned load len does not fit usize: {}",
+                    entry.aligned_len
+                ),
+            }));
+        };
+
+        let target = match choose_read_path(&entry, target_addr, len, target_len) {
+            SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(padded_len)?),
+            SsdReadPath::Direct => ReadTarget::Direct {
+                target_addr,
+                len: padded_len,
+            },
+        };
+        let file_offset = entry.file_offset;
+        let output = self
+            .submit_read_command(ssd_key, entry, file_offset, target, Some(read_pin))
+            .await?;
+        if let ReadOutput::Scratch(staged) = output {
+            // SAFETY: `target_len >= len` was checked above; the caller is
+            // presumably responsible for `target_addr` being writable for
+            // `target_len` bytes -- confirm against callers.
+            unsafe {
+                std::ptr::copy_nonoverlapping(staged.as_ptr(), target_addr as *mut u8, payload_len);
+            }
+        }
+        Ok(())
+    }
+
+    /// Loads the entry stored under `(key, put_id)` into `target_addr` chunk by
+    /// chunk, sending an `SsdLoadedChunk` on `ready_tx` as each chunk finishes.
+    ///
+    /// `chunk_bytes` is raised to at least 1 and rounded up to `SSD_ALIGNMENT`.
+    /// At most `max_read_inflight` (minimum 1) chunk reads are outstanding at a
+    /// time. Chunks are reported in completion order, not offset order.
+    ///
+    /// # Errors
+    /// - `InvalidArgument` when `target_len < len`, when the stored length differs
+    ///   from `len`, or when `ready_tx` is closed.
+    /// - `KeyNotFound` when the ring cannot pin the entry.
+    /// - The first error returned by any chunk read; remaining in-flight chunk
+    ///   futures are dropped when the function returns.
+    pub(crate) async fn load_into_addr_chunks(
+        &self,
+        key: &str,
+        put_id: PutIDForAKey,
+        target_addr: u64,
+        len: u64,
+        target_len: u64,
+        chunk_bytes: u64,
+        max_read_inflight: usize,
+        ready_tx: tokio_mpsc::Sender<SsdLoadedChunk>,
+    ) -> KvResult<()> {
+        validate_key(key)?;
+        if target_len < len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd target capacity too small for chunked load: key={} put_id=({},{}) len={} target_len={}",
+                    key, put_id.0, put_id.1, len, target_len
+                ),
+            }));
+        }
+        let chunk_bytes = align_up_u64(chunk_bytes.max(1), SSD_ALIGNMENT as u64)?;
+        let key = KvSsdKey {
+            key: key.to_string(),
+            put_id,
+        };
+        // `_read_pin` stays bound until this function returns, so one pin spans
+        // every chunk read; the per-chunk commands below therefore carry no pin.
+        let (entry, _read_pin) = {
+            let mut inner = self.inner.lock();
+            let Some(entry) = inner.ring.pin_read(&key) else {
+                return Err(KvError::Api(ApiError::KeyNotFound {
+                    key: key.key.clone(),
+                }));
+            };
+            (
+                entry,
+                SsdReadPin {
+                    inner: Arc::clone(&self.inner),
+                    space_notify: Arc::clone(&self.space_notify),
+                    key: key.clone(),
+                },
+            )
+        };
+        if entry.len != len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd length mismatch for chunked load: key={} put_id=({},{}) expected={} actual={}",
+                    key.key, put_id.0, put_id.1, len, entry.len
+                ),
+            }));
+        }
+
+        let mut next_offset = 0u64;
+        let mut inflight = FuturesUnordered::new();
+        let max_read_inflight = max_read_inflight.max(1);
+
+        loop {
+            // Top up the in-flight window with the next chunks, in offset order.
+            while next_offset < len && inflight.len() < max_read_inflight {
+                let payload_len = chunk_bytes.min(len - next_offset);
+                let stage_addr = checked_add_u64(target_addr, next_offset, "chunk stage addr")?;
+                // Cannot underflow: next_offset < len <= target_len here.
+                let remaining_target_len = target_len - next_offset;
+                inflight.push(self.load_entry_range_into_addr(
+                    key.clone(),
+                    entry.clone(),
+                    next_offset,
+                    payload_len,
+                    stage_addr,
+                    remaining_target_len,
+                ));
+                next_offset += payload_len;
+            }
+
+            // An empty set means every chunk has been issued and reported.
+            let Some(chunk) = inflight.next().await else {
+                break;
+            };
+            let chunk = chunk?;
+            ready_tx.send(chunk).await.map_err(|err| {
+                KvError::Api(ApiError::InvalidArgument {
+                    detail: format!("kv ssd chunk ready queue closed: {}", err),
+                })
+            })?;
+        }
+        Ok(())
+    }
+
+    /// Reads `payload_len` bytes of `entry`, starting `offset` bytes into the
+    /// entry, into memory at `target_addr`, and returns the chunk descriptor.
+    ///
+    /// The on-disk read length is `payload_len` rounded up to `SSD_ALIGNMENT`.
+    /// On the scratch path only the first `payload_len` bytes are copied to the
+    /// target; on the direct path the full aligned length is read into it.
+    ///
+    /// # Errors
+    /// `InvalidArgument` when `payload_len` is zero, when the chunk exceeds the
+    /// entry's logical or aligned length, when `target_len` is smaller than the
+    /// aligned read length, or when a length does not fit in `usize`; otherwise
+    /// any error from the read command.
+    async fn load_entry_range_into_addr(
+        &self,
+        key: KvSsdKey,
+        entry: SsdIndexEntry,
+        offset: u64,
+        payload_len: u64,
+        target_addr: u64,
+        target_len: u64,
+    ) -> KvResult<SsdLoadedChunk> {
+        if payload_len == 0 {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: "kv ssd chunk payload len must be positive".to_string(),
+            }));
+        }
+        let payload_end = checked_add_u64(offset, payload_len, "chunk payload end")?;
+        if payload_end > entry.len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd chunk exceeds entry len: offset={} len={} entry_len={}",
+                    offset, payload_len, entry.len
+                ),
+            }));
+        }
+        let read_len = align_up_u64(payload_len, SSD_ALIGNMENT as u64)?;
+        let read_end = checked_add_u64(offset, read_len, "chunk read end")?;
+        if read_end > entry.aligned_len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd aligned chunk exceeds entry aligned len: offset={} read_len={} aligned_len={}",
+                    offset, read_len, entry.aligned_len
+                ),
+            }));
+        }
+        // NOTE(review): this rejects any target smaller than the aligned read
+        // length before the Scratch/Direct choice is made, even though the scratch
+        // path below copies only `payload_len` bytes. A final chunk whose payload is
+        // not alignment-sized therefore needs padded target capacity -- confirm
+        // this is intended.
+        if target_len < read_len {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd chunk target capacity too small: offset={} read_len={} target_len={}",
+                    offset, read_len, target_len
+                ),
+            }));
+        }
+        let file_offset = checked_add_u64(entry.file_offset, offset, "chunk file offset")?;
+        let read_len_usize = usize::try_from(read_len).map_err(|_| {
+            KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd chunk read len does not fit usize: {}", read_len),
+            })
+        })?;
+        let payload_len_usize = usize::try_from(payload_len).map_err(|_| {
+            KvError::Api(ApiError::InvalidArgument {
+                detail: format!(
+                    "kv ssd chunk payload len does not fit usize: {}",
+                    payload_len
+                ),
+            })
+        })?;
+        let target = match choose_chunk_read_path(target_addr, read_len, target_len, file_offset) {
+            SsdReadPath::Direct => ReadTarget::Direct {
+                target_addr,
+                len: read_len_usize,
+            },
+            SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len_usize)?),
+        };
+        // No pin is attached to the command here; the chunked caller in this file
+        // holds its own pin for the duration of the whole load.
+        let output = self
+            .submit_read_command(key, entry, file_offset, target, None)
+            .await?;
+        if let ReadOutput::Scratch(buffer) = output {
+            // SAFETY: `target_len >= read_len >= payload_len` was checked above;
+            // the caller is presumably responsible for `target_addr` being
+            // writable for `target_len` bytes -- confirm against callers.
+            unsafe {
+                std::ptr::copy_nonoverlapping(
+                    buffer.as_ptr(),
+                    target_addr as *mut u8,
+                    payload_len_usize,
+                );
+            }
+        }
+        Ok(SsdLoadedChunk {
+            offset,
+            stage_addr: target_addr,
+            len: payload_len,
+        })
+    }
+
+    /// Queues one read on the device worker that owns `entry.shard_id` and waits
+    /// for the reader loop to report the outcome.
+    ///
+    /// `read_pin`, when present, is moved into the command and therefore travels
+    /// with the read.
+    async fn submit_read_command(
+        &self,
+        key: KvSsdKey,
+        entry: SsdIndexEntry,
+        file_offset: u64,
+        target: ReadTarget,
+        read_pin: Option<SsdReadPin>,
+    ) -> KvResult<ReadOutput> {
+        let read_tx = self.read_tx_for_shard(entry.shard_id)?;
+        let (done_tx, done_rx) = oneshot::channel();
+        let command = ReadCommand {
+            key,
+            entry,
+            file_offset,
+            target,
+            _read_pin: read_pin,
+            done_tx,
+        };
+
+        if let Err(err) = read_tx.send(command).await {
+            return Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd read queue closed: {}", err),
+            }));
+        }
+        match done_rx.await {
+            Ok(outcome) => outcome,
+            Err(err) => Err(KvError::Api(ApiError::InvalidArgument {
+                detail: format!("kv ssd read completion closed: {}", err),
+            })),
+        }
+    }
+
+    /// Test helper: reports whether the ring currently holds `(key, put_id)`.
+    #[cfg(test)]
+    async fn has_entry(&self, key: &str, put_id: PutIDForAKey) -> bool {
+        let lookup = KvSsdKey {
+            put_id,
+            key: key.to_string(),
+        };
+        self.inner.lock().ring.get(&lookup).is_some()
+    }
+}
+
+/// Per-device writer task: takes `WriteCommand`s from `rx`, reserves ring space
+/// on one of `shard_ids`, issues the write through `io`, and commits or rolls
+/// back the reservation when the write completes.
+///
+/// At most `write_inflight` (minimum 1) writes are outstanding. Commands that
+/// cannot be placed yet (`BlockedByBusyIo`) stay at the head of `pending` and
+/// are retried after a write completes or `space_notify` fires.
+async fn ssd_writer_loop(
+    inner: Arc<Mutex<KvSsdStorageInner>>,
+    mut rx: tokio_mpsc::Receiver<WriteCommand>,
+    io: Arc<UringIoEngine>,
+    space_notify: Arc<Notify>,
+    write_inflight: usize,
+    shard_ids: Vec<usize>,
+) {
+    let mut pending: VecDeque<WriteCommand> = VecDeque::new();
+    let mut inflight = FuturesUnordered::new();
+    let max_inflight = write_inflight.max(1);
+
+    loop {
+        // Dispatch as many pending commands as the in-flight budget allows.
+        while inflight.len() < max_inflight {
+            let Some(cmd) = pending.pop_front() else {
+                break;
+            };
+            let prepared = {
+                let mut inner = inner.lock();
+                inner
+                    .ring
+                    .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids)
+            };
+            match prepared {
+                Ok(SsdPreparedWrite::Ready(entry)) => {
+                    inflight.push(execute_write(
+                        WriteTask {
+                            key: cmd.key,
+                            entry,
+                            data: cmd.data,
+                            done_tx: cmd.done_tx,
+                        },
+                        Arc::clone(&io),
+                    ));
+                }
+                Ok(SsdPreparedWrite::Existing) => {
+                    // Nothing to write; report success to the caller right away.
+                    let _ = cmd.done_tx.send(Ok(()));
+                }
+                Ok(SsdPreparedWrite::BlockedByBusyIo) => {
+                    // Keep FIFO order: put the command back at the head and stop
+                    // dispatching until something changes.
+                    pending.push_front(cmd);
+                    break;
+                }
+                Err(err) => {
+                    let _ = cmd.done_tx.send(Err(err));
+                }
+            }
+        }
+
+        tokio::select! {
+            Some(completion) = inflight.next(), if !inflight.is_empty() => {
+                finish_write_completion(&inner, &space_notify, completion);
+            }
+            Some(cmd) = rx.recv() => {
+                pending.push_back(cmd);
+            }
+            _ = space_notify.notified(), if !pending.is_empty() => {
+                // Retry pending commands after an active read/write releases a ring position.
+            }
+            // Reached only when every branch above is disabled: the queue is
+            // closed and nothing is in flight or waiting for space.
+            else => {
+                if pending.is_empty() && inflight.is_empty() {
+                    break;
+                }
+            },
+        }
+    }
+
+    // Drain whatever is left after the command queue has closed.
+    // NOTE(review): the `else` arm above appears to fire only when both `pending`
+    // and `inflight` are already empty, so this loop looks like a defensive
+    // fallback -- confirm.
+    while !pending.is_empty() || !inflight.is_empty() {
+        while inflight.len() < max_inflight {
+            let Some(cmd) = pending.pop_front() else {
+                break;
+            };
+            let prepared = {
+                let mut inner = inner.lock();
+                inner
+                    .ring
+                    .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids)
+            };
+            match prepared {
+                Ok(SsdPreparedWrite::Ready(entry)) => {
+                    inflight.push(execute_write(
+                        WriteTask {
+                            key: cmd.key,
+                            entry,
+                            data: cmd.data,
+                            done_tx: cmd.done_tx,
+                        },
+                        Arc::clone(&io),
+                    ));
+                }
+                Ok(SsdPreparedWrite::Existing) => {
+                    let _ = cmd.done_tx.send(Ok(()));
+                }
+                Ok(SsdPreparedWrite::BlockedByBusyIo) => {
+                    pending.push_front(cmd);
+                    break;
+                }
+                Err(err) => {
+                    let _ = cmd.done_tx.send(Err(err));
+                }
+            }
+        }
+
+        if let Some(completion) = inflight.next().await {
+            finish_write_completion(&inner, &space_notify, completion);
+        } else if !pending.is_empty() {
+            // Nothing in flight but commands are still blocked: wait for space.
+            space_notify.notified().await;
+        }
+    }
+}
+
+/// Finalizes one finished write: commits (or rolls back) its ring reservation,
+/// wakes one waiter on `space_notify`, and reports the outcome to the caller.
+///
+/// A write that succeeded on disk but could not be committed is reported as
+/// `KeyNotFound`; in every other case the write's own result is forwarded.
+fn finish_write_completion(
+    inner: &Arc<Mutex<KvSsdStorageInner>>,
+    space_notify: &Notify,
+    completion: WriteCompletion,
+) {
+    let WriteCompletion {
+        key,
+        success,
+        result,
+        done_tx,
+    } = completion;
+
+    let committed = inner.lock().ring.commit(&key, success);
+    space_notify.notify_one();
+
+    let outcome = match (success, committed) {
+        (true, false) => Err(KvError::Api(ApiError::KeyNotFound {
+            key: key.key.clone(),
+        })),
+        _ => result,
+    };
+    // The caller may have stopped waiting; a failed send is not an error here.
+    let _ = done_tx.send(outcome);
+}
+
+/// Writes `task.data` to `entry.file_offset` of shard `entry.shard_id` through
+/// `io` and packages the outcome as a `WriteCompletion`.
+///
+/// A write that transfers fewer bytes than the buffer length is reported as a
+/// `WriteZero` error. I/O errors are converted with `file_error_for_entry`.
+async fn execute_write(task: WriteTask, io: Arc<UringIoEngine>) -> WriteCompletion {
+    let WriteTask {
+        key,
+        entry,
+        data,
+        done_tx,
+    } = task;
+    let data_len = data.len();
+    let shard_id = entry.shard_id;
+    let file_offset = entry.file_offset;
+    // `data` is moved into this async block, so the buffer behind `data_ptr`
+    // stays allocated until the block finishes awaiting the completion.
+    let result = async move {
+        let rx = {
+            let data_ptr = data.as_ptr();
+            io.writev_at_async(shard_id, vec![(data_ptr, data_len)], file_offset)?
+        };
+        // Outer `?`: the completion sender was dropped; inner `?`: the I/O failed.
+        let written = rx
+            .await
+            .map_err(|_| io::Error::other("kv ssd write completion dropped"))??;
+        if written != data_len {
+            return Err(io::Error::new(
+                io::ErrorKind::WriteZero,
+                format!("short kv ssd write: {} != {}", written, data_len),
+            )
+            .into());
+        }
+        Ok(())
+    }
+    .await;
+    let result = result.map_err(|err| file_error_for_entry(&key, file_offset, err));
+    WriteCompletion {
+        key,
+        success: result.is_ok(),
+        result,
+        done_tx,
+    }
+}
+
+/// Per-device reader task: takes `ReadCommand`s from `rx`, keeps at most
+/// `read_inflight` (minimum 1) reads outstanding on `io`, and reports each
+/// outcome through the command's completion channel.
+///
+/// The loop ends once `rx` is closed and no read is in flight.
+async fn ssd_reader_loop(
+    inner: Arc<Mutex<KvSsdStorageInner>>,
+    mut rx: tokio_mpsc::Receiver<ReadCommand>,
+    io: Arc<UringIoEngine>,
+    read_inflight: usize,
+) {
+    let mut pending = VecDeque::new();
+    let mut inflight = FuturesUnordered::new();
+    let max_inflight = read_inflight.max(1);
+
+    loop {
+        // Start queued reads until the in-flight budget is used up.
+        while inflight.len() < max_inflight {
+            let Some(task) = pending.pop_front() else {
+                break;
+            };
+            inflight.push(execute_read(task, Arc::clone(&io)));
+        }
+
+        tokio::select! {
+            Some(completion) = inflight.next(), if !inflight.is_empty() => {
+                finish_read_completion(&inner, completion);
+            }
+            Some(cmd) = rx.recv() => {
+                pending.push_back(ReadTask {
+                    key: cmd.key,
+                    entry: cmd.entry,
+                    file_offset: cmd.file_offset,
+                    target: cmd.target,
+                    _read_pin: cmd._read_pin,
+                    done_tx: cmd.done_tx,
+                });
+            }
+            else => break,
+        }
+    }
+
+    // Report any reads that are still outstanding after the queue has closed.
+    while let Some(completion) = inflight.next().await {
+        finish_read_completion(&inner, completion);
+    }
+}
+
+/// Finalizes one finished read and reports the outcome to the caller.
+///
+/// The entry's offset is re-validated after the read. If it is no longer valid,
+/// the index entry is removed and the read is reported as `KeyNotFound`. The
+/// check and the removal happen under a single lock acquisition so that no
+/// other index update can slip in between them.
+fn finish_read_completion(inner: &Arc<Mutex<KvSsdStorageInner>>, completion: ReadCompletion) {
+    let still_valid = {
+        let mut guard = inner.lock();
+        let valid = guard.ring.is_offset_valid(&completion.entry);
+        if !valid {
+            guard.ring.remove(&completion.key);
+        }
+        valid
+        // The guard is released here, before `completion` (and the read pin it
+        // carries) is dropped at the end of this function.
+    };
+    let outcome = if still_valid {
+        completion.result
+    } else {
+        Err(KvError::Api(ApiError::KeyNotFound {
+            key: completion.key.key.clone(),
+        }))
+    };
+    // The caller may have stopped waiting; a failed send is not an error here.
+    let _ = completion.done_tx.send(outcome);
+}
+
+/// Performs one read from shard `entry.shard_id` at `file_offset` through `io`
+/// and packages the outcome as a `ReadCompletion`.
+///
+/// A scratch target is filled and handed back in `ReadOutput::Scratch`; a
+/// direct target is filled in place at `target_addr`. A read that transfers
+/// fewer bytes than requested is reported as `UnexpectedEof`. I/O errors are
+/// converted with `file_error_for_entry`.
+async fn execute_read(task: ReadTask, io: Arc<UringIoEngine>) -> ReadCompletion {
+    let ReadTask {
+        key,
+        entry,
+        file_offset,
+        target,
+        _read_pin,
+        done_tx,
+    } = task;
+    let shard_id = entry.shard_id;
+    let result = async move {
+        match target {
+            ReadTarget::Scratch(mut buffer) => {
+                // `buffer` is owned by this async block, so the memory behind
+                // `buffer_ptr` stays allocated while the read is awaited.
+                let buffer_len = buffer.len();
+                let rx = {
+                    let buffer_ptr = buffer.as_mut_ptr();
+                    io.readv_at_async(shard_id, vec![(buffer_ptr, buffer_len)], file_offset)?
+                };
+                let read = rx
+                    .await
+                    .map_err(|_| io::Error::other("kv ssd read completion dropped"))??;
+                if read != buffer_len {
+                    return Err(io::Error::new(
+                        io::ErrorKind::UnexpectedEof,
+                        format!("short kv ssd read: {} != {}", read, buffer_len),
+                    ));
+                }
+                Ok(ReadOutput::Scratch(buffer))
+            }
+            ReadTarget::Direct { target_addr, len } => {
+                // The target memory is not owned here; presumably the caller keeps
+                // it valid and writable for `len` bytes until the read completes
+                // -- confirm against callers.
+                let rx =
+                    io.readv_at_async(shard_id, vec![(target_addr as *mut u8, len)], file_offset)?;
+                let read = rx
+                    .await
+                    .map_err(|_| io::Error::other("kv ssd read completion dropped"))??;
+                if read != len {
+                    return Err(io::Error::new(
+                        io::ErrorKind::UnexpectedEof,
+                        format!("short kv ssd direct read: {} != {}", read, len),
+                    ));
+                }
+                Ok(ReadOutput::Direct)
+            }
+        }
+    }
+    .await
+    .map_err(|err| file_error_for_entry(&key, file_offset, err));
+    // The pin is carried into the completion so it outlives the read itself.
+    ReadCompletion {
+        key,
+        entry,
+        result,
+        _read_pin,
+        done_tx,
+    }
+}
+
+/// Sizing parameters for a `UringIoEngine`.
+#[derive(Clone, Copy)]
+struct UringConfig {
+    /// Number of worker threads to spawn, each with its own ring; must be > 0.
+    threads: usize,
+    /// Ring size requested per worker; also caps the operations a worker keeps
+    /// in flight and sizes its request queues (twice this value).
+    io_depth: usize,
+}
+
+/// Kind of vectored operation carried by an `IoCtx`.
+#[derive(Clone, Copy)]
+enum IoType {
+    /// Vectored read, submitted as `opcode::Readv`.
+    Readv,
+    /// Vectored write, submitted as `opcode::Writev`.
+    Writev,
+}
+
+/// One queued I/O request, boxed and attached to its SQE as `user_data` while
+/// the operation is in flight.
+struct IoCtx {
+    /// Whether this is a read or a write.
+    io_type: IoType,
+    /// File descriptor the operation targets.
+    fd: RawFd,
+    /// Passed to the opcode builder as the iovec count.
+    len: usize,
+    /// Byte offset in the file at which the operation starts.
+    offset: u64,
+    /// Channel on which the byte count or the error is reported.
+    complete: oneshot::Sender<io::Result<usize>>,
+    /// iovec array referenced by the SQE; owned here so it outlives submission.
+    iovecs: Box<[libc::iovec]>,
+}
+
+// SAFETY (NOTE(review)): `libc::iovec` holds raw pointers, which is presumably why
+// `Send` is not derived automatically. Sending is sound only if the submitter keeps
+// the referenced buffers valid and otherwise untouched until `complete` fires --
+// confirm against the callers of the engine.
+unsafe impl Send for IoCtx {}
+
+/// State owned by one io_uring worker thread.
+struct UringShard {
+    /// Incoming read requests.
+    read_rx: crossbeam::channel::Receiver<IoCtx>,
+    /// Incoming write requests.
+    write_rx: crossbeam::channel::Receiver<IoCtx>,
+    /// The ring this worker submits to and reaps from.
+    uring: IoUring,
+    /// Maximum number of operations kept in flight on the ring.
+    io_depth: usize,
+    /// Reads are tried first while `read_inflight <= write_inflight * read_weight`.
+    read_weight: usize,
+}
+
+impl UringShard {
+    /// Worker thread body: pulls requests from the two queues, submits them to
+    /// the ring, and reports each completion on the request's channel.
+    ///
+    /// The loop returns once both queues are disconnected and nothing is in
+    /// flight, or when `submit_and_wait` fails with an error other than an
+    /// interruption.
+    fn run(mut self) {
+        let mut read_inflight = 0usize;
+        let mut write_inflight = 0usize;
+        let mut read_closed = false;
+        let mut write_closed = false;
+
+        loop {
+            // Fill the ring up to `io_depth` without blocking.
+            let mut inflight = read_inflight + write_inflight;
+            while inflight < self.io_depth && !(read_closed && write_closed) {
+                let next = self.try_recv_weighted(
+                    &mut read_closed,
+                    &mut write_closed,
+                    read_inflight,
+                    write_inflight,
+                );
+                let Some(ctx) = next else {
+                    break;
+                };
+                self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight);
+                inflight = read_inflight + write_inflight;
+            }
+
+            if read_closed && write_closed && inflight == 0 {
+                return;
+            }
+            if inflight == 0 {
+                // Idle: block until a request arrives or both queues close.
+                let Some(ctx) = self.recv_blocking(&mut read_closed, &mut write_closed) else {
+                    continue;
+                };
+                self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight);
+                continue;
+            }
+            if let Err(err) = self.uring.submit_and_wait(1) {
+                if err.kind() == io::ErrorKind::Interrupted {
+                    // A signal interrupted the wait. Queued SQEs are still in the
+                    // submission queue and counted in the in-flight totals, so
+                    // retrying is safe and keeps the worker alive.
+                    continue;
+                }
+                // Fatal ring error: fail whatever has already completed and stop.
+                // NOTE(review): contexts still owned by the kernel or left in the
+                // submission queue are not reclaimed here, so their senders are
+                // never dropped and their callers are never notified.
+                while let Some(cqe) = self.uring.completion().next() {
+                    let data = cqe.user_data();
+                    if data != 0 {
+                        // SAFETY: non-zero `user_data` values are pointers produced
+                        // by `Box::into_raw` in `submit_ctx`, and each completion is
+                        // seen once.
+                        let ctx = unsafe { Box::from_raw(data as *mut IoCtx) };
+                        let _ = ctx.complete.send(Err(io::Error::other(format!(
+                            "io_uring submit failed: {err}"
+                        ))));
+                    }
+                }
+                return;
+            }
+
+            for cqe in self.uring.completion() {
+                let data = cqe.user_data();
+                if data == 0 {
+                    continue;
+                }
+                // SAFETY: non-zero `user_data` values are pointers produced by
+                // `Box::into_raw` in `submit_ctx`, and each completion is seen once.
+                let ctx = unsafe { Box::from_raw(data as *mut IoCtx) };
+                match ctx.io_type {
+                    IoType::Readv => read_inflight = read_inflight.saturating_sub(1),
+                    IoType::Writev => write_inflight = write_inflight.saturating_sub(1),
+                }
+                // Negative results are negated errno values.
+                let res = cqe.result();
+                let send_res = if res < 0 {
+                    Err(io::Error::from_raw_os_error(-res))
+                } else {
+                    Ok(res as usize)
+                };
+                let _ = ctx.complete.send(send_res);
+            }
+        }
+    }
+
+    /// Takes the next request without blocking, choosing which queue to try
+    /// first from the current in-flight mix.
+    ///
+    /// Reads go first while `read_inflight <= write_inflight * read_weight`;
+    /// otherwise writes go first. The other queue is used as a fallback.
+    fn try_recv_weighted(
+        &self,
+        read_closed: &mut bool,
+        write_closed: &mut bool,
+        read_inflight: usize,
+        write_inflight: usize,
+    ) -> Option<IoCtx> {
+        let prefer_read = read_inflight <= write_inflight.saturating_mul(self.read_weight);
+        if prefer_read {
+            self.try_recv_read(read_closed)
+                .or_else(|| self.try_recv_write(write_closed))
+        } else {
+            self.try_recv_write(write_closed)
+                .or_else(|| self.try_recv_read(read_closed))
+        }
+    }
+
+    /// Non-blocking receive from the read queue; sets `read_closed` when the
+    /// queue is disconnected.
+    fn try_recv_read(&self, read_closed: &mut bool) -> Option<IoCtx> {
+        if *read_closed {
+            return None;
+        }
+        match self.read_rx.try_recv() {
+            Ok(ctx) => Some(ctx),
+            Err(crossbeam::channel::TryRecvError::Empty) => None,
+            Err(crossbeam::channel::TryRecvError::Disconnected) => {
+                *read_closed = true;
+                None
+            }
+        }
+    }
+
+    /// Non-blocking receive from the write queue; sets `write_closed` when the
+    /// queue is disconnected.
+    fn try_recv_write(&self, write_closed: &mut bool) -> Option<IoCtx> {
+        if *write_closed {
+            return None;
+        }
+        match self.write_rx.try_recv() {
+            Ok(ctx) => Some(ctx),
+            Err(crossbeam::channel::TryRecvError::Empty) => None,
+            Err(crossbeam::channel::TryRecvError::Disconnected) => {
+                *write_closed = true;
+                None
+            }
+        }
+    }
+
+    /// Blocks until a request arrives on either open queue.
+    ///
+    /// Returns `None` once both queues are disconnected, updating the closed
+    /// flags along the way.
+    fn recv_blocking(&self, read_closed: &mut bool, write_closed: &mut bool) -> Option<IoCtx> {
+        loop {
+            match (!*read_closed, !*write_closed) {
+                (true, true) => {
+                    crossbeam::channel::select! {
+                        recv(self.read_rx) -> msg => match msg {
+                            Ok(ctx) => return Some(ctx),
+                            Err(_) => *read_closed = true,
+                        },
+                        recv(self.write_rx) -> msg => match msg {
+                            Ok(ctx) => return Some(ctx),
+                            Err(_) => *write_closed = true,
+                        },
+                    }
+                }
+                (true, false) => match self.read_rx.recv() {
+                    Ok(ctx) => return Some(ctx),
+                    Err(_) => *read_closed = true,
+                },
+                (false, true) => match self.write_rx.recv() {
+                    Ok(ctx) => return Some(ctx),
+                    Err(_) => *write_closed = true,
+                },
+                (false, false) => return None,
+            }
+        }
+    }
+
+    /// Builds the SQE for `ctx`, pushes it onto the submission queue, and bumps
+    /// the matching in-flight counter.
+    ///
+    /// If the submission queue rejects the entry, the request is failed
+    /// immediately with a "submission queue full" error and no counter changes.
+    fn submit_ctx(&mut self, ctx: IoCtx, read_inflight: &mut usize, write_inflight: &mut usize) {
+        let fd = Fd(ctx.fd);
+        // The iovec array lives on the heap, so this pointer stays valid after
+        // `ctx` itself is moved into the box below.
+        let iovecs_ptr = ctx.iovecs.as_ptr();
+        let sqe = match ctx.io_type {
+            IoType::Readv => opcode::Readv::new(fd, iovecs_ptr, ctx.len as _)
+                .offset(ctx.offset)
+                .build(),
+            IoType::Writev => opcode::Writev::new(fd, iovecs_ptr, ctx.len as _)
+                .offset(ctx.offset)
+                .build(),
+        };
+        let io_type = ctx.io_type;
+        // Ownership of the context moves into the SQE until its CQE is reaped.
+        let data = Box::into_raw(Box::new(ctx)) as u64;
+        let sqe = sqe.user_data(data);
+        // SAFETY: the SQE references the iovec array owned by the boxed context,
+        // which is not freed until the completion is reaped or the push fails.
+        let push_result = unsafe { self.uring.submission().push(&sqe) };
+        if push_result.is_err() {
+            // SAFETY: the entry never reached the ring, so this is the only owner.
+            let ctx = unsafe { Box::from_raw(data as *mut IoCtx) };
+            let _ = ctx
+                .complete
+                .send(Err(io::Error::other("submission queue full")));
+            return;
+        }
+        match io_type {
+            IoType::Readv => *read_inflight += 1,
+            IoType::Writev => *write_inflight += 1,
+        }
+    }
+}
+
+/// Front end to a pool of io_uring worker threads serving a set of shard files.
+#[derive(Debug)]
+struct UringIoEngine {
+    /// Shard id to raw file descriptor.
+    fds: HashMap<usize, RawFd>,
+    /// Read request queue of each worker, in worker order.
+    read_txs: Vec<crossbeam::channel::Sender<IoCtx>>,
+    /// Write request queue of each worker, in worker order.
+    write_txs: Vec<crossbeam::channel::Sender<IoCtx>>,
+    /// Join handles of the spawned worker threads.
+    handles: Vec<JoinHandle<()>>,
+}
+
+impl UringIoEngine {
+    /// Creates an engine over `shard_fds` and spawns `cfg.threads` worker
+    /// threads, each with its own ring and its own read and write queues.
+    ///
+    /// # Errors
+    /// `InvalidInput` when `cfg.threads` is zero or `shard_fds` is empty;
+    /// otherwise any error from building a ring or spawning a thread.
+    fn new_multi(shard_fds: Vec<(usize, RawFd)>, cfg: UringConfig) -> io::Result<Self> {
+        let worker_count = cfg.threads;
+        if worker_count == 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "threads must be > 0",
+            ));
+        }
+        if shard_fds.is_empty() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "at least one fd is required",
+            ));
+        }
+
+        let fds: HashMap<_, _> = shard_fds.into_iter().collect();
+        let mut read_txs = Vec::with_capacity(worker_count);
+        let mut write_txs = Vec::with_capacity(worker_count);
+        let mut handles = Vec::with_capacity(worker_count);
+
+        for worker_idx in 0..worker_count {
+            // Each queue may hold twice the ring depth before senders block.
+            let queue_cap = cfg.io_depth * 2;
+            let (read_tx, read_rx) = crossbeam::channel::bounded(queue_cap);
+            let (write_tx, write_rx) = crossbeam::channel::bounded(queue_cap);
+            let shard = UringShard {
+                read_rx,
+                write_rx,
+                uring: IoUring::builder().build(cfg.io_depth as u32)?,
+                io_depth: cfg.io_depth,
+                read_weight: DEFAULT_URING_READ_WEIGHT,
+            };
+            let handle = std::thread::Builder::new()
+                .name(format!("fluxon-kv-ssd-uring-{worker_idx}"))
+                .spawn(move || shard.run())?;
+
+            read_txs.push(read_tx);
+            write_txs.push(write_tx);
+            handles.push(handle);
+        }
+
+        Ok(Self {
+            fds,
+            read_txs,
+            write_txs,
+            handles,
+        })
+    }
+
+    fn readv_at_async(
+        &self,
+        shard_id: usize,
+        iovecs: Vec<(*mut u8, usize)>,
+        offset: u64,
+    ) -> io::Result<oneshot::Receiver<io::Result<usize>>> {
+        self.submit_iovecs(IoType::Readv, shard_id, iovecs, offset)
+    }
+
+    fn writev_at_async(
+        &self,
+        shard_id: usize,
+        iovecs: Vec<(*const u8, usize)>,
+        offset: u64,
+    ) -> io::Result<oneshot::Receiver<io::Result<usize>>> {
+        let iovecs = iovecs
+            .into_iter()
+            .map(|(ptr, len)| (ptr as *mut u8, len))
+            .collect();
+        self.submit_iovecs(IoType::Writev, shard_id, iovecs, offset)
+    }
+
+    fn submit_iovecs(
+        &self,
+        io_type: IoType,
+        shard_id: usize,
+        iovecs: Vec<(*mut u8, usize)>,
+        offset: u64,
+    ) -> io::Result<oneshot::Receiver<io::Result<usize>>> {
+        if iovecs.is_empty() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "readv/writev requires at least one iovec",
+            ));
+        }
+        validate_direct_io(
+            iovecs.iter().map(|(ptr, len)| (*ptr as usize, *len)),
+            offset,
+        )?;
+        let iovecs_libc = iovecs
+            .iter()
+            .map(|(ptr, len)| libc::iovec {
+                iov_base: *ptr as *mut libc::c_void,
+                iov_len: *len,
+            })
+            .collect::<Vec<_>>()
+            .into_boxed_slice();
+        let (tx, rx) = oneshot::channel();
+        let ctx = IoCtx {
+            io_type,
+            fd: self.fd(shard_id)?,
+            len: iovecs_libc.len(),
+            offset,
+            complete: tx,
+            iovecs: iovecs_libc,
+        };
+        self.pick_tx(io_type, shard_id).send(ctx).map_err(|err| {
+            io::Error::new(
+                io::ErrorKind::BrokenPipe,
+                format!("io_uring send failed: {}", err),
+            )
+        })?;
+        Ok(rx)
+    }
+
+    fn fd(&self, shard_id: usize) -> io::Result<RawFd> {
+        self.fds.get(&shard_id).copied().ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("invalid SSD shard id {shard_id}"),
+            )
+        })
+    }
+
+    fn pick_tx(&self, io_type: IoType, shard_id: usize) -> &crossbeam::channel::Sender<IoCtx> {
+        match io_type {
+            IoType::Readv => &self.read_txs[shard_id % self.read_txs.len()],
+            IoType::Writev => &self.write_txs[shard_id % self.write_txs.len()],
+        }
+    }
+}
+
+impl Drop for UringIoEngine {
+    /// Closes both submission queues, then waits for every worker thread to finish.
+    fn drop(&mut self) {
+        // Senders are released first; presumably a worker exits once its queues disconnect.
+        drop(std::mem::take(&mut self.read_txs));
+        drop(std::mem::take(&mut self.write_txs));
+        std::mem::take(&mut self.handles).into_iter().for_each(|worker| drop(worker.join()));
+    }
+}
+
+struct AlignedBuffer { // Heap buffer aligned to SSD_ALIGNMENT; its Drop impl releases it with libc::free.
+    ptr: NonNull<u8>, // start of the allocation
+    len: usize, // allocation size in bytes
+}
+
+unsafe impl Send for AlignedBuffer {} // NOTE(review): assumes the buffer is the sole owner of `ptr` — confirm
+
+impl AlignedBuffer {
+    /// Allocates `len` zeroed bytes aligned to `SSD_ALIGNMENT`; `len` must be a positive multiple.
+    fn zeroed(len: usize) -> KvResult<Self> {
+        if len == 0 || !len.is_multiple_of(SSD_ALIGNMENT) {
+            let detail = format!(
+                "aligned buffer len must be positive and {}-byte aligned: {}",
+                SSD_ALIGNMENT, len
+            );
+            return Err(KvError::Api(ApiError::InvalidArgument { detail }));
+        }
+        let mut raw = std::ptr::null_mut();
+        // SAFETY: `raw` is a live out-parameter; the result is checked before any use.
+        let rc = unsafe { libc::posix_memalign(&mut raw, SSD_ALIGNMENT, len) };
+        let Some(ptr) = NonNull::new(raw as *mut u8).filter(|_| rc == 0) else {
+            let detail = format!("posix_memalign failed with rc={}", rc);
+            return Err(KvError::Api(ApiError::InvalidArgument { detail }));
+        };
+        unsafe { std::ptr::write_bytes(ptr.as_ptr(), 0, len) }; // SAFETY: fresh `len`-byte block
+        Ok(Self { ptr, len })
+    }
+
+    /// Copies `actual_len` bytes from `addr` into a zero-padded `aligned_len`-byte buffer, or
+    /// errors if they would not fit. Safety: `addr` must be readable for `actual_len` bytes.
+    unsafe fn copy_from_addr(addr: u64, actual_len: usize, aligned_len: usize) -> KvResult<Self> {
+        if actual_len > aligned_len {
+            let detail = format!("aligned buffer copy len {actual_len} exceeds {aligned_len}");
+            return Err(KvError::Api(ApiError::InvalidArgument { detail }));
+        }
+        let mut buf = Self::zeroed(aligned_len)?;
+        // SAFETY: source per caller contract; destination capacity was checked above.
+        unsafe { std::ptr::copy_nonoverlapping(addr as *const u8, buf.as_mut_ptr(), actual_len) };
+        Ok(buf)
+    }
+
+    fn as_ptr(&self) -> *const u8 { // read-only pointer to the first byte
+        self.ptr.as_ptr()
+    }
+
+    fn as_mut_ptr(&mut self) -> *mut u8 { // writable pointer to the first byte
+        self.ptr.as_ptr()
+    }
+
+    fn len(&self) -> usize { // allocation size in bytes
+        self.len
+    }
+}
+
+impl Drop for AlignedBuffer {
+    /// Hands the allocation back to the C allocator.
+    fn drop(&mut self) {
+        let raw = self.ptr.as_ptr().cast::<libc::c_void>();
+        unsafe { libc::free(raw) }; // SAFETY: NOTE(review) assumes `ptr` came from posix_memalign
+    }
+}
+
+/// Accepts any non-empty key; an empty key is rejected as `InvalidArgument`.
+fn validate_key(key: &str) -> KvResult<()> {
+    if !key.is_empty() {
+        return Ok(());
+    }
+    let detail = String::from("kv ssd storage key must be non-empty");
+    Err(KvError::Api(ApiError::InvalidArgument { detail }))
+}
+
+/// Picks how a whole-entry read reaches the target: `Direct` only when the request
+/// covers the full non-empty entry, the target address and the file offset are both
+/// `SSD_ALIGNMENT`-aligned, and the target can hold the entry's aligned length.
+fn choose_read_path(
+    entry: &SsdIndexEntry,
+    target_addr: u64,
+    len: u64,
+    target_len: u64,
+) -> SsdReadPath {
+    let block = SSD_ALIGNMENT as u64;
+    let whole_entry = len != 0 && entry.len == len;
+    let aligned = target_addr.is_multiple_of(block) && entry.file_offset.is_multiple_of(block);
+    // Every other combination takes the scratch path.
+    match (whole_entry, aligned, target_len >= entry.aligned_len) {
+        (true, true, true) => SsdReadPath::Direct,
+        _ => SsdReadPath::Scratch,
+    }
+}
+
+/// Chunk-level variant: `Direct` requires a non-empty read whose target address, length
+/// and file offset are all `SSD_ALIGNMENT`-aligned and which fits in `target_len`.
+fn choose_chunk_read_path(
+    target_addr: u64,
+    read_len: u64,
+    target_len: u64,
+    file_offset: u64,
+) -> SsdReadPath {
+    let block = SSD_ALIGNMENT as u64;
+    let misaligned = [target_addr, read_len, file_offset]
+        .iter()
+        .any(|value| !value.is_multiple_of(block));
+    if read_len == 0 || misaligned || target_len < read_len {
+        return SsdReadPath::Scratch;
+    }
+    SsdReadPath::Direct
+}
+
+/// Shard count: the default (or one per root if larger), capped at one shard per aligned block.
+fn choose_shard_count(max_bytes: u64, root_count: usize) -> usize {
+    let block_budget = std::cmp::max(max_bytes / SSD_ALIGNMENT as u64, 1) as usize;
+    let wanted = std::cmp::max(DEFAULT_SHARDS_PER_OWNER, root_count);
+    // Never report zero shards, whatever the inputs were.
+    std::cmp::max(std::cmp::min(wanted, block_budget), 1)
+}
+
+/// Even per-shard share of `capacity_bytes`, rounded down to `SSD_ALIGNMENT`; errors if zero.
+fn aligned_shard_capacity(capacity_bytes: u64, shard_count: usize) -> KvResult<u64> {
+    let per_shard = capacity_bytes / shard_count as u64;
+    match per_shard - per_shard % (SSD_ALIGNMENT as u64) {
+        0 => Err(KvError::Api(ApiError::InvalidArgument {
+            detail: String::from("kv ssd storage capacity is too small for shard count"),
+        })),
+        capacity => Ok(capacity),
+    }
+}
+
+/// Creates every root directory and keeps, in input order, the first root seen on each
+/// filesystem device (`metadata.dev()`); an empty input is rejected as `InvalidArgument`.
+fn deduplicate_device_roots(root_dirs: &[PathBuf]) -> KvResult<Vec<SsdDeviceRoot>> {
+    let invalid = |detail: &str| KvError::Api(ApiError::InvalidArgument { detail: detail.into() });
+    if root_dirs.is_empty() {
+        return Err(invalid("kv ssd storage root_dirs must contain at least one path"));
+    }
+    let mut known_devices = HashSet::new();
+    let mut unique_roots = Vec::new();
+    for root_dir in root_dirs {
+        fs::create_dir_all(root_dir).map_err(|err| file_error(root_dir, 0, err))?;
+        let device_id = fs::metadata(root_dir)
+            .map_err(|err| file_error(root_dir, 0, err))?
+            .dev();
+        if !known_devices.insert(device_id) {
+            continue; // another root already claimed this device
+        }
+        let root_dir = root_dir.clone();
+        unique_roots.push(SsdDeviceRoot { device_id, root_dir });
+    }
+    // Cannot trigger today (the first root always registers); kept as a guard.
+    if unique_roots.is_empty() {
+        return Err(invalid("kv ssd storage root_dirs contains no usable device"));
+    }
+    Ok(unique_roots)
+}
+
+/// Creates (truncating any existing file) one `O_DIRECT` shard file per shard id, spreading
+/// shards round-robin over `device_roots`, and sizes each file to `shard_capacity` bytes.
+fn open_cache_files(
+    device_roots: &[SsdDeviceRoot],
+    shard_count: usize,
+    shard_capacity: u64,
+) -> KvResult<Vec<OpenedSsdShard>> {
+    if device_roots.is_empty() {
+        return Err(KvError::Api(ApiError::InvalidArgument {
+            detail: "kv ssd storage root_dirs must contain at least one path".to_string(),
+        }));
+    }
+    let open_shard = |shard_id: usize| -> KvResult<OpenedSsdShard> {
+        let device_idx = shard_id % device_roots.len();
+        // Each shard file lives under `<root>/shards/`, created on demand.
+        let shards_dir = device_roots[device_idx].root_dir.join("shards");
+        fs::create_dir_all(&shards_dir).map_err(|err| file_error(&shards_dir, 0, err))?;
+        let path = shards_dir.join(format!("shard-{shard_id:06}.dat"));
+        // Truncation discards whatever an earlier run left in the file.
+        let mut options = OpenOptions::new();
+        options.create(true).truncate(true).read(true).write(true);
+        options.custom_flags(libc::O_DIRECT);
+        let file = options.open(&path).map_err(|err| file_error(&path, 0, err))?;
+        // Give the file its full logical size up front.
+        file.set_len(shard_capacity)
+            .map_err(|err| file_error(&path, 0, err))?;
+        Ok(OpenedSsdShard {
+            shard_id,
+            device_idx,
+            file,
+        })
+    };
+    // The first failing shard aborts the whole batch.
+    (0..shard_count).map(open_shard).collect()
+}
+
+/// Rounds `value` up to the next multiple of `alignment`; `InvalidArgument` on overflow.
+fn align_up_usize(value: usize, alignment: usize) -> KvResult<usize> {
+    // `alignment` must be non-zero: zero panics on the subtraction or the division below.
+    match value.checked_add(alignment - 1) {
+        Some(padded) => Ok(padded / alignment * alignment),
+        None => Err(KvError::Api(ApiError::InvalidArgument {
+            detail: format!("alignment overflow for value={}", value),
+        })),
+    }
+}
+
+/// Rounds `value` up to the next multiple of `alignment`; `InvalidArgument` on overflow.
+fn align_up_u64(value: u64, alignment: u64) -> KvResult<u64> {
+    // `alignment` must be non-zero: zero panics on the subtraction or the division below.
+    match value.checked_add(alignment - 1) {
+        Some(padded) => Ok(padded / alignment * alignment),
+        None => Err(KvError::Api(ApiError::InvalidArgument {
+            detail: format!("alignment overflow for value={}", value),
+        })),
+    }
+}
+
+pub(crate) fn align_ssd_io_len(len: u64) -> KvResult<u64> { // Rounds `len` up to a multiple of SSD_ALIGNMENT; errors on overflow.
+    align_up_u64(len, SSD_ALIGNMENT as u64)
+}
+
+fn checked_add_u64(lhs: u64, rhs: u64, label: &str) -> KvResult<u64> {
+    lhs.checked_add(rhs).ok_or_else(|| {
+        KvError::Api(ApiError::InvalidArgument {
+            detail: format!("kv ssd {label} overflow: {lhs} + {rhs}"),
+        })
+    })
+}
+
+/// Checks the file offset, then each iovec's address and length (in that order), for
+/// `SSD_ALIGNMENT` alignment, returning the first violation found.
+fn validate_direct_io(
+    iovecs: impl IntoIterator<Item = (usize, usize)>,
+    offset: u64,
+) -> io::Result<()> {
+    ensure_aligned("offset", offset as usize)?;
+    iovecs.into_iter().try_for_each(|(addr, len)| {
+        ensure_aligned("buffer address", addr).and_then(|()| ensure_aligned("iovec length", len))
+    })
+}
+
+/// Returns `InvalidInput` naming `name` unless `value` is a multiple of `SSD_ALIGNMENT`.
+/// The offending value is rendered in hex in the error message.
+fn ensure_aligned(name: &str, value: usize) -> io::Result<()> {
+    if !value.is_multiple_of(SSD_ALIGNMENT) {
+        let message = format!("O_DIRECT {name} {value:#x} is not {SSD_ALIGNMENT}-byte aligned");
+        return Err(io::Error::new(io::ErrorKind::InvalidInput, message));
+    }
+    // Zero counts as aligned.
+    Ok(())
+}
+
+/// Wraps an I/O failure for a cached entry, labelling it with a `kv-ssd://key@(a,b)` path.
+fn file_error_for_entry(key: &KvSsdKey, offset: u64, err: io::Error) -> KvError {
+    let (put_hi, put_lo) = (&key.put_id.0, &key.put_id.1);
+    let path = format!("kv-ssd://{}@({},{})", key.key, put_hi, put_lo);
+    let detail = err.to_string();
+    KvError::Api(ApiError::FileWriteError { path, offset, detail })
+}
+
+/// Wraps an I/O failure on a filesystem path (rendered lossily) as `FileWriteError`.
+fn file_error(path: &Path, offset: u64, err: io::Error) -> KvError {
+    let path = path.to_string_lossy().into_owned();
+    let detail = err.to_string();
+    // The same variant is used for every filesystem operation, not only writes.
+    KvError::Api(ApiError::FileWriteError { path, offset, detail })
+}
+
+impl From<io::Error> for KvError {
+    /// Lets `?` lift a bare `io::Error` into a `FileWriteError` under a generic SSD I/O path.
+    fn from(err: io::Error) -> Self {
+        let path = String::from("kv-ssd://io");
+        let detail = err.to_string();
+        // No offset is known at this level, so it is reported as zero.
+        KvError::Api(ApiError::FileWriteError { path, offset: 0, detail })
+    }
+}
+
+#[cfg(test)]
+mod tests { // Unit tests for the SSD store, the read-path choosers and the ring buffer.
+    use super::*;
+    use uuid::Uuid;
+
+    fn new_root() -> PathBuf { // Fresh, uniquely named directory path under <cwd>/target/fluxon_kv_ssd_tests.
+        std::env::current_dir()
+            .unwrap()
+            .join("target")
+            .join("fluxon_kv_ssd_tests")
+            .join(Uuid::new_v4().to_string())
+    }
+
+    async fn new_store(max_bytes: u64) -> KvSsdStorage { // Single-root store with the given byte budget; panics if construction fails.
+        KvSsdStorage::new(KvSsdStorageInit {
+            root_dirs: vec![new_root()],
+            max_bytes,
+        })
+        .unwrap()
+    }
+
+    fn test_key(key: &str, version: u64) -> KvSsdKey { // Key whose put_id is `(version, 0)`.
+        KvSsdKey {
+            key: key.to_string(),
+            put_id: (version, 0),
+        }
+    }
+
+    fn prepare_ready(ring: &mut SsdRingBuffer, key: &KvSsdKey) -> SsdIndexEntry { // Reserves 500 bytes for `key` and insists the ring answers `Ready`.
+        match ring.prepare_write(key.clone(), 500).unwrap() {
+            SsdPreparedWrite::Ready(entry) => entry,
+            other => panic!("expected ready SSD write, got {other:?}"),
+        }
+    }
+
+    #[::tokio::test]
+    async fn persist_and_load_roundtrip() { // A small unaligned payload survives persist followed by load_into_addr.
+        let store = new_store(1024 * 1024).await;
+        let data = b"hello from ssd";
+        let put_id = (10, 1);
+        store.persist("k", put_id, data).await.unwrap();
+
+        let mut out = vec![0u8; data.len()];
+        store
+            .load_into_addr(
+                "k",
+                put_id,
+                out.as_mut_ptr() as u64,
+                out.len() as u64,
+                out.len() as u64,
+            )
+            .await
+            .unwrap();
+        assert_eq!(out, data);
+    }
+
+    #[::tokio::test]
+    async fn aligned_load_roundtrip_uses_direct_target() { // A 4096-byte payload into an aligned target selects Direct and round-trips.
+        let store = new_store(1024 * 1024).await;
+        let data = (0..4096).map(|idx| (idx % 251) as u8).collect::<Vec<_>>();
+        let put_id = (11, 1);
+        store.persist("aligned", put_id, &data).await.unwrap();
+
+        let mut out = AlignedBuffer::zeroed(data.len()).unwrap();
+        let target_addr = out.as_mut_ptr() as u64;
+        let entry = {
+            let key = KvSsdKey {
+                key: "aligned".to_string(),
+                put_id,
+            };
+            store.inner.lock().ring.get(&key).unwrap()
+        };
+        assert_eq!(
+            choose_read_path(&entry, target_addr, data.len() as u64, data.len() as u64),
+            SsdReadPath::Direct
+        );
+
+        store
+            .load_into_addr(
+                "aligned",
+                put_id,
+                target_addr,
+                data.len() as u64,
+                data.len() as u64,
+            )
+            .await
+            .unwrap();
+
+        let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) };
+        assert_eq!(out_slice, data.as_slice());
+    }
+
+    #[::tokio::test]
+    async fn chunked_load_roundtrip_streams_ready_chunks() { // A 2500-byte payload read with chunk size 1024 reports chunks of 1024, 1024 and 452 bytes.
+        let store = new_store(1024 * 1024).await;
+        let data = (0..2500).map(|idx| (idx % 251) as u8).collect::<Vec<_>>();
+        let put_id = (13, 1);
+        store.persist("chunked", put_id, &data).await.unwrap();
+
+        let mut out =
+            AlignedBuffer::zeroed(align_ssd_io_len(data.len() as u64).unwrap() as usize).unwrap();
+        let target_addr = out.as_mut_ptr() as u64;
+        let (tx, mut rx) = ::tokio::sync::mpsc::channel(2);
+        let producer = store.load_into_addr_chunks(
+            "chunked",
+            put_id,
+            target_addr,
+            data.len() as u64,
+            out.len() as u64,
+            1024,
+            2,
+            tx,
+        );
+        let consumer = async {
+            let mut chunks = Vec::new();
+            while let Some(chunk) = rx.recv().await {
+                chunks.push((chunk.offset, chunk.len));
+            }
+            chunks
+        };
+        let (producer_res, mut chunks) = ::tokio::join!(producer, consumer);
+        producer_res.unwrap();
+        chunks.sort_unstable(); // sorted before comparing, so arrival order is not asserted
+        assert_eq!(chunks, vec![(0, 1024), (1024, 1024), (2048, 452)]);
+
+        let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) };
+        assert_eq!(out_slice, data.as_slice());
+    }
+
+    #[test]
+    fn read_path_uses_direct_for_aligned_target_with_enough_capacity() { // Pure checks of choose_read_path on hand-built index entries.
+        let aligned = SsdIndexEntry {
+            shard_id: 0,
+            begin: 0,
+            len: 4096,
+            aligned_len: 4096,
+            file_offset: 0,
+        };
+        assert_eq!(
+            choose_read_path(&aligned, 4096, 4096, 4096),
+            SsdReadPath::Direct
+        );
+        assert_eq!(
+            choose_read_path(&aligned, 4097, 4096, 4096), // misaligned target address
+            SsdReadPath::Scratch
+        );
+
+        let unaligned_len = SsdIndexEntry {
+            len: 500,
+            aligned_len: 512,
+            ..aligned
+        };
+        assert_eq!(
+            choose_read_path(&unaligned_len, 4096, 500, 512),
+            SsdReadPath::Direct
+        );
+        assert_eq!(
+            choose_read_path(&unaligned_len, 4096, 500, 500), // target smaller than aligned_len
+            SsdReadPath::Scratch
+        );
+    }
+
+    #[::tokio::test]
+    async fn unaligned_payload_loads_direct_when_stage_capacity_is_aligned() { // A 500-byte payload still goes Direct when the target holds one full aligned block.
+        let store = new_store(1024 * 1024).await;
+        let data = (0..500).map(|idx| (idx % 251) as u8).collect::<Vec<_>>();
+        let put_id = (12, 1);
+        store.persist("unaligned", put_id, &data).await.unwrap();
+
+        let mut out = AlignedBuffer::zeroed(SSD_ALIGNMENT).unwrap();
+        let target_addr = out.as_mut_ptr() as u64;
+        let entry = {
+            let key = KvSsdKey {
+                key: "unaligned".to_string(),
+                put_id,
+            };
+            store.inner.lock().ring.get(&key).unwrap()
+        };
+        assert_eq!(entry.len, data.len() as u64);
+        assert_eq!(entry.aligned_len, SSD_ALIGNMENT as u64);
+        assert_eq!(
+            choose_read_path(&entry, target_addr, data.len() as u64, SSD_ALIGNMENT as u64),
+            SsdReadPath::Direct
+        );
+
+        store
+            .load_into_addr(
+                "unaligned",
+                put_id,
+                target_addr,
+                data.len() as u64,
+                SSD_ALIGNMENT as u64,
+            )
+            .await
+            .unwrap();
+
+        let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) };
+        assert_eq!(out_slice, data.as_slice());
+    }
+
+    #[::tokio::test]
+    async fn storage_deduplicates_root_dirs_on_same_device() { // Two roots on one device collapse to the first; all four shards land under it.
+        let root_a = new_root();
+        let root_b = new_root();
+        let store = KvSsdStorage::new(KvSsdStorageInit {
+            root_dirs: vec![root_a.clone(), root_b.clone()],
+            max_bytes: 4 * SSD_ALIGNMENT as u64,
+        })
+        .unwrap();
+
+        assert_eq!(
+            fs::metadata(&root_a).unwrap().dev(), // precondition: both roots share a device
+            fs::metadata(&root_b).unwrap().dev()
+        );
+        assert_eq!(store.root_dirs(), &[root_a.clone()]);
+        assert_eq!(store.devices.len(), 1);
+        assert_eq!(store.shard_to_device, vec![0, 0, 0, 0]);
+        assert!(root_a.join("shards/shard-000000.dat").exists());
+        assert!(root_a.join("shards/shard-000001.dat").exists());
+        assert!(root_a.join("shards/shard-000002.dat").exists());
+        assert!(root_a.join("shards/shard-000003.dat").exists());
+        assert!(!root_b.join("shards").exists());
+    }
+
+    #[test]
+    fn ring_prepare_write_on_shards_uses_only_allowed_shards() { // Writes restricted to shards 1 and 3 alternate between exactly those two.
+        let mut ring = SsdRingBuffer::new(vec![1024, 1024, 1024, 1024]);
+        let mut allocated_shards = Vec::new();
+
+        for version in 0..4 {
+            let key = test_key("per-device", version);
+            let entry = match ring
+                .prepare_write_on_shards(key.clone(), 500, &[1, 3])
+                .unwrap()
+            {
+                SsdPreparedWrite::Ready(entry) => entry,
+                other => panic!("expected ready SSD write, got {other:?}"),
+            };
+            allocated_shards.push(entry.shard_id);
+            assert!(ring.commit(&key, true));
+        }
+
+        assert_eq!(allocated_shards, vec![1, 3, 1, 3]);
+    }
+
+    #[::tokio::test]
+    async fn ring_keeps_new_entry_and_expires_old() { // With a 1024-byte budget, a third 500-byte entry pushes out the oldest one.
+        let store = new_store(1024).await;
+        store.persist("old", (1, 0), &[1u8; 500]).await.unwrap();
+        store.persist("filler", (2, 0), &[2u8; 500]).await.unwrap();
+        store.persist("new", (3, 0), &[3u8; 500]).await.unwrap();
+
+        assert!(!store.has_entry("old", (1, 0)).await);
+        assert!(store.has_entry("filler", (2, 0)).await);
+        assert!(store.has_entry("new", (3, 0)).await);
+    }
+
+    #[test]
+    fn ring_read_pin_blocks_overwrite_until_unpinned() { // A pinned entry cannot be overwritten; after unpinning its slot is reused.
+        let mut ring = SsdRingBuffer::new(vec![1024]);
+        let old = test_key("old", 1);
+        let filler = test_key("filler", 2);
+        let new = test_key("new", 3);
+
+        let old_entry = prepare_ready(&mut ring, &old);
+        assert_eq!(old_entry.begin, 0);
+        assert!(ring.commit(&old, true));
+        prepare_ready(&mut ring, &filler);
+        assert!(ring.commit(&filler, true));
+
+        let pinned = ring.pin_read(&old).unwrap();
+        assert_eq!(pinned.begin, old_entry.begin);
+        assert!(matches!(
+            ring.prepare_write(new.clone(), 500).unwrap(),
+            SsdPreparedWrite::BlockedByBusyIo
+        ));
+        assert!(ring.get(&old).is_some());
+
+        ring.unpin_read(&old);
+        let new_entry = prepare_ready(&mut ring, &new);
+        assert_eq!(new_entry.file_offset, 0);
+        assert!(ring.commit(&new, true));
+        assert!(ring.get(&old).is_none());
+    }
+
+    #[test]
+    fn ring_writing_entry_blocks_overwrite_until_write_finishes() { // An entry that is prepared but not yet committed also blocks reuse of its slot.
+        let mut ring = SsdRingBuffer::new(vec![1024]);
+        let old = test_key("old", 1);
+        let filler = test_key("filler", 2);
+        let new = test_key("new", 3);
+
+        let old_entry = prepare_ready(&mut ring, &old);
+        assert_eq!(old_entry.begin, 0);
+        prepare_ready(&mut ring, &filler);
+
+        assert!(matches!(
+            ring.prepare_write(new.clone(), 500).unwrap(),
+            SsdPreparedWrite::BlockedByBusyIo
+        ));
+
+        assert!(ring.commit(&old, true));
+        let new_entry = prepare_ready(&mut ring, &new);
+        assert_eq!(new_entry.file_offset, 0);
+    }
+
+    #[test]
+    fn safe_component_replaces_path_separators() { // `/` and `:` in an instance key both become `_`.
+        assert_eq!(safe_path_component("owner/a:b"), "owner_a_b");
+    }
+}
diff --git a/fluxon_rs/fluxon_kv/src/kv_test.rs b/fluxon_rs/fluxon_kv/src/kv_test.rs
index 5f0a9e2..94d8ebe 100644
--- a/fluxon_rs/fluxon_kv/src/kv_test.rs
+++ b/fluxon_rs/fluxon_kv/src/kv_test.rs
@@ -11,9 +11,11 @@
 
 use crate::cluster_manager::ClusterManagerRdmaControlInit;
 use crate::config::{
-    ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, LargeFilePaths, MasterConfig, MonitoringConfig,
-    ProtocolConfig, ProtocolType, TestSpecConfig, TestSpecTransportMode, TransferEngineType,
+    ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, KvSsdStorageConfig, LargeFilePaths,
+    MasterConfig, MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig,
+    TestSpecTransportMode, TransferEngineType,
 };
+use crate::master_kv_router::msg_pack::GetSourceKind;
 use crate::run_master_with_test_overrides;
 use crate::{ClientRunTestOverrides, MasterRunTestOverrides, run_client_with_test_overrides};
 // external client runs via run_client when contribution is zero
@@ -38,6 +40,8 @@ const CLIENT_COMMUNICATION_VALUE: &[u8] = b"message_from_client1_to_client2";
 const TRANSFER_DATA_PROBE_VALUE_LEN: usize = 256 * 1024;
 const KV_TEST_TRANSFER_PROBE_IO_TIMEOUT_SECS: u64 = 10;
 const KV_TEST_SHUTDOWN_TIMEOUT_SECS: u64 = 60;
+const KV_TEST_SSD_STORAGE_BYTES: u64 = 64 * 1024 * 1024;
+const KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS: u64 = 30;
 
 fn kv_test_run_scope() -> &'static str {
     static RUN_SCOPE: OnceLock<String> = OnceLock::new();
@@ -610,6 +614,7 @@ struct KvTestClientOptions {
     enable_transfer_rpc_fast_path: Option<bool>,
     contribute_to_cluster_pool_size: Option<ContributeToClusterPoolSize>,
     share_mem_path: Option<String>,
+    ssd_storage: Option<KvSsdStorageConfig>,
     etcd_mode: Option<KvTestEtcdMode>,
 }
 
@@ -642,6 +647,10 @@ impl KvTestClientOptions {
                 .share_mem_path
                 .clone()
                 .or_else(|| self.share_mem_path.clone()),
+            ssd_storage: overrides
+                .ssd_storage
+                .clone()
+                .or_else(|| self.ssd_storage.clone()),
             etcd_mode: overrides
                 .etcd_mode
                 .clone()
@@ -650,6 +659,40 @@ impl KvTestClientOptions {
     }
 }
 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum KvTestStorageProfile { // Storage mix a kv_test round runs with.
+    Memory, // no ssd_storage config; requires a memory source only
+    Ssd, // ssd_storage configured; requires an SSD source only
+    MemorySsd, // ssd_storage configured; requires both memory and SSD sources
+}
+
+impl KvTestStorageProfile {
+    /// Suffix appended to the round name; empty for the memory-only profile.
+    fn round_suffix(self) -> &'static str {
+        match self {
+            Self::Memory => "",
+            Self::Ssd => "_ssd",
+            Self::MemorySsd => "_memory_ssd",
+        }
+    }
+
+    /// Owner `ssd_storage` config for this profile, or `None` when SSD stays disabled.
+    fn ssd_storage(self) -> Option<KvSsdStorageConfig> {
+        let max_bytes = KV_TEST_SSD_STORAGE_BYTES;
+        self.requires_ssd_source().then_some(KvSsdStorageConfig { max_bytes })
+    }
+
+    /// True for every profile except the SSD-only one.
+    fn requires_memory_source(self) -> bool {
+        !matches!(self, Self::Ssd)
+    }
+
+    /// True for every profile except the memory-only one.
+    fn requires_ssd_source(self) -> bool {
+        !matches!(self, Self::Memory)
+    }
+}
+
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 enum KvTestRoundProfile {
     P2pOnly,
@@ -760,6 +803,7 @@ fn kv_test_round_test_spec_config(round_profile: KvTestRoundProfile) -> TestSpec
 #[derive(Clone, Debug)]
 struct KvTestRoundOptions {
     round_profile: KvTestRoundProfile,
+    storage_profile: KvTestStorageProfile,
     round_name: String,
     cluster_name: String,
     master_port: Option<u16>,
@@ -803,6 +847,9 @@ impl KvTestRoundOptions {
         )
     }
 
+    fn owner_sub_cluster(&self) -> String {
+        [self.round_name.as_str(), "_owners"].concat() // "<round_name>_owners" sub-cluster label
+    }
 }
 
 #[derive(Clone, Debug)]
@@ -842,8 +889,7 @@ fn default_client_large_file_paths(
     instance_key: &str,
     contribute_to_cluster_pool_size: &ContributeToClusterPoolSize,
 ) -> LargeFilePaths {
-    if contribute_to_cluster_pool_size.dram == 0
-        && contribute_to_cluster_pool_size.vram.is_empty()
+    if contribute_to_cluster_pool_size.dram == 0 && contribute_to_cluster_pool_size.vram.is_empty()
     {
         return LargeFilePaths { paths: Vec::new() };
     }
@@ -852,7 +898,10 @@ fn default_client_large_file_paths(
     }
 }
 
-fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTestClientOptions {
+fn default_owner_test_client_options(
+    round_profile: KvTestRoundProfile,
+    storage_profile: KvTestStorageProfile,
+) -> KvTestClientOptions {
     KvTestClientOptions {
         protocol_config: Some(round_profile.protocol_config()),
         transfer_engine: Some(round_profile.owner_transfer_engine()),
@@ -861,6 +910,7 @@ fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTes
         enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()),
         contribute_to_cluster_pool_size: Some(default_owner_contribute_to_cluster_pool_size()),
         share_mem_path: None,
+        ssd_storage: storage_profile.ssd_storage(),
         etcd_mode: Some(KvTestEtcdMode::Enabled),
     }
 }
@@ -874,6 +924,7 @@ fn default_master_test_client_options(round_profile: KvTestRoundProfile) -> KvTe
         enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()),
         contribute_to_cluster_pool_size: None,
         share_mem_path: None,
+        ssd_storage: None,
         etcd_mode: None,
     }
 }
@@ -887,22 +938,31 @@ fn default_external_test_client_options() -> KvTestClientOptions {
         enable_transfer_rpc_fast_path: Some(false),
         contribute_to_cluster_pool_size: Some(default_external_contribute_to_cluster_pool_size()),
         share_mem_path: None,
+        ssd_storage: None,
         etcd_mode: Some(KvTestEtcdMode::Disabled),
     }
 }
 
-fn new_kv_test_round(round_profile: KvTestRoundProfile) -> KvTestRoundOptions {
-    let round_name = round_profile.round_name();
+fn new_kv_test_round(
+    round_profile: KvTestRoundProfile,
+    storage_profile: KvTestStorageProfile,
+) -> KvTestRoundOptions {
+    let round_name = format!(
+        "{}{}",
+        round_profile.round_name(),
+        storage_profile.round_suffix()
+    );
     KvTestRoundOptions {
         round_profile,
-        round_name: round_name.to_string(),
+        storage_profile,
+        round_name: round_name.clone(),
         // Keep each process run on its own cluster namespace so a crashed/aborted previous run
         // cannot poison the next rerun with stale members.
         cluster_name: format!("test_cluster_{}_{}", round_name, kv_test_run_scope()),
         master_port: None,
         step8_master_port: None,
         master_options: default_master_test_client_options(round_profile),
-        owner_client_options: default_owner_test_client_options(round_profile),
+        owner_client_options: default_owner_test_client_options(round_profile, storage_profile),
         external_client_options: default_external_test_client_options(),
     }
 }
@@ -919,15 +979,35 @@ fn default_kv_test_run_options() -> KvTestRunOptions {
             .filter(|item| !item.is_empty())
         {
             let profile = match round_name {
-                "p2p_only" => KvTestRoundProfile::P2pOnly,
+                "p2p_only" => {
+                    rounds.push(new_kv_test_round(
+                        KvTestRoundProfile::P2pOnly,
+                        KvTestStorageProfile::Memory,
+                    ));
+                    continue;
+                }
+                "p2p_only_ssd" => {
+                    rounds.push(new_kv_test_round(
+                        KvTestRoundProfile::P2pOnly,
+                        KvTestStorageProfile::Ssd,
+                    ));
+                    continue;
+                }
+                "p2p_only_memory_ssd" => {
+                    rounds.push(new_kv_test_round(
+                        KvTestRoundProfile::P2pOnly,
+                        KvTestStorageProfile::MemorySsd,
+                    ));
+                    continue;
+                }
                 "rdma_transfer_only" => KvTestRoundProfile::RdmaTransferOnly,
                 "rdma_transfer_with_rpc" => KvTestRoundProfile::RdmaTransferWithRpc,
                 other => panic!(
-                    "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, rdma_transfer_only, rdma_transfer_with_rpc",
+                    "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, p2p_only_ssd, p2p_only_memory_ssd, rdma_transfer_only, rdma_transfer_with_rpc",
                     other
                 ),
             };
-            rounds.push(new_kv_test_round(profile));
+            rounds.push(new_kv_test_round(profile, KvTestStorageProfile::Memory));
         }
         if rounds.is_empty() {
             panic!("FLUXON_KV_TEST_ROUNDS was set but produced no valid rounds");
@@ -937,9 +1017,17 @@ fn default_kv_test_run_options() -> KvTestRunOptions {
 
     KvTestRunOptions {
         rounds: vec![
-            new_kv_test_round(KvTestRoundProfile::P2pOnly),
-            new_kv_test_round(KvTestRoundProfile::RdmaTransferOnly),
-            new_kv_test_round(KvTestRoundProfile::RdmaTransferWithRpc),
+            new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Memory),
+            new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Ssd),
+            new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::MemorySsd),
+            new_kv_test_round(
+                KvTestRoundProfile::RdmaTransferOnly,
+                KvTestStorageProfile::Memory,
+            ),
+            new_kv_test_round(
+                KvTestRoundProfile::RdmaTransferWithRpc,
+                KvTestStorageProfile::Memory,
+            ),
         ],
     }
 }
@@ -1022,6 +1110,8 @@ fn build_client_launch(
     let contribute_to_cluster_pool_size = options
         .contribute_to_cluster_pool_size
         .unwrap_or(default_owner_contribute_to_cluster_pool_size());
+    let is_external = contribute_to_cluster_pool_size.dram == 0
+        && contribute_to_cluster_pool_size.vram.is_empty();
     let share_mem_path = options
         .share_mem_path
         .unwrap_or_else(|| format!("/tmp/kvcache_shared_memory/{}", instance_key));
@@ -1043,7 +1133,11 @@ fn build_client_launch(
             enable_transfer_rpc_fast_path: options
                 .enable_transfer_rpc_fast_path
                 .expect("kv_test requires enable_transfer_rpc_fast_path to be set explicitly"),
-            sub_cluster: None,
+            sub_cluster: if is_external {
+                None
+            } else {
+                Some(round.owner_sub_cluster())
+            },
         },
         // English note:
         // kv_test uses a per-instance shared memory path by default so each owner/external share
@@ -1054,6 +1148,7 @@ fn build_client_launch(
             &instance_key,
             &contribute_to_cluster_pool_size,
         ),
+        ssd_storage: options.ssd_storage,
         // Mirror round intent into the generated config so logs and runtime behavior
         // agree on whether this launch is transfer_only vs transfer_with_rpc.
         test_spec_config: kv_test_round_test_spec_config(round.round_profile),
@@ -1381,7 +1476,10 @@ async fn key_meta_cache_check(
         }
     }
 
-    tracing::info!("🔍 Starting PUT and GET in parallel: {}", parallel_unique_key);
+    tracing::info!(
+        "🔍 Starting PUT and GET in parallel: {}",
+        parallel_unique_key
+    );
     for i in 0..10 {
         let (put_client, other_client) = if i % 2 == 0 {
             (client, client2)
@@ -1420,7 +1518,9 @@ async fn key_meta_cache_check(
         }
 
         assert!(
-            put_client.client_kv_api().has_cached_key(parallel_unique_key),
+            put_client
+                .client_kv_api()
+                .has_cached_key(parallel_unique_key),
             "put client should have immediate local cache metadata for key {} after put time {}",
             parallel_unique_key,
             i
@@ -1577,6 +1677,208 @@ async fn shutdown_framework_with_timeout(label: &str, framework: &crate::Framewo
     }
 }
 
+fn build_storage_profile_probe_value(tag: &str) -> Vec<u8> { // Probe payload for `tag` at the default length.
+    const STORAGE_PROFILE_PROBE_VALUE_LEN: usize = 64 * 1024; // Default payload length: 64 KiB.
+    build_storage_profile_probe_value_with_len(tag, STORAGE_PROFILE_PROBE_VALUE_LEN)
+}
+
+fn build_storage_profile_probe_value_with_len(tag: &str, len: usize) -> Vec<u8> {
+    // Deterministic payload: the tagged marker `kv_test_storage_profile:<tag>:` repeated
+    // end to end and cut off at exactly `len` bytes (an empty Vec when `len` is 0).
+    let marker = format!("kv_test_storage_profile:{tag}:");
+    let mut payload = Vec::with_capacity(len);
+    // The marker always contains the literal prefix, so cycling it never yields an empty stream.
+    payload.extend(marker.bytes().cycle().take(len));
+    payload
+}
+
+async fn force_evict_memory_replicas_for_storage_probe( // Test helper: wait for an SSD replica of `key`, then evict its memory replicas.
+    master_framework: &crate::Framework,
+    key: &str,
+) {
+    let master_view = master_framework.master_kv_router_view();
+    let deadline =
+        Instant::now() + Duration::from_secs(KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS);
+    let (put_id, memory_replica_nodes) = loop { // Poll the master route every 50 ms until it lists an SSD replica.
+        if let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) {
+            let put_id = route.put_id;
+            let memory_replica_nodes = route
+                .nodes_replicas
+                .read()
+                .keys()
+                .cloned()
+                .collect::<Vec<_>>();
+            let ssd_replica_count = route.ssd_replicas.read().len();
+            if ssd_replica_count > 0 { // SSD replica present: keep this snapshot of put_id and replica nodes.
+                break (put_id, memory_replica_nodes);
+            }
+        }
+
+        if Instant::now() >= deadline { // Panics when no SSD replica shows up before the deadline.
+            panic!(
+                "storage profile probe expected at least one SSD replica before memory eviction: key={} timeout={}s",
+                key, KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS
+            );
+        }
+        sleep(Duration::from_millis(50)).await;
+    };
+
+    for node_id in memory_replica_nodes { // Evict every replica node from the snapshot; an error code panics.
+        crate::master_kv_router::delete::evict_one_kv_replica_for_node(
+            &master_view,
+            key.to_string(),
+            node_id.clone(),
+            put_id,
+        )
+        .unwrap_or_else(|code| {
+            panic!(
+                "storage profile probe failed to evict memory replica: key={} node={} put_id=({},{}) code={}",
+                key, node_id, put_id.0, put_id.1, code
+            )
+        });
+    }
+
+    let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) else { // Re-read the route to verify the post-eviction state.
+        panic!("storage profile probe route disappeared after memory replicas eviction: key={key}");
+    };
+    assert!( // Post-condition 1: no entries remain in `nodes_replicas`.
+        route.nodes_replicas.read().is_empty(),
+        "storage profile probe memory replicas still exist after eviction: key={}",
+        key
+    );
+    assert!( // Post-condition 2: `ssd_replicas` is still non-empty.
+        !route.ssd_replicas.read().is_empty(),
+        "storage profile probe SSD replica disappeared after memory replicas eviction: key={}",
+        key
+    );
+}
+
+async fn assert_owner_get_source_kind( // Test helper: get `key` via the reader and assert its bytes and reported source kind.
+    reader_framework: &crate::Framework,
+    key: &str,
+    expected_value: &[u8],
+    expected_source_kind: GetSourceKind,
+) {
+    let reader_view = reader_framework.client_kv_api_view().clone();
+    let reader_api = reader_view.client_kv_api();
+    let (mem_holder, get_info) = reader_api // Panics if the get returns an error or None.
+        .inner()
+        .get(key)
+        .await
+        .unwrap_or_else(|err| {
+            panic!(
+                "storage profile probe get failed: key={} expected_source={:?} err={}",
+                key, expected_source_kind, err
+            )
+        })
+        .unwrap_or_else(|| {
+            panic!(
+                "storage profile probe get returned None: key={} expected_source={:?}",
+                key, expected_source_kind
+            )
+        });
+    assert_eq!( // The returned bytes must equal the value written by the probe.
+        mem_holder.bytes(),
+        expected_value,
+        "storage profile probe value mismatch for key={key}"
+    );
+    let Some(get_info) = get_info else { // Get info is required here; without it the source kind cannot be checked.
+        panic!(
+            "storage profile probe expected remote get info for key={} source={:?}",
+            key, expected_source_kind
+        );
+    };
+    assert_eq!( // The source kind reported by the get must match the expectation.
+        get_info.source_kind(),
+        expected_source_kind,
+        "storage profile probe source kind mismatch for key={key}"
+    );
+}
+
+async fn run_non_rdma_storage_profile_coverage( // Storage-profile coverage for a round; returns the reader framework it starts, if any.
+    round: &KvTestRoundOptions,
+    master_framework: &crate::Framework,
+    writer_framework: &crate::Framework,
+) -> Option<Arc<crate::Framework>> {
+    if round.round_profile != KvTestRoundProfile::P2pOnly { // Only P2pOnly rounds run this coverage.
+        return None;
+    }
+
+    info!(
+        "📋 Storage profile coverage: round={} storage={:?}",
+        round.round_name, round.storage_profile
+    );
+
+    let writer_view = writer_framework.client_kv_api_view().clone();
+    let writer_api = writer_view.client_kv_api();
+    let storage_probe_put_opts = || { // Builds fresh put options that prefer the round's owner sub-cluster.
+        crate::client_kv_api::PutOptionalArgs(vec![
+            crate::client_kv_api::PutOptionalArg::PreferredSubCluster(round.owner_sub_cluster()),
+        ])
+    };
+
+    let memory_key = format!("storage_profile_memory_key_{}", round.round_name);
+    let memory_value = build_storage_profile_probe_value(&format!("{}:memory", round.round_name));
+    if round.storage_profile.requires_memory_source() { // Memory probe: put only, no eviction afterwards.
+        writer_api
+            .inner()
+            .put(&memory_key, &memory_value, storage_probe_put_opts())
+            .await
+            .unwrap_or_else(|err| {
+                panic!(
+                    "storage profile memory probe put failed: key={} err={}",
+                    memory_key, err
+                )
+            });
+    }
+
+    let ssd_key = format!("storage_profile_ssd_key_{}", round.round_name);
+    let ssd_value = build_storage_profile_probe_value_with_len(
+        &format!("{}:ssd", round.round_name),
+        64 * 1024 + 123, // NOTE(review): odd length presumably exercises non-aligned SSD I/O; confirm.
+    );
+    if round.storage_profile.requires_ssd_source() { // SSD probe: put, then evict the memory replicas of the key.
+        writer_api
+            .inner()
+            .put(&ssd_key, &ssd_value, storage_probe_put_opts())
+            .await
+            .unwrap_or_else(|err| {
+                panic!(
+                    "storage profile SSD probe put failed: key={} err={}",
+                    ssd_key, err
+                )
+            });
+        force_evict_memory_replicas_for_storage_probe(master_framework, &ssd_key).await;
+    }
+
+    let reader_launch = new_client_launch(round, "test_storage_profile_reader", None); // Separate client that issues the gets.
+    let (reader_framework, _) = run_kv_test_client(reader_launch)
+        .await
+        .expect("Failed to start storage profile reader");
+
+    sleep(Duration::from_secs(10)).await; // NOTE(review): fixed wait, presumably for the reader to become ready; confirm.
+
+    if round.storage_profile.requires_memory_source() { // Memory probe must be reported as a Memory-sourced get.
+        assert_owner_get_source_kind(
+            &reader_framework,
+            &memory_key,
+            &memory_value,
+            GetSourceKind::Memory,
+        )
+        .await;
+    }
+    if round.storage_profile.requires_ssd_source() { // SSD probe must be reported as an Ssd-sourced get.
+        assert_owner_get_source_kind(&reader_framework, &ssd_key, &ssd_value, GetSourceKind::Ssd)
+            .await;
+    }
+
+    info!(
+        "✅ Storage profile coverage passed: round={} storage={:?}",
+        round.round_name, round.storage_profile
+    );
+    Some(reader_framework) // Handed back to the caller instead of being shut down here.
+}
+
 async fn run_kv_step8(round: &KvTestRoundOptions) {
     info!("📋 Step 8: Verifying external client blocking and recovery behavior");
 
@@ -2720,6 +3022,9 @@ async fn run_kv_round(round: &KvTestRoundOptions) {
         info!("✅ Key meta cache testing completed");
     }
 
+    let storage_profile_reader_framework =
+        run_non_rdma_storage_profile_coverage(round, &master_framework, &client1_framework).await;
+
     // 清理旧资源
     {
         info!("🧹 Cleaning up resources");
@@ -2743,6 +3048,14 @@ async fn run_kv_round(round: &KvTestRoundOptions) {
             .unwrap_or_else(|e| panic!("Client 1 framework shutdown failed: {}", e));
         info!("✅ Client 1 framework shutdown successfully");
 
+        if let Some(storage_profile_reader_framework) = storage_profile_reader_framework {
+            shutdown_framework_with_timeout(
+                "storage profile reader",
+                &storage_profile_reader_framework,
+            )
+            .await;
+        }
+
         master_framework
             .shutdown()
             .await
diff --git a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs
index c74b64a..43d3c09 100644
--- a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs
+++ b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs
@@ -148,6 +148,7 @@ fn new_client_config_with_cluster_and_dram(
         large_file_paths: crate::config::LargeFilePaths {
             paths: vec![format!("{}/large/{}", base, instance_key)],
         },
+        ssd_storage: None,
         test_spec_config: TestSpecConfig::default(),
     };
     println!("fluxonkv core created client config for test: {:?}", conf);
diff --git a/fluxon_rs/fluxon_kv/src/lib.rs b/fluxon_rs/fluxon_kv/src/lib.rs
index edaa386..3b1116d 100644
--- a/fluxon_rs/fluxon_kv/src/lib.rs
+++ b/fluxon_rs/fluxon_kv/src/lib.rs
@@ -7,6 +7,7 @@ pub mod external_client_api;
 pub mod panel_proxy;
 // #[cfg(test)]
 pub mod key_prefix;
+pub mod kv_ssd_storage;
 #[cfg(feature = "test_bins")]
 pub mod kv_test;
 pub mod kvlease;
@@ -797,6 +798,7 @@ fn build_side_transfer_worker_config(
         },
         share_mem_path: owner_config.share_mem_path.clone(),
         large_file_paths: owner_config.large_file_paths.clone(),
+        ssd_storage: None,
         test_spec_config,
     })
 }
@@ -841,6 +843,7 @@ fn build_side_transfer_worker_config_yaml(
             cluster_name: side_config.cluster_name,
             share_mem_path: side_config.share_mem_path,
             large_file_paths: None,
+            ssd_storage: None,
             p2p_listen_port: side_config.fluxonkv_spec.p2p_listen_port,
             redis_compat: None,
             sub_cluster: None,
@@ -1915,6 +1918,9 @@ async fn run_client_impl(
     if is_side_transfer_worker {
         metadata.insert("side_transfer_worker".to_string(), "true".to_string());
     }
+    if !is_external && !is_side_transfer_worker && config.ssd_storage.is_some() {
+        metadata.insert("kv_ssd_storage".to_string(), "true".to_string());
+    }
 
     // Local IPC routing requires both share-group owner id and the local IPC root.
     // The owner id is also published via a dedicated share-group key; we denormalize it into
@@ -2004,6 +2010,20 @@ async fn run_client_impl(
             .await
             .map_err(|e| anyhow::anyhow!("Failed to initialize framework: {:#}", e))?;
     } else {
+        let ssd_storage = if is_side_transfer_worker {
+            None
+        } else if let Some(ssd_cfg) = config.ssd_storage.as_ref() {
+            let root_dirs = config
+                .large_file_paths
+                .kv_ssd_storage_dirs(&config.cluster_name, &config.instance_key)
+                .map_err(|err| anyhow::anyhow!("invalid kv ssd storage dirs: {}", err))?;
+            Some(crate::kv_ssd_storage::KvSsdStorageInit {
+                root_dirs,
+                max_bytes: ssd_cfg.max_bytes,
+            })
+        } else {
+            None
+        };
         let init_args = InitArgsOwner {
             cluster_manager_arg: ClusterManagerNewArg {
                 etcd_endpoints: config.fluxonkv_spec.etcd_addresses.clone(),
@@ -2036,6 +2056,7 @@ async fn run_client_impl(
             },
             client_kv_api_arg: ClientKvApiNewArg {
                 test_spec_config: config.test_spec_config.clone(),
+                ssd_storage,
             },
             client_seg_pool_arg: ClientSegPoolNewArg {
                 contribute_size: config.contribute_to_cluster_pool_size.clone(),
@@ -2468,6 +2489,7 @@ mod tests {
             large_file_paths: crate::config::LargeFilePaths {
                 paths: vec!["/tmp/fluxon_side_transfer_test_large".to_string()],
             },
+            ssd_storage: None,
             test_spec_config: TestSpecConfig {
                 enable_side_transfer: true,
                 side_transfer_worker_count: 4,
@@ -2736,8 +2758,8 @@ mod tests {
             large_file_paths: crate::config::LargeFilePaths {
                 paths: vec![owner_large_root.to_string_lossy().into_owned()],
             },
-            protocol_version:
-                fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(),
+            protocol_version: fluxon_util::git_version_build_record::get_current_git_commitid()
+                .unwrap(),
             write_ts: Some(chrono::Utc::now().timestamp_micros()),
         };
         let shared_meta_json = serde_json::to_string(&shared_meta).unwrap();
@@ -2773,6 +2795,7 @@ mod tests {
             },
             share_mem_path: share_mem_root.to_string_lossy().into_owned(),
             large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() },
+            ssd_storage: None,
             test_spec_config: TestSpecConfig::default(),
         };
 
diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs
index 12a55ee..52ac76e 100755
--- a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs
+++ b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs
@@ -130,7 +130,7 @@ pub fn evict_one_kv_replica_for_node(
         return Ok(());
     }
 
-    let last_replica_gone = route.nodes_replicas.read().is_empty();
+    let last_replica_gone = !route.has_live_replica();
     if last_replica_gone {
         let removed = view
             .master_kv_router()
diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs
index 8c17155..346df40 100755
--- a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs
+++ b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs
@@ -2,9 +2,10 @@ use super::{
     InflightGetInfo, KvRouteInfo, MasterKvRouterView, NodeValueReplicaDesc, OwnerHoldingGetInfo,
     msg_pack::{
         GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq,
-        GetRevokeResp, GetStartReq, GetStartResp,
+        GetRevokeResp, GetSourceKind, GetStartReq, GetStartResp,
     },
 };
+use crate::kv_ssd_storage::{SSD_ALIGNMENT, align_ssd_io_len};
 use crate::master_kv_router::OneKvNodesRoutes;
 use crate::master_kv_router::put::PutIDForAKey;
 use crate::memholder::MemholderManagerTrait;
@@ -82,7 +83,7 @@ pub async fn handle_get_start(
             let mut remove_in_kv_routes = false;
             if let Some(one_kv_nodes_routes) = view.master_kv_router().inner().kv_routes.get(key) {
                 one_kv_nodes_routes.clean_up_tomb_nodes_replicas(put_id, tombs, view);
-                if one_kv_nodes_routes.nodes_replicas.read().is_empty() {
+                if !one_kv_nodes_routes.has_live_replica() {
                     remove_in_kv_routes = true;
                 }
             }
@@ -113,6 +114,67 @@ pub async fn handle_get_start(
             },
         )
     }
+    fn allocate_get_buffer_on_node( // Allocates `len` bytes for a get from one randomly chosen allocator of `node_id`.
+        view: &MasterKvRouterView,
+        node_id: &NodeID,
+        len: u64,
+        get_id: u64,
+        purpose: &str, // Label used only in the log messages below.
+    ) -> Result<Arc<Allocation>, msg_and_error::KvError> {
+        let node_allocators = view.master_seg_manager().get_node_allocators(node_id);
+        if node_allocators.is_empty() { // No allocators for this node: return Unreachable(OwnerNoSeg).
+            tracing::info!(
+                "No allocators found for {} during get: {}, node is not ready",
+                purpose,
+                node_id
+            );
+            return Err(msg_and_error::KvError::Unreachable(
+                msg_and_error::UnreachableError::OwnerNoSeg { detail: "config=0 initializes as external; non-zero initializes as owner; the owner must have memory space (segment)".to_string() }
+            ));
+        }
+
+        let allocator = node_allocators.choose(&mut rand::thread_rng()).unwrap(); // unwrap cannot fail: emptiness was checked above.
+        let mut allocated_addr: Option<Allocation> = None;
+        for attempt in 1..=3 { // Up to three attempts, all against the same allocator.
+            if let Ok(allocation) = allocator.allocate(len) {
+                allocated_addr = Some(allocation);
+                break;
+            } else {
+                tracing::info!(
+                    "{} allocation attempt {}/3 failed for get_id {} on node {}",
+                    purpose,
+                    attempt,
+                    get_id,
+                    node_id
+                );
+            }
+        }
+        if let Some(allocation) = allocated_addr {
+            return Ok(Arc::new(allocation));
+        }
+
+        let total = allocator.total_size_bytes(); // All attempts failed: build a NoSpace error from the allocator's figures.
+        let used = allocator.used_size_bytes();
+        let free = total.saturating_sub(used); // Saturating: never underflows if `used` exceeds `total`.
+        Err(msg_and_error::KvError::Api(
+            msg_and_error::ApiError::NoSpace {
+                node: node_id.as_ref().to_string(),
+                segment: allocator.seg_device_id.clone(),
+                total_capacity: total,
+                free_capacity: free,
+            },
+        ))
+    }
+    fn align_ssd_stage_addr(raw_addr: u64) -> Result<u64, msg_and_error::KvError> {
+        // Round `raw_addr` up to the next SSD_ALIGNMENT boundary; overflow maps to InvalidArgument.
+        let align = SSD_ALIGNMENT as u64;
+        match raw_addr.checked_add(align - 1) {
+            Some(bumped) => Ok(bumped / align * align),
+            None => Err(msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument {
+                detail: format!("ssd source staging address alignment overflow: {raw_addr}"),
+            })),
+        }
+    }
 
     tracing::debug!("Handling GetStartReq: {:?}", req.serialize_part);
 
@@ -253,11 +315,13 @@ pub async fn handle_get_start(
             put_id: one_kv_nodes_routes.put_id,
             get_id,
             node_id: resp_node_id.clone().into(),
+            source_kind: GetSourceKind::Memory,
             src_addr: resp_src_addr,
             target_addr: resp_target_addr,
             src_base_addr: resp_src_base,
             target_base_addr: resp_target_base,
             len: src_allocation.size(),
+            ssd_stage_len: 0,
             error_code: msg_and_error::OK,
             error_json: String::new(),
             server_process_us: 0,
@@ -270,8 +334,10 @@ pub async fn handle_get_start(
             req_node_id,
             len: src_allocation.size(),
             allocation: target_allocation, // 存储target allocation
+            source_allocation: None,
             route: one_kv_nodes_routes.clone(),
             allocation_mode,
+            source_kind: GetSourceKind::Memory,
         };
 
         view.master_kv_router()
@@ -308,6 +374,167 @@ pub async fn handle_get_start(
             },
         );
     }
+
+    let ssd_replicas = one_kv_nodes_routes.ssd_replicas.read().clone();
+    let mut ssd_replica_keys = ssd_replicas.keys().collect::<Vec<_>>();
+    while !ssd_replica_keys.is_empty() {
+        let to_remove_idx = rand::thread_rng().gen_range(0..ssd_replica_keys.len());
+        let selected_ssd_key = ssd_replica_keys.remove(to_remove_idx);
+        let ssd_replica = ssd_replicas
+            .get(&*selected_ssd_key)
+            .expect("selected SSD replica key must exist");
+        if ssd_replica.tomb_tag.is_tomb() {
+            tombs.insert(selected_ssd_key.to_owned());
+        } else {
+            let ssd_stage_len = match align_ssd_io_len(ssd_replica.len) {
+                Ok(len) => len,
+                Err(err) => {
+                    return failed_resp_err(
+                        err,
+                        Some((tombs, one_kv_nodes_routes.put_id)),
+                        &view,
+                        &req.serialize_part.key,
+                    );
+                }
+            };
+            let source_alloc_len = match ssd_stage_len.checked_add(SSD_ALIGNMENT as u64 - 1) {
+                Some(len) => len,
+                None => {
+                    let err =
+                        msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument {
+                            detail: format!(
+                                "ssd source staging allocation length overflow: {ssd_stage_len}"
+                            ),
+                        });
+                    return failed_resp_err(
+                        err,
+                        Some((tombs, one_kv_nodes_routes.put_id)),
+                        &view,
+                        &req.serialize_part.key,
+                    );
+                }
+            };
+            let source_allocation = match allocate_get_buffer_on_node(
+                &view,
+                &ssd_replica.node_id,
+                source_alloc_len,
+                get_id,
+                "ssd source staging",
+            ) {
+                Ok(allocation) => allocation,
+                Err(err) => {
+                    tracing::info!(
+                        "Skipping SSD source for get_id {} on node {}: {}",
+                        get_id,
+                        ssd_replica.node_id,
+                        err
+                    );
+                    continue;
+                }
+            };
+            let target_allocation = match allocate_get_buffer_on_node(
+                &view,
+                &req_node_id,
+                ssd_replica.len,
+                get_id,
+                "requesting target",
+            ) {
+                Ok(allocation) => allocation,
+                Err(err) => {
+                    return failed_resp_err(
+                        err,
+                        Some((tombs, one_kv_nodes_routes.put_id)),
+                        &view,
+                        &req.serialize_part.key,
+                    );
+                }
+            };
+            let allocation_mode = if one_kv_nodes_routes.try_reserve_get_durable_slot() {
+                GetAllocationMode::DurableReplica
+            } else {
+                GetAllocationMode::Temporary
+            };
+            let source_base = source_allocation.base_addr();
+            let source_raw_addr = match source_base.checked_add(source_allocation.addr()) {
+                Some(addr) => addr,
+                None => {
+                    let err =
+                        msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument {
+                            detail: format!(
+                                "ssd source staging raw address overflow: base={} offset={}",
+                                source_base,
+                                source_allocation.addr()
+                            ),
+                        });
+                    return failed_resp_err(
+                        err,
+                        Some((tombs, one_kv_nodes_routes.put_id)),
+                        &view,
+                        &req.serialize_part.key,
+                    );
+                }
+            };
+            let source_addr = match align_ssd_stage_addr(source_raw_addr) {
+                Ok(addr) => addr,
+                Err(err) => {
+                    return failed_resp_err(
+                        err,
+                        Some((tombs, one_kv_nodes_routes.put_id)),
+                        &view,
+                        &req.serialize_part.key,
+                    );
+                }
+            };
+            let target_base = target_allocation.base_addr();
+            let target_addr = target_base + target_allocation.addr();
+            let resp = GetStartResp {
+                put_id: one_kv_nodes_routes.put_id,
+                get_id,
+                node_id: ssd_replica.node_id.clone().into(),
+                source_kind: GetSourceKind::Ssd,
+                src_addr: source_addr,
+                target_addr,
+                src_base_addr: source_base,
+                target_base_addr: target_base,
+                len: ssd_replica.len,
+                ssd_stage_len,
+                error_code: msg_and_error::OK,
+                error_json: String::new(),
+                server_process_us: 0,
+            };
+            let info = InflightGetInfo {
+                put_id: one_kv_nodes_routes.put_id,
+                src_node_id: ssd_replica.node_id.clone(),
+                key: req.serialize_part.key.clone(),
+                req_node_id,
+                len: ssd_replica.len,
+                allocation: target_allocation,
+                source_allocation: Some(source_allocation),
+                route: one_kv_nodes_routes.clone(),
+                allocation_mode,
+                source_kind: GetSourceKind::Ssd,
+            };
+
+            view.master_kv_router()
+                .inner()
+                .inflight_gets
+                .insert(get_id, info)
+                .await;
+
+            clean_up_tombs(
+                &view,
+                Some((tombs, one_kv_nodes_routes.put_id)),
+                &req.serialize_part.key,
+            );
+            return (
+                get_id,
+                MsgPack {
+                    serialize_part: resp,
+                    raw_bytes: Vec::new(),
+                },
+            );
+        }
+    }
     tracing::info!("Key not found: {}", req.serialize_part.key);
     {
         let err = msg_and_error::KvError::Api(msg_and_error::ApiError::KeyNotFound {
@@ -322,6 +549,64 @@ pub async fn handle_get_start(
     }
 }
 
+fn drop_failed_ssd_source(view: &MasterKvRouterView, inflight_info: &InflightGetInfo) { // Removes the SSD replica behind a failed get, and the route if nothing live remains.
+    if inflight_info.source_kind != GetSourceKind::Ssd { // Only SSD-sourced gets may drop an SSD replica; others are logged and ignored.
+        tracing::warn!(
+            "Ignoring drop_ssd_source for non-SSD get: get_key={} put_id=({},{}) source_kind={:?}",
+            inflight_info.key,
+            inflight_info.put_id.0,
+            inflight_info.put_id.1,
+            inflight_info.source_kind
+        );
+        return;
+    }
+
+    let route = inflight_info.route.clone();
+    if route.put_id != inflight_info.put_id { // put_id mismatch between route and get: leave the route untouched.
+        return;
+    }
+
+    let removed = route
+        .ssd_replicas
+        .write()
+        .remove(&inflight_info.src_node_id)
+        .is_some();
+    if !removed { // No SSD replica was recorded for the source node: nothing else to do.
+        return;
+    }
+
+    tracing::warn!(
+        "Removed failed SSD replica: key={} node={} put_id=({},{})",
+        inflight_info.key,
+        inflight_info.src_node_id,
+        inflight_info.put_id.0,
+        inflight_info.put_id.1
+    );
+
+    if route.has_live_replica() { // The route still reports a live replica: keep it.
+        return;
+    }
+
+    let route_for_compare = route.clone();
+    let removed_route = view
+        .master_kv_router()
+        .inner()
+        .kv_routes
+        .remove_if(&inflight_info.key, |_, current| { // Remove only if the map still holds this same route Arc with the same put_id.
+            Arc::ptr_eq(current, &route_for_compare) && current.put_id == inflight_info.put_id
+        })
+        .is_some();
+    if removed_route && view.master_kv_router().prefix_index_enabled() { // Prefix index entry is removed in a spawned task because its write lock is awaited.
+        let view_task = view.clone();
+        let key_for_prefix = inflight_info.key.clone();
+        let _ = view.spawn("ssd_failure_remove_prefix_index", async move {
+            let inner = view_task.master_kv_router().inner();
+            let mut tree = inner.prefix_index.write().await;
+            tree.remove(&key_for_prefix);
+        });
+    }
+}
+
 pub async fn handle_get_revoke(
     view: MasterKvRouterView,
     req: MsgPack<GetRevokeReq>,
@@ -338,6 +623,9 @@ pub async fn handle_get_revoke(
         .remove(&get_id)
         .await
     {
+        if req.serialize_part.drop_ssd_source {
+            drop_failed_ssd_source(&view, &inflight_info);
+        }
         inflight_info.release_durable_slot_if_needed();
         tracing::info!("Revoked get operation with get_id: {}", get_id);
     } else {
@@ -381,7 +669,6 @@ pub async fn handle_get_done(
             .next_holder_id
             .fetch_add(1, Ordering::Relaxed);
 
-        let src_node_id = inflight_info.src_node_id;
         let key = inflight_info.key;
 
         // Create holding info
@@ -404,7 +691,7 @@ pub async fn handle_get_done(
                 if one_kv_nodes_routes.put_id == inflight_info.put_id {
                     let mut nodes_replicas = one_kv_nodes_routes.nodes_replicas.write();
                     if let Some(tomb_tag) =
-                        view.master_seg_manager().get_node_tomb_tag(&src_node_id)
+                        view.master_seg_manager().get_node_tomb_tag(&req_node_id)
                     {
                         if !tomb_tag.is_tomb() {
                             nodes_replicas.insert(
@@ -632,6 +919,21 @@ pub async fn handle_get_meta(
                 raw_bytes: Vec::new(),
             };
         }
+        let ssd_replicas = (*one_kv_nodes_routes.ssd_replicas.read()).clone();
+        for (_, kv_info) in ssd_replicas.iter() {
+            if kv_info.tomb_tag.is_tomb() {
+                continue;
+            }
+            return MsgPack {
+                serialize_part: GetMetaResp {
+                    exists: true,
+                    len: kv_info.len,
+                    error_code: msg_and_error::OK,
+                    error_json: String::new(),
+                },
+                raw_bytes: Vec::new(),
+            };
+        }
         // if let Some((_, kv_info)) = replicas.iter().next() {
         //     let len = kv_info.allocation.size();
 
diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs
index ee4ca2b..afbfc41 100644
--- a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs
+++ b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs
@@ -15,13 +15,14 @@ use self::{
     msg_pack::{
         BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, CountPrefixReq, CountPrefixResp,
         DeleteAckReq, DeleteReq, GetAllocationMode, GetDoneReq, GetMetaReq, GetRevokeReq,
-        GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq,
+        GetSourceKind, GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, SsdReplicaCommitReq,
     },
     placement::{PlacementDefault, PlacementPolicy},
-    put::{handle_put_done, handle_put_revoke, handle_put_start},
+    put::{handle_put_done, handle_put_revoke, handle_put_start, handle_ssd_replica_commit},
 };
 use crate::ClientKvApiAccessTrait;
 use crate::client_kv_api::ClientKvApi;
+use crate::client_kv_api::msg_pack::SsdReplicaPersistReq;
 use crate::cluster_manager::{
     ClusterEvent, ClusterManager, ClusterManagerAccessTrait, NodeID, NodeIDString,
 };
@@ -116,8 +117,10 @@ pub struct InflightGetInfo {
     pub req_node_id: NodeID,
     pub len: u64,
     pub allocation: Arc<Allocation>,
+    pub source_allocation: Option<Arc<Allocation>>,
     pub route: Arc<OneKvNodesRoutes>,
     pub allocation_mode: GetAllocationMode,
+    pub source_kind: GetSourceKind,
 }
 
 impl InflightGetInfo {
@@ -201,6 +204,13 @@ pub struct KvRouteInfo {
     pub tomb_tag: NodeTombTag,
 }
 
+#[derive(Clone, Debug)]
+pub struct KvSsdRouteInfo {
+    pub node_id: NodeID, // node holding the SSD replica; also the key in `ssd_replicas`
+    pub len: u64, // value length as reported by the SSD replica commit request
+    pub tomb_tag: NodeTombTag, // cloned from the same node's memory replica entry at commit
+}
+
 #[derive(Debug)]
 pub struct OneKvNodesRoutes {
     /// the version id for a kv put operation
@@ -230,6 +240,8 @@ pub struct OneKvNodesRoutes {
 
     /// node_id -> KvRouteInfo
     pub nodes_replicas: RwLock<HashMap<NodeID, KvRouteInfo>>,
+    /// node_id -> SSD replica metadata for the same key-version.
+    pub ssd_replicas: RwLock<HashMap<NodeID, KvSsdRouteInfo>>,
     pub get_durable_slots_used: AtomicU32,
 }
 
@@ -247,9 +259,16 @@ impl OneKvNodesRoutes {
         let mut nodes_replicas = self.nodes_replicas.write();
         nodes_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id));
 
+        let mut ssd_replicas = self.ssd_replicas.write();
+        ssd_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id));
+
         return true;
     }
 
+    fn has_live_replica(&self) -> bool {
+        !(self.nodes_replicas.read().is_empty() && self.ssd_replicas.read().is_empty())
+    }
+
     fn try_reserve_get_durable_slot(&self) -> bool {
         self.get_durable_slots_used
             .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
@@ -283,6 +302,7 @@ mod tests {
             put_id: (1, 0),
             lease_id: None,
             nodes_replicas: RwLock::new(HashMap::new()),
+            ssd_replicas: RwLock::new(HashMap::new()),
             get_durable_slots_used: AtomicU32::new(0),
         };
 
@@ -607,6 +627,7 @@ impl MasterKvRouter {
 
     fn register_rpc_callers(&self) {
         RPCCaller::<BatchDeleteClientKvMetaCacheReq>::new().regist(self.0.view().p2p_module());
+        RPCCaller::<SsdReplicaPersistReq>::new().regist(self.0.view().p2p_module());
     }
 
     fn register_rpc_handlers(&self) {
@@ -766,6 +787,22 @@ impl MasterKvRouter {
             Ok(())
         });
 
+        let view = self.0.view().clone();
+        RPCHandler::<SsdReplicaCommitReq>::new().regist(p2p, move |resp, msg| {
+            let view = view.clone();
+            let view2 = view.clone();
+            let view_task = view2.clone();
+            let _ = view.spawn("rpc_ssd_replica_commit", async move {
+                let t0 = Utc::now().timestamp_micros();
+                let mut ack = handle_ssd_replica_commit(view_task, msg).await;
+                ack.serialize_part.server_process_us = Utc::now().timestamp_micros() - t0;
+                if let Err(e) = resp.send_resp(ack).await {
+                    error!("Failed to send SsdReplicaCommitResp: {:?}", e);
+                }
+            });
+            Ok(())
+        });
+
         // --- MemHolder Handlers ---
         // let view = inner.view.clone();
         // RPCHandler::<MemHolderKeepAliveReq>::new().regist(p2p, move |resp, msg| {
diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs
index 9d5eb1d..bdd85b6 100755
--- a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs
+++ b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs
@@ -18,6 +18,13 @@ pub enum GetAllocationMode {
     DurableReplica = 2,
 }
 
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Encode, Decode)]
+pub enum GetSourceKind {
+    #[default]
+    Memory = 0, // default: the get source is a memory replica
+    Ssd = 1, // the get source is an SSD replica staged by its owner (see `ssd_stage_len`)
+}
+
 #[derive(Default, Debug, Clone, Encode, Decode)]
 pub struct GetStartReq {
     pub key: String,
@@ -32,6 +39,7 @@ pub struct GetStartResp {
     pub get_id: u64,
     pub node_id: NodeIDString,
     pub put_id: PutIDForAKey,
+    pub source_kind: GetSourceKind,
     // absolute addresses because Mooncake transfer engine requires absolute addresses (not offsets)
     pub target_addr: u64,
     pub src_addr: u64,
@@ -39,6 +47,8 @@ pub struct GetStartResp {
     pub target_base_addr: u64,
     pub src_base_addr: u64,
     pub len: u64,
+    /// SSD source staging bytes available at src_addr. Zero for memory sources.
+    pub ssd_stage_len: u64,
     pub error_code: ErrorCode,
     pub error_json: String,
     /// Server-side processing time in microseconds for this RPC handler
@@ -56,6 +66,8 @@ impl RPCReq for GetStartReq {
 #[derive(Default, Debug, Clone, Encode, Decode)]
 pub struct GetRevokeReq {
     pub get_id: u64,
+    /// True only when an SSD stage failed and the source must be removed from routing.
+    pub drop_ssd_source: bool,
 }
 impl MsgPackSerializePart for GetRevokeReq {
     fn msg_id(&self) -> u32 {
@@ -250,6 +262,34 @@ impl RPCReq for PutDoneReq {
     type Resp = PutDoneResp;
 }
 
+#[derive(Default, Debug, Clone, Encode, Decode)]
+pub struct SsdReplicaCommitReq {
+    pub key: String, // key whose route should gain the SSD replica
+    pub put_id: PutIDForAKey, // must equal the route's current put_id, else ignored as stale
+    pub node_id: NodeIDString, // node holding the SSD copy; needs a memory replica on the route
+    pub len: u64, // recorded as `KvSsdRouteInfo::len`
+}
+impl MsgPackSerializePart for SsdReplicaCommitReq {
+    fn msg_id(&self) -> u32 {
+        MsgId::SsdReplicaCommitReq as u32
+    }
+}
+#[derive(Default, Debug, Clone, Encode, Decode)]
+pub struct SsdReplicaCommitResp {
+    pub error_code: ErrorCode, // `msg_and_error::OK` on success
+    pub error_json: String, // empty on success; `KvError::to_json()` output on error
+    /// Server-side processing time in microseconds for this RPC handler
+    pub server_process_us: i64,
+}
+impl MsgPackSerializePart for SsdReplicaCommitResp {
+    fn msg_id(&self) -> u32 {
+        MsgId::SsdReplicaCommitResp as u32
+    }
+}
+impl RPCReq for SsdReplicaCommitReq {
+    type Resp = SsdReplicaCommitResp;
+}
+
 // --- RPC for MemHolder KeepAlive ---
 
 #[derive(Default, Debug, Clone, Encode, Decode)]
diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs
index 70d8858..06e41cc 100755
--- a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs
+++ b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs
@@ -1,15 +1,19 @@
-use super::NodeValueReplicaDesc;
 use super::{
     InflightPutAllocation, InflightPutInfo, KvRouteInfo, MasterKvRouterView, PutPlacementMode,
-    msg_pack::{PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp},
+    msg_pack::{
+        PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp,
+        SsdReplicaCommitReq, SsdReplicaCommitResp,
+    },
     placement::PutPlacementTarget,
 };
+use super::{KvSsdRouteInfo, NodeValueReplicaDesc};
+use crate::client_kv_api::msg_pack::SsdReplicaPersistReq;
 use crate::master_kv_router::OneKvNodesRoutes;
 use crate::master_kv_router::delete::DeleteKeyInfo;
 use crate::{
     cluster_manager::{META_KEY_LOCAL_IPC_ROOT, NodeID},
     master_seg_manager::one_seg_allocator::Allocation,
-    p2p::msg_pack::MsgPack,
+    p2p::msg_pack::{MsgPack, RPCCaller},
     rpcresp_kvresult_convert::msg_and_error,
 };
 use fluxon_commu::{META_KEY_SHARED_STORAGE_NODE_ID, META_KEY_SHARED_STORAGE_NODE_START_TIME};
@@ -19,6 +23,7 @@ use rand::seq::SliceRandom;
 use std::{
     collections::HashMap,
     sync::{Arc, atomic::AtomicU32},
+    time::Duration,
 };
 
 pub type PutIDForAKey = (u64, u32);
@@ -474,6 +479,171 @@ pub async fn handle_put_revoke(
     }
 }
 
+/// Fire-and-forget request asking `node_id` to persist the just-committed value to its SSD.
+/// Runs on a spawned task; every outcome is only logged and never reaches the caller.
+fn spawn_ssd_replica_persist_request(
+    view: &MasterKvRouterView,
+    key: String,
+    put_id: PutIDForAKey,
+    node_id: NodeID,
+    len: u64,
+    allocation: Arc<Allocation>,
+) {
+    let target_addr = allocation.base_addr() + allocation.addr();
+    let view_task = view.clone();
+    let _ = view.spawn("post_put_ssd_replica_persist", async move {
+        // Keep the allocation alive for the whole task (presumably so `target_addr` stays valid).
+        let _allocation_guard = allocation;
+        let req = MsgPack {
+            serialize_part: SsdReplicaPersistReq {
+                key: key.clone(),
+                put_id,
+                target_addr,
+                len,
+            },
+            raw_bytes: Vec::new(),
+        };
+        let rpc_result = RPCCaller::<SsdReplicaPersistReq>::new()
+            .call(
+                view_task.p2p_module(),
+                node_id.clone(),
+                req,
+                Some(Duration::from_secs(60)),
+                2,
+            )
+            .await;
+        let resp = match rpc_result {
+            Ok(resp) => resp.serialize_part,
+            Err(err) => {
+                tracing::warn!(
+                    "SSD replica persist RPC failed: key={} put_id=({},{}) node={} err={:?}",
+                    key,
+                    put_id.0,
+                    put_id.1,
+                    node_id,
+                    err
+                );
+                return;
+            }
+        };
+        let persisted = resp.persisted;
+        let status =
+            crate::rpcresp_kvresult_convert::try_from_code(resp.error_code, resp.error_json);
+        match (status, persisted) {
+            (Err(err), _) => tracing::warn!(
+                "SSD replica persist failed: key={} put_id=({},{}) node={} err={}",
+                key,
+                put_id.0,
+                put_id.1,
+                node_id,
+                err
+            ),
+            (Ok(_), true) => tracing::debug!(
+                "SSD replica persist completed: key={} put_id=({},{}) node={}",
+                key,
+                put_id.0,
+                put_id.1,
+                node_id
+            ),
+            (Ok(_), false) => tracing::debug!(
+                "SSD replica persist skipped because owner has no SSD store: key={} put_id=({},{}) node={}",
+                key,
+                put_id.0,
+                put_id.1,
+                node_id
+            ),
+        }
+    });
+}
+
+fn ok_ssd_replica_commit_resp() -> MsgPack<SsdReplicaCommitResp> {
+    MsgPack {
+        serialize_part: SsdReplicaCommitResp {
+            error_code: msg_and_error::OK,
+            error_json: String::new(),
+            server_process_us: 0, // placeholder; the RPC handler overwrites it after processing
+        },
+        raw_bytes: Vec::new(),
+    }
+}
+
+/// Registers the SSD replica reported in `req` on the key's current route (`ssd_replicas`).
+/// A missing key, stale `put_id`, absent memory replica or tombed node is ignored; always OK.
+pub async fn handle_ssd_replica_commit(
+    view: MasterKvRouterView,
+    req: MsgPack<SsdReplicaCommitReq>,
+) -> MsgPack<SsdReplicaCommitResp> {
+    let commit = req.serialize_part;
+    let owner: NodeID = commit.node_id.clone().into();
+    let Some(entry) = view.master_kv_router().inner().kv_routes.get(&commit.key) else {
+        tracing::debug!(
+            "Ignoring SSD replica commit for missing key: key={} put_id=({},{}) node={}",
+            commit.key,
+            commit.put_id.0,
+            commit.put_id.1,
+            commit.node_id
+        );
+        return ok_ssd_replica_commit_resp();
+    };
+    let route = entry.value().clone();
+    drop(entry);
+
+    if commit.put_id != route.put_id {
+        tracing::debug!(
+            "Ignoring stale SSD replica commit: key={} req_put_id=({},{}) current_put_id=({},{}) node={}",
+            commit.key,
+            commit.put_id.0,
+            commit.put_id.1,
+            route.put_id.0,
+            route.put_id.1,
+            commit.node_id
+        );
+        return ok_ssd_replica_commit_resp();
+    }
+
+    let tomb_tag = {
+        let memory_replicas = route.nodes_replicas.read();
+        let Some(memory_replica) = memory_replicas.get(&owner) else {
+            tracing::debug!(
+                "Ignoring SSD replica commit without matching memory replica: key={} put_id=({},{}) node={}",
+                commit.key,
+                commit.put_id.0,
+                commit.put_id.1,
+                commit.node_id
+            );
+            return ok_ssd_replica_commit_resp();
+        };
+        memory_replica.tomb_tag.clone()
+    };
+
+    if tomb_tag.is_tomb() {
+        tracing::debug!(
+            "Ignoring SSD replica commit for tombed node: key={} put_id=({},{}) node={}",
+            commit.key,
+            commit.put_id.0,
+            commit.put_id.1,
+            commit.node_id
+        );
+        return ok_ssd_replica_commit_resp();
+    }
+
+    let ssd_info = KvSsdRouteInfo {
+        node_id: owner.clone(),
+        len: commit.len,
+        tomb_tag,
+    };
+    route.ssd_replicas.write().insert(owner, ssd_info);
+    tracing::debug!(
+        "Committed SSD replica route: key={} put_id=({},{}) node={} len={}",
+        commit.key,
+        commit.put_id.0,
+        commit.put_id.1,
+        commit.node_id,
+        commit.len
+    );
+    ok_ssd_replica_commit_resp()
+}
+
 pub async fn handle_put_done(
     view: MasterKvRouterView,
     req: MsgPack<PutDoneReq>,
@@ -488,6 +658,7 @@ pub async fn handle_put_done(
     if let Some(InflightPutInfo {
         node_id,
         key,
+        len,
         src_target_allocation,
         ..
     }) = view
@@ -631,8 +802,9 @@ pub async fn handle_put_done(
         let completed_info = KvRouteInfo {
             node_id: node_id.clone(),
             allocation: Arc::new(target_allocation),
-            tomb_tag,
+            tomb_tag: tomb_tag.clone(),
         };
+        let target_allocation_for_ssd = Arc::clone(&completed_info.allocation);
 
         // Insert into kv_routes with replica support
         let mut old_one_kv_routes: Option<Arc<OneKvNodesRoutes>> = None;
@@ -649,6 +821,7 @@ pub async fn handle_put_done(
                         put_id,
                         lease_id: lease_id_opt,
                         nodes_replicas: RwLock::new(HashMap::new()),
+                        ssd_replicas: RwLock::new(HashMap::new()),
                         get_durable_slots_used: AtomicU32::new(0),
                     })
                 });
@@ -659,6 +832,7 @@ pub async fn handle_put_done(
                     put_id,
                     lease_id: lease_id_opt,
                     nodes_replicas: RwLock::new(HashMap::new()),
+                    ssd_replicas: RwLock::new(HashMap::new()),
                     get_durable_slots_used: AtomicU32::new(0),
                 });
             }
@@ -668,6 +842,15 @@ pub async fn handle_put_done(
                 .insert(node_id.clone(), completed_info);
         }
 
+        spawn_ssd_replica_persist_request(
+            &view,
+            key.clone(),
+            put_id,
+            node_id.clone(),
+            len,
+            target_allocation_for_ssd,
+        );
+
         if let Some(old) = old_one_kv_routes {
             if let Err(err) = view
                 .master_kv_router()
diff --git a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs
index 5c20cc1..5d344c9 100755
--- a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs
+++ b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs
@@ -22,7 +22,8 @@ async fn test1_lease_expire_removes_keys() {
     unsafe {
         std::env::set_var("FLUXON_LOG", "debug");
     }
-    let (master_fw, client_fw) = start_master_and_client("lease_master_t1", "lease_client_t1").await;
+    let (master_fw, client_fw) =
+        start_master_and_client("lease_master_t1", "lease_client_t1").await;
     let client_view = client_fw.client_kv_api_view();
     wait_master_ready(&client_view).await;
 
@@ -82,7 +83,8 @@ async fn test2_rebind_to_new_lease_preserves_until_new_expire() {
     unsafe {
         std::env::set_var("FLUXON_LOG", "debug");
     }
-    let (master_fw, client_fw) = start_master_and_client("lease_master_t2", "lease_client_t2").await;
+    let (master_fw, client_fw) =
+        start_master_and_client("lease_master_t2", "lease_client_t2").await;
     let client_view = client_fw.client_kv_api_view();
     wait_master_ready(&client_view).await;
 
@@ -161,7 +163,8 @@ async fn test3_keepalive() {
     unsafe {
         std::env::set_var("FLUXON_LOG", "debug");
     }
-    let (master_fw, client_fw) = start_master_and_client("lease_master_t3", "lease_client_t3").await;
+    let (master_fw, client_fw) =
+        start_master_and_client("lease_master_t3", "lease_client_t3").await;
     let client_view = client_fw.client_kv_api_view();
     wait_master_ready(&client_view).await;
 
@@ -236,7 +239,8 @@ async fn test4_delete_under_lease_then_get_fails() {
     unsafe {
         std::env::set_var("FLUXON_LOG", "debug");
     }
-    let (master_fw, client_fw) = start_master_and_client("lease_master_t4", "lease_client_t4").await;
+    let (master_fw, client_fw) =
+        start_master_and_client("lease_master_t4", "lease_client_t4").await;
     let client_view = client_fw.client_kv_api_view();
     wait_master_ready(&client_view).await;
 
diff --git a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs
index 692a9a0..cfd6d55 100644
--- a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs
+++ b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs
@@ -101,6 +101,7 @@ fn new_client_config_with_size(
         large_file_paths: crate::config::LargeFilePaths {
             paths: vec![format!("/tmp/kvcache_large/{}", instance_key)],
         },
+        ssd_storage: None,
         test_spec_config: TestSpecConfig::default(),
     }
 }
@@ -134,6 +135,7 @@ fn new_zero_contribution_client_config(
         },
         share_mem_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key),
         large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() },
+        ssd_storage: None,
         test_spec_config: TestSpecConfig::default(),
     }
 }
diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs
index 42a9cbc..def8b1c 100644
--- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs
+++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs
@@ -35,6 +35,8 @@ pub enum MsgId {
     DeleteAckResp = 3024,
     BatchDeleteAckReq = 3029,
     BatchDeleteAckResp = 3030,
+    SsdReplicaCommitReq = 3031,
+    SsdReplicaCommitResp = 3032,
     GetMetaReq = 3019,
     GetMetaResp = 3020,
     BatchDeleteClientKvMetaCacheReq = 3021,
diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs
index b6eb7d6..a5a18b4 100755
--- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs
+++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs
@@ -3,11 +3,12 @@ use super::msg_and_error::{ErrorCode, KvError, KvResult};
 use crate::client_kv_api::msg_pack::{
     ExternalDeleteAckResp, ExternalDeleteResp, ExternalGetResp, ExternalIsExistResp,
     ExternalPutCommitResp, ExternalPutRevokeResp, ExternalPutStartResp, ExternalPutTransferEndResp,
+    SsdReplicaPersistResp, SsdStageReadResp,
 };
 use crate::master_kv_router::msg_pack::{
     BatchDeleteAckResp, BatchDeleteClientKvMetaCacheResp, DeleteAckResp, DeleteResp, GetDoneResp,
     GetMasterOnlyMetricPartResp, GetMetaResp, GetRevokeResp, GetStartResp, MemHolderKeepAliveResp,
-    MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp,
+    MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, SsdReplicaCommitResp,
 };
 use crate::master_seg_manager::msg_pack::RequestSegmentRegistrationResp;
 use crate::memholder::ExternalMemHolderInfo;
@@ -232,6 +233,26 @@ impl FromError for ExternalDeleteAckResp {
         }
     }
 }
+impl FromError for SsdStageReadResp {
+    fn from_error(e: &KvError) -> Self {
+        let code = e.code();
+        Self {
+            error_code: code,
+            error_json: e.to_json(),
+            ..Default::default()
+        }
+    }
+}
+impl FromError for SsdReplicaPersistResp {
+    fn from_error(e: &KvError) -> Self {
+        let code = e.code();
+        Self {
+            error_code: code,
+            error_json: e.to_json(),
+            ..Default::default()
+        }
+    }
+}
 
 // ---- FromError for Master KV Router Resps ----
 impl FromError for GetStartResp {
@@ -294,6 +315,16 @@ impl FromError for PutDoneResp {
         }
     }
 }
+impl FromError for SsdReplicaCommitResp {
+    fn from_error(e: &KvError) -> Self {
+        let code = e.code();
+        Self {
+            error_code: code,
+            error_json: e.to_json(),
+            ..Default::default()
+        }
+    }
+}
 impl FromError for MemHolderKeepAliveResp {
     fn from_error(e: &KvError) -> Self {
         let code = e.code();