From 2924b429132a43439a1b5e7ce25f23e021100ed1 Mon Sep 17 00:00:00 2001 From: zTz01 <1773266173@qq.com> Date: Fri, 3 Jul 2026 10:37:30 +0800 Subject: [PATCH 1/4] ssd --- ...30\345\202\250\350\256\276\350\256\241.md" | 836 +++++++ fluxon_rs/Cargo.lock | 12 + fluxon_rs/fluxon_kv/Cargo.toml | 1 + fluxon_rs/fluxon_kv/src/client_kv_api/get.rs | 117 +- fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs | 357 ++- .../fluxon_kv/src/client_kv_api/msg_pack.rs | 72 + .../fluxon_kv/src/client_seg_pool/mod.rs | 13 +- fluxon_rs/fluxon_kv/src/config.rs | 199 +- .../external_client_test.rs | 2 + .../fluxon_kv/src/external_client_api/mod.rs | 3 +- fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs | 2159 +++++++++++++++++ fluxon_rs/fluxon_kv/src/kv_test.rs | 349 ++- fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs | 1 + fluxon_rs/fluxon_kv/src/lib.rs | 27 +- .../fluxon_kv/src/master_kv_router/delete.rs | 2 +- .../fluxon_kv/src/master_kv_router/get.rs | 310 ++- .../fluxon_kv/src/master_kv_router/mod.rs | 41 +- .../src/master_kv_router/msg_pack.rs | 40 + .../fluxon_kv/src/master_kv_router/put.rs | 191 +- .../lease_manager_test.rs | 12 +- .../fluxon_kv/src/memholder/memholder_test.rs | 2 + .../rpcresp_kvresult_convert/msg_and_error.rs | 2 + .../rpcresp_kvresult_convert.rs | 33 +- 23 files changed, 4713 insertions(+), 68 deletions(-) create mode 100644 "fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" create mode 100644 fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" new file mode 100644 index 0000000..d0da8a6 --- /dev/null +++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" @@ -0,0 +1,836 @@ +# KV 设计 5 - SSD 存储 + +## 稳定结论 + +当前 KV 的 SSD 存储应当是分布式 owner SSD 副本层。每个 owner 可以持有本地 SSD backing tier,master 在同一条 key-version 路由里分开记录内存 owner 副本和 SSD owner 副本;它和 CPU segment 
内存副本共享 owner placement、allocation、transfer engine 和 `MemHolder` 生命周期。SSD 只承担可回填的数据源,不暴露第二套用户读写 API。 + +读路径按内存优先。master 找不到可用内存副本时,可以选择任意 owner 上的 SSD 副本;SSD owner 把磁盘数据按 chunk 读入自己节点上的 CPU staging allocation,每个 chunk ready 后立即由 SSD owner 侧复用现有 transfer engine push 到请求方 target allocation 对应 offset。SSD source 路径由 SSD owner 在所有 chunk transfer 完成后直接向 master 发送 `GetDoneReq`,再把 holder 结果放进 `SsdStageReadResp` 回给请求方;请求方不再从 SSD owner staging 发起第二段 transfer,也不再在 SSD source 路径上单独发送 `GetDoneReq`。 + +IO 层吸收 Pegaflow SSD cache 的核心做法:分片环形文件、`O_DIRECT` 对齐 buffer、`io_uring` 后台读写线程、有界读写队列、`Writing/Committed` 两阶段索引,以及 ring tail 推进时的主动失效。进一步对照 3FS 和 foyer 后,当前实现把底层 uring 调度改成 read/write 独立队列,并在同一 shard 内按 inflight 比例优先补读队列,避免 kvcache 回填读被持续写入压住;同时按 3FS 的位置生命周期约束保护正在读或正在写的 ring 位置,tail 推进不能覆盖 active IO。对大 payload 高带宽场景,aligned SSD stage 可以直接 readv 到 source staging allocation,跳过中间 aligned buffer 到 staging 的额外内存拷贝;SSD read 和 transfer 之间使用 producer/consumer pipeline,chunk read 完成即可发起对应 chunk transfer。owner 启动时从 `large_file_paths` 派生 SSD root,先按 `metadata.dev()` 去重,再为每个有效 device 建独立 writer/reader queue 和 `UringIoEngine`;shard 只在所属 device worker 的 shard 集内分配。写路径必须把内存提交和 SSD 提交拆开:内存 `PutDone` 先让 key-version 可读,SSD 写入完成后再通过独立 commit 把同一版本加入 `ssd_replicas`。 + +## 公共契约 + +公共配置只有一个 owner-only 字段: + +```yaml +fluxonkv_spec: + large_file_paths: [/data/fluxon_large] + ssd_storage: + max_bytes: 4294967296 +``` + +规则: + +- `ssd_storage` 缺省或为 `null` 时不启用 SSD。 +- `max_bytes` 必须大于或等于 512 bytes,满足当前 `O_DIRECT` 对齐约束。 +- zero-contribution external 禁止声明 `ssd_storage`;external 只能通过 owner 的 mmap、RPC 和 transfer surface 访问 SSD 回填结果。 +- 实际目录为每个可用 `large_file_root` 下的 `_cluster_kv_ssd_storage//`;owner 启动时创建目录并读取 `metadata.dev()`,同一个 device 只保留第一个 root,避免多个路径指向同一块盘时制造虚假的 IO 并行度。 +- 用户侧 `put/get/delete` API 不因 SSD 增加新入口;SSD 副本是 master 路由内部能力。 + +## 范围边界 + +| 范围 | 当前结论 | +| --- | --- | +| 分布式 SSD 读取 | 已接入。`GetStart` 可以返回任意 SSD owner,source staging allocation 位于 SSD owner,target allocation 位于请求方 owner。 | +| owner 内部多 
SSD 路径 | 已接入。SSD root 来自 `large_file_paths`,先按 device 去重;每个有效 device 有独立 writer/reader queue、uring engine 和 shard 集,单 owner 可以利用多块真实本地 SSD。 | +| 内存 KV 复用 | 已复用。SSD 回填由 SSD owner 侧调用 `transfer_data_no_copy` 按 chunk push 到 requester target,全部 chunk transfer 完成后由 SSD owner 调 master `get_done`;requester 只复用返回的 holder 结果构造 `MemHolder`。 | +| Pegaflow IO 模型 | 已接入核心形态:分片 ring、`O_DIRECT`、`io_uring`、有界队列、两阶段提交、tail 失效;写路径已经把内存 `PutDone` 和 SSD commit 拆开。 | +| 3FS 位置生命周期 | 已接入到 SSD ring。读 IO 提交前 pin committed entry;未完成的 `Writing` entry 和 pinned read entry 都会阻止物理位置复用。 | +| 大 payload direct stage | 已接入 aligned fast path 和 chunk pipeline。master 给 SSD source staging 多分配最多 511 bytes,并在 allocation 内返回 512-byte 对齐后的 `src_addr`;SSD read 按 chunk 对齐 IO 长度直接写入 staging,chunk ready 后立刻 transfer,`MemHolder` 仍只使用真实 payload 长度。 | +| 冷启动恢复 | 当前不扫描 SSD shard 重建 master 路由;路由仍由本轮运行时的 `put/get/delete` 生命周期产生。 | +| lease key 专门治理 | 当前没有单独的 lease SSD 生命周期策略;lease 与普通 key 共用 key-version 路由约束。 | +| 独立 SSD 路径参数 | 不提供。SSD 根目录从 `large_file_paths` 派生,避免和日志、共享 bundle、FS disk cache 混用。 | + +## 数据流 + +```mermaid +flowchart TD + A["owner put target allocation"] --> B["write bytes into owner mmap"] + B --> G["owner -> master PutDone(memory_ready)"] + G --> H["master route: nodes_replicas"] + B --> C["async KvSsdStorage.persist_from_addr(key, put_id, addr, len)"] + C --> D["SSD writer queue"] + D --> E["io_uring writev to sharded O_DIRECT ring"] + E --> F["commit index: Writing -> Committed"] + F --> I["owner -> master SsdReplicaCommit"] + I --> J["master route: ssd_replicas"] + + X["get_start"] --> K{"live memory replica?"} + K -->|yes| L["return GetSourceKind::Memory"] + L --> M["existing transfer path"] + + K -->|no| N{"live SSD replica?"} + N -->|yes| O["allocate source staging on SSD owner"] + O --> P["allocate target on requester"] + P --> Q["return GetSourceKind::Ssd"] + Q --> R["SSD owner chunk readv into source staging"] + R --> S["SsdLoadedChunk(offset,len)"] + S --> W["SSD owner transfer chunk: 
staging+offset -> requester target+offset"] + W --> T["all chunks done: SSD owner -> master GetDoneReq"] + T --> V["SsdStageReadResp carries GetDoneResp fields"] + + N -->|no| U["KeyNotFound"] +``` + +## 端到端调用时序 + +SSD 路径只在两个位置扩展主链路:`put_done` 提交内存副本后,owner 异步把本地 target allocation 落到 SSD,并在完成后单独提交 SSD 副本;`get_start` 找不到可用内存副本时,master 为 SSD owner 分配 source staging,再由 SSD owner 按 chunk 把磁盘数据读入 staging 并 push 到 requester target。`get_done` 和 holder 生命周期继续走内存 KV 的原 master 逻辑,但 SSD source 路径的 `GetDoneReq` 由 SSD owner 在全部 chunk transfer 完成后发起,requester 只消费 `SsdStageReadResp` 里带回的 done 结果。 + +```mermaid +sequenceDiagram + participant C as requester owner + participant M as master + participant SO as SSD owner + participant TE as transfer engine + participant SSD as owner SSD shard + + C->>M: PutStartReq(key, len) + M-->>C: PutStartResp(target allocation) + Note over C: payload 写入 target allocation + C->>M: PutDoneReq(memory_ready) + Note right of M: nodes_replicas 写入内存副本\nkey-version 立即可读\nspawn post_put_ssd_replica_persist + M-->>C: PutDoneResp + M->>C: async SsdReplicaPersistReq(key, put_id, target_addr, len) + C->>SSD: KvSsdStorage.persist_from_addr(...) 
+ Note over SSD: writer queue -> io_uring writev -> Writing/Committed + C->>M: SsdReplicaCommitReq(key, put_id, node_id, len) + Note right of M: ssd_replicas 写入 SSD 副本 + + C->>M: GetStartReq(key) + alt live memory replica exists + M-->>C: GetStartResp(source_kind=Memory, src_addr, target_addr) + else only SSD replica exists + Note right of M: 在 SSD owner CPU segment 分配 source_allocation\n在 requester CPU segment 分配 target allocation + M-->>C: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) + C->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) + SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + loop each ready chunk + SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) + SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) + end + SO->>M: GetDoneReq(get_id) + Note right of M: target allocation 进入 get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id) + SO-->>C: SsdStageReadResp(done_holder_id, done_allocation_mode) + end + opt source_kind=Memory + C->>TE: transfer_data_no_copy(read, src_addr -> target_addr, len) + C->>M: GetDoneReq(get_id) + Note right of M: target allocation 进入 get_holding + M-->>C: GetDoneResp(holder_id) + end +``` + +## 当前实现 + +| 模块 | 职责 | +| --- | --- | +| `fluxon_kv/src/config.rs` | 解析 `fluxonkv_spec.ssd_storage.max_bytes`,禁止 external 声明该字段,派生 SSD 根目录。 | +| `fluxon_kv/src/kv_ssd_storage.rs` | owner 内部 SSD cache。使用 shard 文件、`O_DIRECT`、`io_uring`、有界读写队列和两阶段索引管理 key-version bytes。 | +| `client_kv_api/put.rs` | owner 是最终 target 时,先通过 `PutDoneReq` 提交内存副本;SSD persist 由 master 的后台 `SsdReplicaPersistReq` 触发,owner 完成本地落盘后再通过独立 SSD commit 上报。 | +| `client_kv_api/get.rs` | `GetSourceKind::Ssd` 时,请求方让 SSD owner stage、push 并完成 `get_done`;stage RPC 成功后跳过请求方 transfer,也跳过请求方 `get_done`。 | +| `client_kv_api/msg_pack.rs` | 定义 `SsdStageReadReq/SsdStageReadResp` 和 
`SsdReplicaPersistReq/SsdReplicaPersistResp`,分别用于 SSD stage 读、回传 done 结果,以及 master 触发 owner 本地 SSD persist。 | +| `master_kv_router/put.rs` | `put_done` 只提交内存副本,随后异步发起 `SsdReplicaPersistReq`;`SsdReplicaCommitReq` 单独写 `ssd_replicas`。 | +| `master_kv_router/get.rs` | 内存副本优先;无内存副本时从 `ssd_replicas` 中选择可用 owner,分配 source staging 和 requester target。 | +| `master_kv_router/delete.rs` | 内存副本被驱逐时,如果同 key-version 仍有 SSD 副本,保留 `kv_routes`。 | + +## 接口里的角色分工 + +SSD 逻辑按接口看最清楚:`put` 先让一个 key-version 的内存副本 ready,再异步补交 SSD 副本;`get` 决定读请求先走内存副本还是 SSD fallback。每个接口里再分 master、owner、external 三个角色看状态归属。 + +### put + +```mermaid +sequenceDiagram + participant E as external + participant O as owner + participant M as master + participant SSD as owner SSD store + + E->>O: ExternalPutStartReq(key, len) + O->>M: PutStartReq(key, len) + Note right of M: 分配 put_id 和 src/target allocation\n记录 inflight_puts + M-->>O: PutStartResp(put_id, src_addr, target_addr) + O-->>E: ExternalPutStartResp(offsets, put_id) + + Note over E,O: external 写 owner mmap/staging + E->>O: ExternalPutTransferEndReq(put_id) + O->>O: transfer_data_no_copy if remote target + O->>M: PutDoneReq(memory_ready) + Note right of M: 写 nodes_replicas\nkey-version 立即可读\nspawn post_put_ssd_replica_persist + M-->>O: PutDoneResp + O-->>E: ExternalPutTransferEndResp + M->>O: async SsdReplicaPersistReq(key, put_id, target_addr, len) + O->>SSD: persist_from_addr(key, put_id, target_addr, len) + Note over SSD: device write_tx -> per-device ssd_writer_loop -> io_uring writev\nWriting -> Committed + O->>M: SsdReplicaCommitReq(key, put_id, node_id, len) + Note right of M: 写 ssd_replicas +``` + +#### master + +master 持有 `put` 的权威控制面状态:`inflight_puts` 记录未完成写入,`kv_routes` 记录提交后的当前版本。当前实现里 `PutDoneReq` 只表示内存副本 ready;SSD 副本通过独立 `SsdReplicaCommitReq` 进入 route。 + +当前协议结构如下。 + +```rust +pub struct MasterKvRouterInner { + pub inflight_puts: moka::future::Cache<(String, u64, u32), InflightPutInfo>, + pub kv_routes: DashMap>, + ... 
+} + +pub struct InflightPutInfo { + pub node_id: NodeID, + pub key: String, + pub req_node_id: NodeID, + pub len: u64, + pub src_target_allocation: Arc>>, +} + +pub struct OneKvNodesRoutes { + pub put_id: PutIDForAKey, + pub nodes_replicas: RwLock>, + pub ssd_replicas: RwLock>, + ... +} + +pub struct PutDoneReq { + pub key: String, + pub put_id: PutIDForAKey, + pub lease_id: Option, +} + +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} +``` + +`PutStartReq` 到达 master 后,master 分配 `put_id` 和源/目标 allocation,并把 allocation 放进 `InflightPutInfo.src_target_allocation`。`PutDoneReq` 到达时,master 只把 target allocation 写入 `nodes_replicas`,此时 key-version 已经可被 `get` 命中。SSD owner 后续完成落盘后再发 `SsdReplicaCommitReq`,master 校验 `kv_routes[key].put_id == put_id` 后,把 `KvSsdRouteInfo { node_id, len, tomb_tag }` 写入同一个 `OneKvNodesRoutes.ssd_replicas`。master 不保存 SSD 文件 offset,也不保存 owner 本地 ring index。 + +#### owner + +owner 持有数据面:本机 CPU segment、可选 SSD store、put transfer 和 SSD persist。当前实现里,SSD persist 发生在 master 收到 `PutDoneReq` 并提交内存路由之后,不能阻塞内存副本 ready。 + +当前 owner 字段如下。 + +```rust +pub struct ClientKvApiInner { + ssd_storage: Option>, + rpc_caller_put_start: RPCCaller, + rpc_caller_put_done: RPCCaller, + rpc_caller_ssd_replica_commit: RPCCaller, + ... 
+} + +pub struct SsdReplicaPersistReq { + pub key: String, + pub put_id: PutIDForAKey, + pub target_addr: u64, + pub len: u64, +} + +pub struct KvSsdStorage { + root_dirs: Vec, + devices: Vec, + shard_to_device: Vec, + next_write_device: AtomicUsize, + inner: Arc>, + space_notify: Arc, +} + +struct SsdDeviceWorker { + device_id: u64, + root_dir: PathBuf, + shard_ids: Vec, + _files: Vec, + _io: Arc, + write_tx: tokio_mpsc::Sender, + read_tx: tokio_mpsc::Sender, +} + +struct KvSsdStorageInner { + ring: SsdRingBuffer, +} +``` + +owner 如果是最终 target,先完成原有 transfer 和 `PutDoneReq`,让内存副本进入 `nodes_replicas`。master 随后在后台 task 里把 `SsdReplicaPersistReq { key, put_id, target_addr, len }` 发回 target owner,并持有 target allocation 的 `Arc`,保证 owner 从内存复制到 SSD 期间 payload 不会被释放或复用。 + +owner 的 `rpc_ssd_replica_persist` handler 收到请求后,从 target allocation 的绝对地址调用 `persist_local_kv_to_ssd(...)`,进入 `KvSsdStorage::persist_from_addr(key, put_id, addr, len)`。`persist_from_addr` 把真实 payload 拷到 512-byte 对齐的 `AlignedBuffer`,`persist_buffer` 通过 `next_write_device` round-robin 选择一个 `SsdDeviceWorker.write_tx` 并等待后台 writer 完成。每个 `ssd_writer_loop` 只拿自己的 `shard_ids` 调 `SsdRingBuffer::prepare_write_on_shards(...)`,在 `ring.entries` 中建立 `Writing(SsdIndexEntry)`;对应 device 的 `UringIoEngine` 对该 shard 文件执行 `writev`,成功后提交为 `Committed(SsdIndexEntry)`。这之后 owner 发送 `SsdReplicaCommitReq` 给 master,补交 SSD 副本。 + +#### external + +external 只持有写入请求上下文和 owner 暴露的 mmap offset,不持有 SSD route 或 SSD index。 + +```rust +pub struct ExternalPutStartReq { + pub key: String, + pub len: u64, + pub reject_if_inflight_same_key: bool, + pub preferred_sub_cluster: Option, + pub started_time: i64, + pub test_observe_put_phases: bool, +} + +pub struct ExternalPutTransferEndReq { + pub key: String, + pub len: u64, + pub src_offset: u64, + pub target_offset: u64, + pub peer_id: Option, + pub target_base_addr: Option, + pub put_id: Option, + pub lease_id: Option, + pub started_time: i64, + pub test_observe_put_phases: bool, +} +``` + 
+external put 仍然是 `ExternalPutStart -> 写 owner mmap -> ExternalPutTransferEnd`。`ExternalPutTransferEndResp` 只代表内存提交完成;SSD 是否启用、何时 persist 成功、何时写入 `ssd_replicas` 都由 owner 和 master 的内部 commit 协议决定。external 只通过 `started_time` 做 owner 代际校验,避免把旧代际请求提交给新 owner。 + +### get + +```mermaid +sequenceDiagram + participant E as external + participant RO as requester owner + participant M as master + participant SO as SSD owner + participant TE as transfer engine + participant SSD as owner SSD store + + E->>RO: ExternalGetReq(key) + RO->>M: GetStartReq(key) + alt memory replica exists + M-->>RO: GetStartResp(source_kind=Memory) + else SSD fallback + Note right of M: 在 SSD owner 分配 source_allocation\n在 requester owner 分配 target allocation\n写 inflight_gets + M-->>RO: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) + RO->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) + SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + Note over SSD: pin committed entry\nproducer 按 chunk readv direct 或 scratch fallback + loop each ready chunk + SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) + SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) + end + SO->>M: GetDoneReq(get_id) + Note right of M: target allocation -> get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id) + SO-->>RO: SsdStageReadResp(done fields) + end + opt source_kind=Memory + RO->>RO: transfer_data_no_copy(read, src_addr -> target_addr, len) + RO->>M: GetDoneReq(get_id) + Note right of M: target allocation -> get_holding + M-->>RO: GetDoneResp(holder_id) + end + RO-->>E: ExternalGetResp(ExternalMemHolderInfo) +``` + +#### master + +master 持有 `get` 的权威路由、在途 allocation 和完成后的 holder authority。 + +```rust +pub struct MasterKvRouterInner { + pub inflight_gets: moka::future::Cache, + pub get_holding: MasterOwnerMemMgr, + pub kv_routes: DashMap>, + ... 
+} + +pub struct OneKvNodesRoutes { + pub put_id: PutIDForAKey, + pub nodes_replicas: RwLock>, + pub ssd_replicas: RwLock>, + pub get_durable_slots_used: AtomicU32, +} + +pub struct KvSsdRouteInfo { + pub node_id: NodeID, + pub len: u64, + pub tomb_tag: NodeTombTag, +} + +pub struct InflightGetInfo { + pub put_id: PutIDForAKey, + pub src_node_id: NodeID, + pub req_node_id: NodeID, + pub len: u64, + pub allocation: Arc, + pub source_allocation: Option>, + pub route: Arc, + pub allocation_mode: GetAllocationMode, + pub source_kind: GetSourceKind, +} +``` + +master 处理 `GetStartReq` 时,先查 `kv_routes`。有 live 内存副本时,返回 `GetSourceKind::Memory`。只有内存副本不可用时,master 才从 `ssd_replicas` 里选 SSD owner,并分配两块 CPU segment allocation:`source_allocation` 在 SSD owner 上,`allocation` 在 requester owner 上。`GetStartResp.src_addr` 是 SSD owner 本地对齐 staging 地址,`GetStartResp.target_addr` 是 requester target 地址,`GetStartResp.ssd_stage_len` 是对齐后的 source staging 容量,`GetStartResp.len` 始终是真实 payload 长度。 + +`GetDoneReq` 到达后,master 把 `InflightGetInfo.allocation` 转入 `get_holding`,返回 `holder_id`。memory source 路径的 `GetDoneReq` 由 requester owner 发送;SSD source 路径的 `GetDoneReq` 由 SSD owner 在全部 chunk transfer 完成后发送。master 不依赖 RPC 调用者身份决定 holder 归属,而是使用 `InflightGetInfo.req_node_id` 作为 holder 节点。`InflightGetInfo.source_allocation` 只服务 SSD owner 本地读盘 staging 和 owner-side push,不进入 `get_holding`。 + +#### owner + +owner 在 `get` 里有两个可能角色:requester owner 负责调用 master,并根据 `GetSourceKind` 选择 memory transfer 或 SSD stage RPC;SSD owner 负责响应 `SsdStageReadReq`,读取本地 SSD,把读出的 bytes 按 chunk push 到 requester target,并在全部 chunk transfer 完成后向 master 发送 `GetDoneReq`。 + +```rust +pub struct ClientKvApiInner { + ssd_storage: Option>, + pub external_get_holding: OwnerExternalMemMgr, + rpc_caller_get_start: RPCCaller, + rpc_caller_get_done: RPCCaller, + rpc_caller_ssd_stage_read: RPCCaller, + ... 
+} + +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub error_json: String, +} +``` + +requester owner 收到 `GetSourceKind::Memory` 后走原有 transfer 分支,然后自己发送 `GetDoneReq`。requester owner 收到 `GetSourceKind::Ssd` 后调用 `stage_kv_from_ssd_source(...)`,该函数返回 `GetDoneResp` 对应字段;requester 跳过自己的 transfer,也跳过自己的 `get_done`,直接用返回的 done 结果构造 holder。 + +SSD owner 的 `rpc_ssd_stage_read` task 调用 `load_and_push_kv_from_ssd(...)`。这个函数内部把 `KvSsdStorage::load_into_addr_chunks(...)` 作为生产者,把 `transfer_loaded_ssd_chunks(...)` 作为消费者:生产者 pin 当前 committed entry,按 chunk 把磁盘数据读入 master 分配的 `stage_addr + offset`;消费者每收到一个 `SsdLoadedChunk`,立即用 `transfer_data_no_copy(peer=target_node_id, peer_src_or_target=false, stage_addr + offset, target_addr + offset, chunk_len, None)` push 到 requester target。所有 chunk transfer 成功后,SSD owner 用 `SsdStageReadReq.get_id` 调 master `GetDoneReq`,并把 `GetDoneResp` 拆成 `SsdStageReadResp.done_*` 字段返回给 requester。 + +```rust +struct SsdRingBuffer { + entries: HashMap, + read_pins: HashMap, + ... 
+} + +enum SsdEntryState { + Writing(SsdIndexEntry), + Committed(SsdIndexEntry), +} +``` + +`read_pins` 是 owner 本地 SSD ring 的生命周期保护,防止 writer 推进 tail 时覆盖 active read。chunk pipeline 在整个 producer 生命周期内持有同一个 read pin;每个 chunk 单独提交 read task。direct read 条件满足时,`readv` 直接写到 `SsdStageReadReq.stage_addr + offset`;否则先读 scratch aligned buffer,再复制当前 chunk 的真实 payload 长度到 staging。请求方 target 是否远端不影响 SSD direct read 的对齐判断。 + +#### external + +external 只发 `ExternalGetReq` 给 owner,并接收 owner 返回的 holder metadata。SSD route、SSD index、source staging allocation 都不会进入 external 进程。 + +```rust +pub struct ExternalGetReq { + pub key: String, + pub req_node_id: String, + pub started_time: i64, +} + +pub struct ExternalGetResp { + pub error_code: ErrorCode, + pub error_json: String, + pub external_memholder_info: Option, +} + +pub struct ExternalMemHolderInfo { + pub offset: u64, + pub len: u32, + pub holder_id: u64, +} + +pub struct ExternalMemHolder { + pub offset: u64, + pub addr: u64, + pub len: u32, + pub holder_id: u64, + pub key: String, + pub external_client_id: String, + pub owner_start_time: i64, + ... 
+} +``` + +owner 内部普通 `get` 完成后,会把 external 借用关系写入 `external_get_holding`,再返回 `ExternalMemHolderInfo { offset, len, holder_id }`。external 构造 `ExternalMemHolder` 后只通过 mmap offset/addr 读取结果。holder drop 时,external 发 `ExternalDeleteAckReq` 给 owner;owner 再释放 external 借用,并通过原有 owner -> master holder ack 链路释放 `get_holding`。 + +### stage 失败和释放 + +```mermaid +sequenceDiagram + participant RO as requester owner + participant M as master + participant SO as SSD owner + + RO->>SO: SsdStageReadReq + SO-->>RO: stage error + RO->>M: GetRevokeReq(drop_ssd_source=true) + Note right of M: 查 inflight_gets\n确认 source_kind=Ssd\n删除 route.ssd_replicas[src_node_id] + alt no live replica remains + M->>M: remove kv_routes and prefix index + end +``` + +```rust +pub struct GetRevokeReq { + pub get_id: u64, + pub drop_ssd_source: bool, +} +``` + +SSD stage 失败时,请求方调用 `get_revoke_ssd_source(...)`,也就是 `GetRevokeReq { drop_ssd_source: true }`。master 从 `inflight_gets` 找到 `InflightGetInfo`,只有 `source_kind == GetSourceKind::Ssd` 时才会删除 `route.ssd_replicas[src_node_id]`。如果同一 `OneKvNodesRoutes` 下已经没有 live 内存副本和 SSD 副本,master 再删除 `kv_routes` 并异步清理 prefix index。 + +RPC 字段里,`len` 始终是真实 payload 长度;`ssd_stage_len` / `stage_len` 是 SSD direct IO 需要的 staging 容量,通常是 512-byte 对齐后的长度。`target_addr` 只表示 requester target,不再表示 SSD owner 本地 staging。`SsdStageReadReq.get_id` 让 SSD owner 在全部 chunk transfer 完成后替 requester 完成 master `GetDoneReq`;`SsdStageReadResp.done_*` 是 master `GetDoneResp` 的字段投影,供 requester 复用原有 holder 构造逻辑。 + +```rust +pub struct GetStartResp { + pub get_id: u64, + pub node_id: NodeIDString, + pub put_id: PutIDForAKey, + pub source_kind: GetSourceKind, + pub target_addr: u64, + pub src_addr: u64, + pub len: u64, + pub ssd_stage_len: u64, + ... 
+} + +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub error_json: String, +} +``` + +## 关键代码片段 + +### put_done 只提交内存副本 + +当前实现中,`put_done` 只把内存 target allocation 写入 `nodes_replicas`。SSD 是否落盘不影响这次 `PutDone` 的可见性。 + +```rust +pub struct PutDoneReq { + pub key: String, + pub put_id: PutIDForAKey, + pub lease_id: Option, +} + +one_kv_routes + .nodes_replicas + .write() + .insert(node_id.clone(), completed_info); +``` + +这段逻辑用到的字段边界是: + +- `put_id` 由 `OneKvNodesRoutes` 承载,SSD 副本和内存副本共享同一个版本。 +- `nodes_replicas` 代表内存副本 ready;`get_start` 可以立即从这里返回内存 source。 +- `ssd_replicas` 不能在这一步写入,否则 `PutDone` 会被 SSD 延迟拖住。 + +### SSD replica 独立 commit + +SSD owner 后台 persist 成功后,单独向 master 提交同一个 key-version 的 SSD 副本。master 必须校验当前 route 的 `put_id` 仍然匹配,避免旧版本 SSD late commit 污染新版本路由。 + +```rust +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} + +if let Some(route) = kv_routes.get(&req.key) { + if route.put_id == req.put_id { + route.ssd_replicas.write().insert( + node_id.clone(), + KvSsdRouteInfo { + node_id: node_id.clone(), + len: req.len, + tomb_tag, + }, + ); + } +} +``` + +这段逻辑用到的字段边界是: + +- `SsdReplicaCommitReq.put_id` 必须等于当前 `OneKvNodesRoutes.put_id`。 +- `SsdReplicaCommitReq.node_id` 必须对应当前 route 内已经 ready 的内存副本;master 用同一节点的 `KvRouteInfo.tomb_tag` 作为 SSD route 的 tomb 代际。 +- `SsdReplicaCommitReq.len` 记录真实 payload 长度,后续 SSD stage 和 transfer 都按这个长度对外可见。 +- `KvSsdRouteInfo` 不保存 SSD 文件 offset;offset 只在 owner 本地 SSD ring index 中。 +- late commit 命中过期 `put_id` 时直接丢弃,不能 resurrect 旧版本。 + +### get_start 
分配分布式 SSD staging + +SSD fallback 发生在 master 已经没有可用 `nodes_replicas` 之后。source staging 一定分配在 SSD owner 的 CPU segment 上,target allocation 一定分配在 requester 的 CPU segment 上。 + +```rust +let ssd_stage_len = align_ssd_io_len(ssd_replica.len)?; +let source_alloc_len = ssd_stage_len + SSD_ALIGNMENT as u64 - 1; + +let source_allocation = allocate_get_buffer_on_node( + &view, + &ssd_replica.node_id, + source_alloc_len, + get_id, + "ssd source staging", +)?; +let target_allocation = allocate_get_buffer_on_node( + &view, + &req_node_id, + ssd_replica.len, + get_id, + "requesting target", +)?; + +let source_addr = align_ssd_stage_addr(source_base + source_allocation.addr())?; +``` + +这段逻辑的关键字段关系是: + +- `KvSsdRouteInfo.node_id` 决定 source staging 的 owner。 +- `source_alloc_len = align_up(len, 512) + 511`,保证 allocation 内总能找到 512-byte 对齐的 `src_addr`。 +- `GetStartResp.src_addr` 返回对齐后的绝对地址,不一定等于 `source_allocation` 的起始地址。 +- `InflightGetInfo.source_allocation` 持有原始 allocation,保证对齐后的 `src_addr` 在整个 SSD read/push 期间有效。 +- `InflightGetInfo.allocation` 持有 requester target;memory source 由 requester `get_done` 转成 holder,SSD source 由 SSD owner `get_done` 转成 holder。 + +### requester 触发 SSD owner stage/push/done + +请求方收到 `GetSourceKind::Ssd` 后,让 SSD owner 把数据读入 `src_addr`、按 chunk push 到 `target_addr + offset`,并由 SSD owner 直接完成 master `get_done`。这里没有新增用户 API;`SsdStageReadReq` 是 owner 内部 RPC。stage RPC 成功返回时,requester target 已经可读,并且 requester 已经拿到 master done 结果;请求方跳过自己的 transfer 分支,也跳过自己的 `get_done`。 + +```rust +let mut ssd_done_resp = None; +if resp.source_kind == GetSourceKind::Ssd { + let done_resp = self.stage_kv_from_ssd_source( + &resp.node_id, + key, + put_id, + get_id, + resp.src_addr, + resp.target_addr, + data_len as u64, + resp.ssd_stage_len, + ) + .await?; + ssd_done_resp = Some(done_resp); +} + +if resp.source_kind == GetSourceKind::Ssd { + // SSD owner already pushed all chunks to target_addr and called get_done. 
+} else { + self.view.client_transfer_engine() + .transfer_data_no_copy(peer_id, true, resp.src_addr, resp.target_addr, len, None) + .await?; +} + +let done_resp = if let Some(done_resp) = ssd_done_resp { + done_resp +} else { + self.get_done(get_id).await? +}; +``` + +`stage_kv_from_ssd_source(...)` 的分支只有两个: + +- `source_node_id == self`:本地调用 `load_and_push_kv_from_ssd(...)`,SSD read 生产 chunk,transfer consumer 把每个 chunk 写到本地 `target_addr + offset`,随后直接调用 `get_done(get_id)`。 +- 远端 SSD owner:发送 `SsdStageReadReq`,由 `rpc_ssd_stage_read` task 执行 `load_and_push_kv_from_ssd(...)`,SSD owner 每读出一个 chunk 就 push 到 requester target,全部 chunk transfer 完成后再调 `get_done(get_id)` 并通过 `SsdStageReadResp.done_*` 返回。 + +### SSD chunk read 与 direct/scratch fallback + +SSD owner 侧的核心结构是 `SsdLoadedChunk` 和 `ReadCommand`。`SsdLoadedChunk` 是 read producer 交给 transfer consumer 的最小就绪单元;`ReadCommand.file_offset` 让同一个 committed entry 可以按 chunk 提交不同文件偏移的读。 + +```rust +pub(crate) struct SsdLoadedChunk { + pub offset: u64, + pub stage_addr: u64, + pub len: u64, +} + +struct ReadCommand { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} +``` + +`load_and_push_kv_from_ssd(...)` 把 SSD read 和 transfer 并行起来。producer 最多保留 `DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT` 个读 IO;consumer 最多保留同样数量的 transfer future。这样大 payload 场景里,前一个 chunk 还在网络传输时,后续 chunk 可以继续从 SSD 读入 staging。 + +```rust +let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT.saturating_mul(2).max(1), +); + +let producer = store.load_into_addr_chunks( + key, + put_id, + stage_addr, + len, + stage_len, + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + chunk_tx, +); +let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); +let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); +``` + +`load_into_addr_chunks(...)` 先 pin 当前 committed entry,pin 
生命周期覆盖整个 producer。每个 chunk 根据 `stage_addr + offset`、`entry.file_offset + offset` 和剩余 staging 容量选择 direct 或 scratch;chunk read 完成后立即发送 `SsdLoadedChunk`。 + +```rust +let (entry, _read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { key: key.key.clone() })); + }; + (entry, SsdReadPin { ... }) +}; + +let file_offset = entry.file_offset + offset; +let target = match choose_chunk_read_path(stage_addr, read_len, target_len, file_offset) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr: stage_addr, + len: read_len as usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len as usize)?), +}; + +let output = submit_read_command(key, entry, file_offset, target, None).await?; +if let ReadOutput::Scratch(buffer) = output { + copy_payload_to_stage(buffer, stage_addr, payload_len)?; +} +ready_tx.send(SsdLoadedChunk { offset, stage_addr, len: payload_len }).await?; +``` + +direct 路径把 `readv` 的目标直接设为当前 chunk 的 source staging;scratch 路径先读入 aligned buffer,再只复制当前 chunk 的真实 payload 长度到 staging。两条路径最后都只把真实 payload 长度暴露给 transfer 和 `MemHolder`。 + +## IO 模型 + +```mermaid +flowchart TD + A["large_file_paths"] --> B["derive SSD roots"] + B --> C["create root dirs + metadata.dev()"] + C --> D["deduplicate device roots"] + D --> E0["SsdDeviceWorker device 0"] + D --> E1["SsdDeviceWorker device 1"] + E0 --> F0["shard_ids: 0,2,..."] + E1 --> F1["shard_ids: 1,3,..."] + F0 --> G0["device 0 writer/read queues"] + F1 --> G1["device 1 writer/read queues"] + G0 --> H0["device 0 UringIoEngine"] + G1 --> H1["device 1 UringIoEngine"] + I["persist_buffer"] --> J["next_write_device round-robin"] + J --> G0 + J --> G1 + K["submit_read_command(entry.shard_id)"] --> L["shard_to_device"] + L --> G0 + L --> G1 +``` + +| 组件 | 设计 | +| --- | --- | +| device root | owner 从 `large_file_paths` 派生 SSD root,创建目录后读取 `metadata.dev()`;同一 device 只保留第一个 root。 | +| shard 文件 | 每个 
owner 将 `max_bytes` 切成少量 shard,文件位于有效 device root 的 `shards/` 下,`shard_to_device` 记录 shard 到 device worker 的映射。 | +| 对齐 | 数据写入前复制到 512-byte 对齐 buffer,实际 IO 长度按 512-byte 向上对齐。 | +| 写队列 | `persist_from_addr` 只把任务送入某个 device 的有界 writer queue;后台 writer 控制 inflight 数量,并只在本 device 的 `shard_ids` 内分配 ring 空间。 | +| 读队列 | `load_into_addr_chunks` 先 pin committed 索引,再按 `entry.shard_id -> shard_to_device` 找到对应 device reader queue。只要 chunk staging 地址、文件 offset 和 staging 容量满足对齐约束,就直接读入目标 staging;否则读到 scratch aligned buffer 后只复制当前 chunk 的真实 payload 长度。 | +| io_uring | 每个有效 device 拥有自己的 `UringIoEngine`,engine 内多个后台线程持有 `IoUring`,使用 `readv/writev` 提交该 device 的 shard 文件 IO。底层每个 uring shard 有独立 read/write 发送队列,按 read/write inflight 比例调度,优先保护 kvcache 回填读延迟。 | +| 索引状态 | 新写入先进入 `Writing`;只有 IO 完成且 offset 仍有效时才转为 `Committed`。 | +| 位置保护 | `load_into_addr_chunks` 在 producer 生命周期内 pin committed entry;writer 分配新 ring 空间前检查 pinned read 和未完成 `Writing` entry,必要时等待 active IO 释放位置。 | +| ring 失效 | shard head 推进超过容量时推进 tail,并移除被覆盖 key-version 的本地索引。 | + +## Task / Actor / 独立线程 + +SSD 路径里有三层异步执行单元。控制面仍复用 KV 原有 actor;SSD 只为 owner 本地磁盘 IO 增加后台 task 和独立 uring 线程。owner 内部的 SSD task 按去重后的 effective device 创建;多个 `large_file_paths` 如果落在同一个 `metadata.dev()` 上,只创建一组 device worker。 + +| 执行单元 | 创建位置 | 类型 | 输入 | 职责 | +| --- | --- | --- | --- | --- | +| `ssd_writer_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.write_tx` | 从 `persist_from_addr` 接收写任务,只在本 device 的 `shard_ids` 内调用 `SsdRingBuffer::prepare_write_on_shards`,提交 `writev`,完成后 `commit(Writing -> Committed)`。 | +| `ssd_reader_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.read_tx` | 从 `load_into_addr_chunks` 接收属于本 device shard 的 chunk 读任务,提交 direct/scratch `readv`,校验 offset 仍有效,完成后回传 chunk 读结果;整条 producer 完成后释放 `SsdReadPin`。 | +| `fluxon-kv-ssd-uring-{idx}` | 每个 device 的 `UringIoEngine::new_multi` | `std::thread::spawn` | `read_rx/write_rx: crossbeam::channel` | 
每个线程持有一个 `IoUring`,只提交本 device shard 文件的 `Readv/Writev` SQE,并按 read/write inflight 比例调度后回传 CQE。 | +| `rpc_ssd_replica_commit` | `MasterKvRouter` RPC handler 注册 | `view.spawn(...)` | `SsdReplicaCommitReq` | owner SSD persist 成功后提交 SSD 副本,master 校验 `put_id` 后写 `ssd_replicas`。 | +| `rpc_ssd_stage_read` | `ClientKvApi` RPC handler 注册 | `view.spawn(...)` | `SsdStageReadReq` | 远端 SSD owner 收到 stage 请求后,在 owner 进程内调用 `load_and_push_kv_from_ssd(...)`;SSD read producer 和 transfer consumer 流水线完成后,再调用 master `get_done` 并回传 done fields。 | +| `ssd_failure_remove_prefix_index` | `get_revoke(drop_ssd_source=true)` | `view.spawn(...)` | 失败 SSD source 的 key | 当失败 SSD source 是最后一个 live replica 时,异步删除 prefix index。 | + +没有单独的 SSD master route actor。SSD route 的权威更新点仍是原有 master RPC handler: + +- `PutDone`:同步更新 `nodes_replicas`,让内存副本立即可读。 +- `SsdReplicaCommit`:SSD persist 完成后同步更新 `ssd_replicas`,并拒绝过期 `put_id`。 +- `GetStart`:同步选择内存副本或 SSD 副本,并写入 `inflight_gets`。 +- `GetRevoke`:同步删除失败 SSD source;必要时触发 prefix index 小任务。 +- `Delete` / 覆盖写失效:复用原有 `delete_broadcast` 管线。 + +后台 task 的生命周期绑定 `KvSsdStorage`: + +```rust +for device in deduplicated_device_roots { + let io = Arc::new(UringIoEngine::new_multi(device_shard_fds, cfg)?); + task::spawn(ssd_writer_loop(..., shard_ids.clone())); + task::spawn(ssd_reader_loop(...)); + devices.push(SsdDeviceWorker { shard_ids, _io: io, ... }); +} + +std::thread::Builder::new() + .name(format!("fluxon-kv-ssd-uring-{idx}")) + .spawn(move || UringShard { ... 
}.run())?; +``` + +`KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 `_files` 和 `_io`,确保该 device 的 shard fd 与 uring 线程生命周期覆盖所有读写 task。`UringIoEngine::drop` 会关闭 read/write channel,并 join 所有 uring 线程。 + +## 3FS 和 foyer 对照 + +| 参考点 | 对 kvcache SSD 的结论 | +| --- | --- | +| foyer read/write split queue | 已落地到底层 `UringIoEngine`。写入 flush 和回填读进入不同发送队列,同一 uring shard 内用 inflight 比例避免读饥饿。 | +| foyer multi-partition device | 已落地到 owner 内部 per-device worker。`large_file_paths` 仍是唯一配置来源;owner 按 `metadata.dev()` 去重后为每个有效 device 建独立 writer/read queue、uring engine 和 shard 集。 | +| foyer block buffer/reclaimer | 适合后续把小 key-version 合并成 blob,并用 blob index 加速恢复;当前 kvcache value 以较大连续 payload 为主,先保留单 key-version 连续写入。 | +| 3FS write-new-position then commit metadata | 当前 `Writing/Committed` 两阶段索引已经匹配这条原则:IO 成功前不暴露 SSD 副本。 | +| 3FS read holds chunk position reference | 已落地到 SSD ring 内部。读提交前 pin entry;tail 推进和物理 offset 复用必须避开 pinned read。 | +| 3FS aligned direct read | 已落地 aligned fast path。master 自己控制 SSD source staging allocation,因此可以在 allocation 内选择对齐后的 source 地址,并把 SSD IO 长度扩到 512-byte 对齐;真实 payload 长度仍用于 transfer 和用户可见 `MemHolder`。 | +| 3FS batch read/RDMA response | Fluxon 已复用现有 transfer engine,并已落地 read/transfer chunk pipeline;后续优化重点放在批量 SSD stage、批量 transfer 和小窗口 staging allocation。 | +| PegaFlow fire-and-forget SSD ingest | 已落地到 put 路径。master 在 `PutDone` 中先提交内存 route,再通过后台 `post_put_ssd_replica_persist` 触发 owner 本地 SSD persist;owner 落盘成功后用 `SsdReplicaCommitReq` 独立提交 SSD route。 | + +## 不变量 + +- `ssd_replicas` 和 `nodes_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`,不能跨版本复用。 +- `PutDoneReq` 只表示内存副本 ready,不能记录 SSD 副本。 +- master 只有在收到匹配当前 `put_id` 的 `SsdReplicaCommitReq` 后才能记录 SSD 副本。 +- `SsdReplicaCommitReq` 是内部控制面 RPC,不改变用户侧 `put/get/delete` API。 +- `GetSourceKind::Ssd` 必须有 source staging allocation,并由 master 持有到 SSD owner 发起的 `get_done` 或 requester 发起的 `get_revoke`。 +- SSD 回填失败必须通过 `get_revoke(drop_ssd_source=true)` 清理 in-flight get,并从 master 路由里移除失败的 SSD 副本。 +- SSD ring 本地失效后,master 可能短暂保留旧 SSD 
路由;下一次 stage 失败会触发主动路由失效。 +- SSD ring tail 推进不能覆盖 active IO:未完成的 `Writing` entry 和 pinned read entry 必须先释放。 +- SSD direct stage 只在目标地址、SSD 内部对齐长度和文件 offset 都满足 512-byte 对齐,且 staging 容量覆盖对齐长度时启用;transfer 和用户可见 `MemHolder` 长度始终保持真实 payload 长度。 +- master 路由被删除后,旧 SSD bytes 即使还在 shard 文件里,也不能被公共 `get` 命中。 + +## 关键结论 + +这套实现把 SSD 做成和 CPU segment 同级的分布式数据源副本,但不新增并行的用户 API 或传输协议。Pegaflow 的优势被放在 owner 内部 IO 层:异步 direct IO、分片 ring、提交态隔离和队列背压;foyer 的 read/write 队列调度用于保护回填读延迟;3FS 的位置生命周期、aligned direct read 和 read/transfer chunk pipeline 用于保护 active IO 并减少大 payload 回填拷贝和串行等待。后续重点是批量 SSD stage、批量 transfer、小窗口 staging allocation 和 pipeline 观测指标。Fluxon 的优势继续由原有内存 KV 路由、allocation、transfer 和 holder 生命周期承接。 diff --git a/fluxon_rs/Cargo.lock b/fluxon_rs/Cargo.lock index a4b0ecd..3e8638a 100644 --- a/fluxon_rs/Cargo.lock +++ b/fluxon_rs/Cargo.lock @@ -1237,6 +1237,7 @@ dependencies = [ "hyper 0.14.32", "iceoryx2", "iceoryx2-cal", + "io-uring", "kanal", "lazy_static", "libc", @@ -2395,6 +2396,17 @@ dependencies = [ "str_stack", ] +[[package]] +name = "io-uring" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9080b15e63775b9a2ac7dca720f7050a8b955e092ea0f6020a4a80f69998cdc0" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" diff --git a/fluxon_rs/fluxon_kv/Cargo.toml b/fluxon_rs/fluxon_kv/Cargo.toml index 22ff136..8208216 100644 --- a/fluxon_rs/fluxon_kv/Cargo.toml +++ b/fluxon_rs/fluxon_kv/Cargo.toml @@ -75,6 +75,7 @@ bytes = "1" pprof = { version = "0.15", features = ["flamegraph"] } hex = "0.4" sha2 = "0.10" +io-uring = "0.7" tokio-tungstenite = { version = "0.21", default-features = false, features = ["connect", "handshake"], optional = true } sockudo-ws = { version = "^1.7.4", default-features = false, features = ["tokio-runtime", "fastrand"], optional = true } diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs index 
f309dd0..29da3f8 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs @@ -13,7 +13,7 @@ use crate::{ cluster_manager::NodeID, master_kv_router::msg_pack::{ GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq, - GetStartReq, GetStartResp, + GetSourceKind, GetStartReq, GetStartResp, }, p2p::msg_pack::MsgPack, rpcresp_kvresult_convert::msg_and_error::codes_api, @@ -26,19 +26,27 @@ use std::sync::Arc; pub struct RemoteGetInfo { get_id: u64, data_len: usize, + source_kind: GetSourceKind, src_addr: u64, target_addr: u64, node_id: NodeID, peer_is_src_or_target: bool, } +impl RemoteGetInfo { + pub fn source_kind(&self) -> GetSourceKind { + self.source_kind + } +} + impl std::fmt::Display for RemoteGetInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "GetInfo{{ get_id: {}, data_len: {} bytes, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}", + "GetInfo{{ get_id: {}, data_len: {} bytes, source_kind: {:?}, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}", self.get_id, self.data_len, + self.source_kind, self.src_addr, self.target_addr, self.node_id, @@ -177,8 +185,80 @@ impl ClientKvApiInner { ); } + let mut ssd_done_resp = None; + if resp.source_kind == GetSourceKind::Ssd { + let ssd_stage_len = resp.ssd_stage_len; + if ssd_stage_len < data_len as u64 { + #[cfg(test)] + { + self.test_record.remove_transfering_get(get_id); + } + + self.get_revoke(get_id).await?; + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "invalid ssd stage len for key={} get_id={} data_len={} ssd_stage_len={}", + key, get_id, data_len, ssd_stage_len + ), + })); + } + let done_resp = match self + .stage_kv_from_ssd_source( + &resp.node_id, + key, + put_id, + get_id, + abs_src, + abs_target, + data_len as u64, + ssd_stage_len, + ) + .await + { + Ok(done_resp) => done_resp, + Err(err) => { + 
tracing::warn!( + "kv get ssd stage failed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}, err={}", + key, + resp.node_id, + abs_src, + abs_target, + data_len, + ssd_stage_len, + err + ); + + #[cfg(test)] + { + self.test_record.remove_transfering_get(get_id); + } + + obe_get_transfer_error(&metrics, &client_id, &node_role, key, data_len as u64); + self.get_revoke_ssd_source(get_id).await?; + return Err(err); + } + }; + ssd_done_resp = Some(done_resp); + tracing::debug!( + "kv get ssd staged and pushed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}", + key, + resp.node_id, + abs_src, + abs_target, + data_len, + ssd_stage_len + ); + } + // transfer data (skip if local and src==target to avoid redundant copy) - if peer_id.is_none() && abs_src == abs_target { + if resp.source_kind == GetSourceKind::Ssd { + tracing::debug!( + "kv get ssd owner push complete: key={}, target={:#x}, len={} (skip requester transfer)", + key, + abs_target, + data_len + ); + } else if peer_id.is_none() && abs_src == abs_target { tracing::debug!( "kv get local no-op: src==target {:#x}, len={} (skip transfer)", abs_target, @@ -249,12 +329,17 @@ impl ClientKvApiInner { // Removed post-transfer zero-header verification per request. - // Complete the get operation and get holder_id - let done_resp = match self.get_done(get_id).await { - Ok(resp) => resp, - Err(err) => { - obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64); - return Err(err); + // Complete the get operation and get holder_id. SSD source already called + // get_done after pushing into the requester target. 
+ let done_resp = if let Some(done_resp) = ssd_done_resp { + done_resp + } else { + match self.get_done(get_id).await { + Ok(resp) => resp, + Err(err) => { + obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64); + return Err(err); + } } }; let end_handle_us = done_resp.server_process_us; @@ -326,6 +411,7 @@ impl ClientKvApiInner { let get_info = RemoteGetInfo { get_id, data_len, + source_kind: resp.source_kind, src_addr: abs_src, target_addr: abs_target, node_id: resp.node_id.into(), @@ -435,8 +521,19 @@ impl ClientKvApiInner { /// 撤销 Get 操作,释放已分配的资源 pub async fn get_revoke(&self, get_id: u64) -> KvResult<()> { + self.get_revoke_inner(get_id, false).await + } + + async fn get_revoke_ssd_source(&self, get_id: u64) -> KvResult<()> { + self.get_revoke_inner(get_id, true).await + } + + async fn get_revoke_inner(&self, get_id: u64, drop_ssd_source: bool) -> KvResult<()> { let req = MsgPack { - serialize_part: GetRevokeReq { get_id }, + serialize_part: GetRevokeReq { + get_id, + drop_ssd_source, + }, raw_bytes: Vec::new(), }; diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs index dec19f5..bd4655b 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs @@ -3,11 +3,17 @@ use crate::client_kv_api::msg_pack::{ ExternalDeleteAckReq, ExternalDeleteAckResp, ExternalDeleteReq, ExternalDeleteResp, ExternalGetReq, ExternalGetResp, ExternalIsExistReq, ExternalIsExistResp, ExternalPutCommitReq, ExternalPutCommitResp, ExternalPutRevokeReq, ExternalPutRevokeResp, ExternalPutStartReq, - ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, SyncKvToFileReq, - SyncKvToFileResp, TestPutPhaseTrace, + ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, + SsdReplicaPersistReq, SsdReplicaPersistResp, SsdStageReadReq, SsdStageReadResp, + SyncKvToFileReq, SyncKvToFileResp, TestPutPhaseTrace, }; use 
crate::cluster_manager::NodeIDString; +use crate::cluster_manager::app_logic_ext::ClusterManagerAppLogicExt; use crate::config::TestSpecConfig; +use crate::kv_ssd_storage::{ + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + KvSsdStorage, KvSsdStorageInit, SsdLoadedChunk, +}; use crate::master_kv_router::msg_pack::{ BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, DeleteClientKvMetaCacheItem, }; @@ -22,8 +28,8 @@ use crate::{ client_transfer_engine::{ClientTransferEngine, ClientTransferEngineAccessTrait}, cluster_manager::{ClusterEvent, ClusterManager, ClusterManagerAccessTrait}, master_kv_router::msg_pack::{ - DeleteReq, GetDoneReq, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, PutRevokeReq, - PutStartReq, + DeleteReq, GetDoneReq, GetDoneResp, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, + PutRevokeReq, PutStartReq, SsdReplicaCommitReq, }, metric_reporter::{MetricReporter, MetricReporterAccessTrait}, metrics::{MetricsHandle, OperationKind, RequestStage}, @@ -37,6 +43,7 @@ use async_trait::async_trait; use dashmap::DashMap; use fluxon_framework::{LogicalModule, define_module}; use fluxon_util::map_lock::AMapLock; +use futures::stream::{FuturesUnordered, StreamExt}; use limit_thirdparty::tokio; use parking_lot::Mutex; use std::sync::Weak; @@ -451,6 +458,89 @@ async fn handle_external_put_revoke( } } +async fn handle_ssd_stage_read( + view: &ClientKvApiView, + msg: &MsgPack, +) -> MsgPack { + let req = msg.serialize_part.clone(); + let inner = view.client_kv_api().inner(); + let done_resp = match inner + .load_and_push_kv_from_ssd( + &req.key, + req.put_id, + req.stage_addr, + req.stage_len, + &req.target_node_id, + req.target_addr, + req.len, + ) + .await + { + Ok(()) => inner.get_done(req.get_id).await, + Err(err) => Err(err), + }; + + match done_resp { + Ok(done_resp) => MsgPack { + serialize_part: SsdStageReadResp { + done_holder_id: done_resp.holder_id, + done_allocation_mode: done_resp.allocation_mode, + 
done_error_code: done_resp.error_code, + done_error_json: done_resp.error_json, + done_server_process_us: done_resp.server_process_us, + error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + }, + Err(err) => MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }, + } +} + +async fn handle_ssd_replica_persist( + view: &ClientKvApiView, + msg: &MsgPack, +) -> MsgPack { + let req = msg.serialize_part.clone(); + let inner = view.client_kv_api().inner(); + let persisted = match inner + .persist_local_kv_to_ssd(&req.key, req.put_id, req.target_addr, req.len) + .await + { + Ok(persisted) => persisted, + Err(err) => { + return MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }; + } + }; + + if persisted { + if let Err(err) = inner + .commit_ssd_replica_to_master(&req.key, req.put_id, req.len) + .await + { + return MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }; + } + } + + MsgPack { + serialize_part: SsdReplicaPersistResp { + persisted, + error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + } +} + async fn handle_external_delete_ack( view: &ClientKvApiView, msg: &MsgPack, @@ -729,6 +819,7 @@ define_module!( #[derive(Clone, Debug)] pub struct ClientKvApiNewArg { pub test_spec_config: TestSpecConfig, + pub ssd_storage: Option, } pub struct ClientKvApi(ClientKvApiInner); @@ -775,6 +866,7 @@ impl std::ops::Deref for ClientKvApiViewHolder { pub struct ClientKvApiInner { view: ClientKvApiViewHolder, test_spec_config: TestSpecConfig, + ssd_storage: Option>, metrics: OnceLock>, /// make sure each remote kv get run in order @@ -818,6 +910,8 @@ pub struct ClientKvApiInner { rpc_caller_external_put_commit: RPCCaller, 
rpc_caller_external_put_revoke: RPCCaller, rpc_caller_resolve_side_transfer_lane: RPCCaller, + rpc_caller_ssd_stage_read: RPCCaller, + rpc_caller_ssd_replica_commit: RPCCaller, /// Default lease id recorded for inspection/convenience, but NOT auto-applied. /// Callers must explicitly pass `Some(lease_id)` to attach a put to a lease. @@ -900,6 +994,222 @@ impl ClientKvApiInner { pub(crate) fn skip_put_end_commit_enabled(&self) -> bool { self.test_spec_config.skip_put_end_commit } + + pub(crate) async fn persist_local_kv_to_ssd( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + abs_addr: u64, + len: u64, + ) -> KvResult { + let Some(store) = self.ssd_storage.as_ref() else { + return Ok(false); + }; + store.persist_from_addr(key, put_id, abs_addr, len).await?; + Ok(true) + } + + pub(crate) async fn commit_ssd_replica_to_master( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + len: u64, + ) -> KvResult<()> { + let node_id = self.view.cluster_manager().get_self_info().id.clone(); + let req = MsgPack { + serialize_part: SsdReplicaCommitReq { + key: key.to_string(), + put_id, + node_id, + len, + }, + raw_bytes: Vec::new(), + }; + let master_node_id = self + .view + .cluster_manager() + .find_or_wait_master_node() + .await?; + let resp = self + .rpc_caller_ssd_replica_commit + .call( + self.view.p2p_module(), + master_node_id.into(), + req, + Some(Duration::from_secs(60)), + 2, + ) + .await + .map_err(KvError::from)?; + crate::rpcresp_kvresult_convert::try_from_code( + resp.serialize_part.error_code, + resp.serialize_part.error_json, + ) + } + + pub(crate) async fn load_and_push_kv_from_ssd( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + stage_addr: u64, + stage_len: u64, + target_node_id: &NodeIDString, + target_addr: u64, + len: u64, + ) -> KvResult<()> { + let Some(store) = self.ssd_storage.as_ref() else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd 
storage is not enabled on this owner".to_string(), + })); + }; + + let self_node_id = &self.view.cluster_manager().get_self_info().id; + let peer_id = if target_node_id == self_node_id { + None + } else { + Some(target_node_id.clone()) + }; + let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT + .saturating_mul(2) + .max(1), + ); + let producer = store.load_into_addr_chunks( + key, + put_id, + stage_addr, + len, + stage_len, + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + chunk_tx, + ); + let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); + let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); + match (producer_res, consumer_res) { + (Ok(()), Ok(())) => Ok(()), + (_, Err(err)) => Err(err), + (Err(err), _) => Err(err), + } + } + + async fn transfer_loaded_ssd_chunks( + &self, + peer_id: Option, + target_addr: u64, + mut chunk_rx: ::tokio::sync::mpsc::Receiver, + ) -> KvResult<()> { + let mut inflight = FuturesUnordered::new(); + let mut rx_open = true; + + loop { + tokio::select! 
{ + maybe_chunk = chunk_rx.recv(), if rx_open && inflight.len() < DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT => { + match maybe_chunk { + Some(chunk) => { + let chunk_target_addr = target_addr.checked_add(chunk.offset).ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd transfer target addr overflow: target_addr={:#x} offset={}", + target_addr, + chunk.offset + ), + }) + })?; + let transfer_engine = self.view.client_transfer_engine(); + let peer_id = peer_id.clone(); + inflight.push(async move { + transfer_engine + .transfer_data_no_copy( + peer_id, + false, + chunk.stage_addr, + chunk_target_addr, + chunk.len, + None, + ) + .await?; + Ok::<(), KvError>(()) + }); + } + None => { + rx_open = false; + } + } + } + Some(result) = inflight.next(), if !inflight.is_empty() => { + result?; + } + else => { + if !rx_open && inflight.is_empty() { + break; + } + } + } + } + Ok(()) + } + + pub(crate) async fn stage_kv_from_ssd_source( + &self, + source_node_id: &NodeIDString, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + get_id: u64, + stage_addr: u64, + target_addr: u64, + len: u64, + stage_len: u64, + ) -> KvResult { + let self_node_id = self.view.cluster_manager().get_self_info().id.clone(); + if source_node_id == &self_node_id { + self.load_and_push_kv_from_ssd( + key, + put_id, + stage_addr, + stage_len, + &self_node_id, + target_addr, + len, + ) + .await?; + return self.get_done(get_id).await; + } + + let req = MsgPack { + serialize_part: SsdStageReadReq { + key: key.to_string(), + put_id, + get_id, + stage_addr, + stage_len, + target_node_id: self_node_id, + target_addr, + len, + }, + raw_bytes: Vec::new(), + }; + let resp = self + .rpc_caller_ssd_stage_read + .call( + self.view.p2p_module(), + source_node_id.clone().into(), + req, + Some(Duration::from_secs(60)), + 0, + ) + .await + .map_err(KvError::from)?; + let resp = resp.serialize_part; + crate::rpcresp_kvresult_convert::try_from_code(resp.error_code, 
resp.error_json)?; + Ok(GetDoneResp { + holder_id: resp.done_holder_id, + allocation_mode: resp.done_allocation_mode, + error_code: resp.done_error_code, + error_json: resp.done_error_json, + server_process_us: resp.done_server_process_us, + }) + } } #[derive(Debug, Clone)] @@ -1518,10 +1828,16 @@ impl ClientKvApi { pub async fn construct(arg: ClientKvApiNewArg) -> Result { tracing::info!("Constructing ClientKvApi in Client mode (PreView)"); + let ssd_storage = arg + .ssd_storage + .map(KvSsdStorage::new) + .transpose()? + .map(Arc::new); let inner = ClientKvApiInner { view: ClientKvApiViewHolder::new(), test_spec_config: arg.test_spec_config, + ssd_storage, metrics: OnceLock::new(), all_memholder_refcount: OnceLock::new(), get_remote_kv_lock: AMapLock::new(Duration::from_secs(60)), @@ -1554,6 +1870,8 @@ impl ClientKvApi { rpc_caller_external_put_commit: RPCCaller::new(), rpc_caller_external_put_revoke: RPCCaller::new(), rpc_caller_resolve_side_transfer_lane: RPCCaller::new(), + rpc_caller_ssd_stage_read: RPCCaller::new(), + rpc_caller_ssd_replica_commit: RPCCaller::new(), default_lease_id: parking_lot::RwLock::new(None), }; Ok(Self(inner)) @@ -1587,6 +1905,12 @@ impl ClientKvApi { inner .rpc_caller_resolve_side_transfer_lane .regist(inner.view.p2p_module()); + inner + .rpc_caller_ssd_stage_read + .regist(inner.view.p2p_module()); + inner + .rpc_caller_ssd_replica_commit + .regist(inner.view.p2p_module()); crate::key_prefix::init_for_p2p_owner(inner.view.p2p_module()); crate::kvlease::init_for_p2p_owner(inner.view.p2p_module()); // Register master-only metric RPC callers @@ -1686,6 +2010,31 @@ impl ClientKvApi { }, ); + let view_ext = inner.view.clone_view(); + RPCHandler::::new().regist(inner.view.p2p_module(), move |resp, msg| { + let view = view_ext.clone(); + let view_task = view.clone(); + let _ = view.spawn("rpc_ssd_stage_read", async move { + let result = handle_ssd_stage_read(&view_task, &msg).await; + let _ = resp.send_resp(result).await; + }); + Ok(()) + 
}); + + let view_ext = inner.view.clone_view(); + RPCHandler::::new().regist( + inner.view.p2p_module(), + move |resp, msg| { + let view = view_ext.clone(); + let view_task = view.clone(); + let _ = view.spawn("rpc_ssd_replica_persist", async move { + let result = handle_ssd_replica_persist(&view_task, &msg).await; + let _ = resp.send_resp(result).await; + }); + Ok(()) + }, + ); + let view_ext = inner.view.clone_view(); RPCHandler::::new().regist(inner.view.p2p_module(), move |resp, msg| { let view = view_ext.clone(); diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs index 55f0970..bae5437 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs @@ -1,8 +1,10 @@ +use crate::master_kv_router::msg_pack::GetAllocationMode; use crate::master_kv_router::put::PutIDForAKey; use crate::p2p::msg_pack::{MsgPackSerializePart, RPCReq}; use crate::rpcresp_kvresult_convert::msg_and_error::ErrorCode; use bitcode::{Decode, Encode}; +use crate::cluster_manager::NodeIDString; use crate::memholder::ExternalMemHolderInfo; #[derive(Default, Debug, Clone, Encode, Decode)] @@ -89,6 +91,76 @@ impl MsgPackSerializePart for ExternalGetResp { } } +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +impl MsgPackSerializePart for SsdStageReadReq { + fn msg_id(&self) -> u32 { + 4020 + } +} + +impl RPCReq for SsdStageReadReq { + type Resp = SsdStageReadResp; +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub 
error_json: String, +} + +impl MsgPackSerializePart for SsdStageReadResp { + fn msg_id(&self) -> u32 { + 4021 + } +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaPersistReq { + pub key: String, + pub put_id: PutIDForAKey, + pub target_addr: u64, + pub len: u64, +} + +impl MsgPackSerializePart for SsdReplicaPersistReq { + fn msg_id(&self) -> u32 { + 4022 + } +} + +impl RPCReq for SsdReplicaPersistReq { + type Resp = SsdReplicaPersistResp; +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaPersistResp { + pub persisted: bool, + pub error_code: ErrorCode, + pub error_json: String, +} + +impl MsgPackSerializePart for SsdReplicaPersistResp { + fn msg_id(&self) -> u32 { + 4023 + } +} + // #[derive(Default, Debug, Clone, Encode, Decode)] // pub struct ExternalPutReq { // pub key: String, diff --git a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs index 1aa6954..8c7cc78 100644 --- a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs @@ -237,10 +237,7 @@ impl ClientSegPool { std::path::Path::new(share_mem_path).join(SIDE_TRANSFER_PEERS_DIRNAME) } - pub fn side_transfer_peer_file_path( - share_mem_path: &str, - side_id: &str, - ) -> std::path::PathBuf { + pub fn side_transfer_peer_file_path(share_mem_path: &str, side_id: &str) -> std::path::PathBuf { Self::side_transfer_peers_dir(share_mem_path).join(format!("{side_id}.json")) } @@ -399,17 +396,13 @@ impl ClientSegPool { crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { path: String::new(), len: map_len as u64, - detail: "share_mem_path is empty; explicit configuration required" - .to_string(), + detail: "share_mem_path is empty; explicit configuration required".to_string(), }, )); } let base_path = &share_mem_path; - tracing::info!( - "Using share_mem_path: {} for memory-mapped file", - base_path - ); + tracing::info!("Using share_mem_path: {} 
for memory-mapped file", base_path); std::fs::create_dir_all(base_path).map_err(|e| { KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { diff --git a/fluxon_rs/fluxon_kv/src/config.rs b/fluxon_rs/fluxon_kv/src/config.rs index f9c7691..02f6e3f 100644 --- a/fluxon_rs/fluxon_kv/src/config.rs +++ b/fluxon_rs/fluxon_kv/src/config.rs @@ -581,6 +581,8 @@ pub struct FluxonKvSpecYaml { #[serde(skip_serializing_if = "Option::is_none")] pub large_file_paths: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub ssd_storage: Option>, + #[serde(skip_serializing_if = "Option::is_none")] pub p2p_listen_port: Option, #[serde(skip_serializing_if = "Option::is_none")] pub redis_compat: Option>, @@ -592,6 +594,17 @@ pub struct FluxonKvSpecYaml { #[serde(transparent)] pub struct LargeFilePathsYaml(pub Vec); +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct KvSsdStorageConfigYaml { + pub max_bytes: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct KvSsdStorageConfig { + pub max_bytes: u64, +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RedisCompatConfigYaml { @@ -682,6 +695,34 @@ impl LargeFilePaths { .into_kverror()) } + fn resolve_all_usable_root_subdirs( + &self, + relative_dir: &Path, + target_name: &str, + ) -> KvResult> { + self.require_configured_paths()?; + let mut out = Vec::new(); + let mut errors = Vec::new(); + for root in &self.paths { + let candidate = Path::new(root).join(relative_dir); + match fs::create_dir_all(&candidate) { + Ok(()) => out.push(candidate), + Err(err) => errors.push(format!("{} ({})", candidate.display(), err)), + } + } + if out.is_empty() { + return Err(ConfigError::InvalidClientConfig { + detail: format!( + "large_file_paths contains no usable root for {}; tried: {}", + target_name, + errors.join(", ") + ), + } + .into_kverror()); + } + Ok(out) + } + pub fn kv_logs_dir(&self, 
cluster_name: &str) -> KvResult { let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_kv_logs")); self.resolve_preferred_root_subdir(&relative_dir, "kv logs") @@ -714,6 +755,18 @@ impl LargeFilePaths { "fluxon fs disk cache", ) } + + pub fn kv_ssd_storage_dirs( + &self, + cluster_name: &str, + instance_key: &str, + ) -> KvResult> { + let relative_dir = PathBuf::from(format!( + "{cluster_name}_cluster_kv_ssd_storage/{}", + crate::kv_ssd_storage::safe_path_component(instance_key) + )); + self.resolve_all_usable_root_subdirs(&relative_dir, "kv ssd storage") + } } /// KV client backend types supported by the system @@ -733,8 +786,9 @@ pub struct ClientConfig { pub pprof_duration_seconds: Option, pub redis_compat_listen_addr: Option, pub fluxonkv_spec: FluxonKvSpec, - pub share_mem_path: String, // Mandatory shared bundle path + pub share_mem_path: String, // Mandatory shared bundle path pub large_file_paths: LargeFilePaths, // Mandatory large-file roots for logs and caches + pub ssd_storage: Option, pub test_spec_config: TestSpecConfig, } @@ -1028,6 +1082,13 @@ impl ClientConfigYaml { } .into_kverror()); } + if self.fluxonkv_spec.ssd_storage.is_some() { + return Err(ConfigError::InvalidClientConfig { + detail: "fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode" + .to_string(), + } + .into_kverror()); + } } // Preserve historical behavior for configs that omit `protocol`, but allow @@ -1170,13 +1231,15 @@ impl ClientConfigYaml { } else { let Some(large_file_paths_yaml) = self.fluxonkv_spec.large_file_paths.as_ref() else { return Err(ConfigError::InvalidClientConfig { - detail: "fluxonkv_spec.large_file_paths is required for owner mode" - .to_string(), + detail: "fluxonkv_spec.large_file_paths is required for owner mode".to_string(), } .into_kverror()); }; LargeFilePaths { - paths: verify_non_empty_root_path_list(&large_file_paths_yaml.0, "large_file_paths")?, + paths: verify_non_empty_root_path_list( + &large_file_paths_yaml.0, + 
"large_file_paths", + )?, } }; @@ -1204,6 +1267,28 @@ impl ClientConfigYaml { } }; + let ssd_storage = if is_external { + None + } else { + match std::mem::take(&mut self.fluxonkv_spec.ssd_storage) { + None | Some(YamlNullable::Null) => None, + Some(YamlNullable::Value(raw)) => { + if raw.max_bytes < crate::kv_ssd_storage::SSD_ALIGNMENT as u64 { + return Err(ConfigError::InvalidClientConfig { + detail: format!( + "fluxonkv_spec.ssd_storage.max_bytes must be >= {}", + crate::kv_ssd_storage::SSD_ALIGNMENT + ), + } + .into_kverror()); + } + Some(KvSsdStorageConfig { + max_bytes: raw.max_bytes, + }) + } + } + }; + Ok(ClientConfig { cluster_name: fluxonkv_spec.cluster_name.clone(), etcd_addresses_raw, @@ -1215,6 +1300,7 @@ impl ClientConfigYaml { fluxonkv_spec, share_mem_path, large_file_paths, + ssd_storage, test_spec_config, }) } @@ -1647,7 +1733,80 @@ fluxonkv_spec: .unwrap(); let err = cfg.verify().unwrap_err(); let text = format!("{err}"); - assert!(text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode")); + assert!( + text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode") + ); + } + + #[test] + fn client_config_owner_accepts_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_owner +contribute_to_cluster_pool_size: + dram: 16777216 + vram: {} +fluxonkv_spec: + etcd_addresses: ["127.0.0.1:2379"] + cluster_name: test_cluster + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] + ssd_storage: + max_bytes: 1048576 + sub_cluster: rack-a +"#, + ) + .unwrap(); + let verified = cfg.verify().unwrap(); + assert_eq!( + verified.ssd_storage.as_ref().map(|cfg| cfg.max_bytes), + Some(1048576) + ); + } + + #[test] + fn client_config_owner_rejects_too_small_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_owner +contribute_to_cluster_pool_size: + dram: 16777216 + vram: {} +fluxonkv_spec: + etcd_addresses: ["127.0.0.1:2379"] + 
cluster_name: test_cluster + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] + ssd_storage: + max_bytes: 1 + sub_cluster: rack-a +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!( + text.contains("fluxonkv_spec.ssd_storage.max_bytes must be >= 512"), + "{text}" + ); + } + + #[test] + fn client_config_zero_contribution_rejects_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_external +fluxonkv_spec: + cluster_name: test_cluster + share_mem_path: /tmp/test_external + ssd_storage: + max_bytes: 1048576 +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!(text.contains("fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode")); } #[test] @@ -1667,7 +1826,9 @@ fluxonkv_spec: let logs_dir = large_file_paths.kv_logs_dir("test_cluster").unwrap(); assert_eq!( logs_dir, - first_root.join("child").join("test_cluster_cluster_kv_logs") + first_root + .join("child") + .join("test_cluster_cluster_kv_logs") ); assert!(logs_dir.exists()); @@ -1683,6 +1844,32 @@ fluxonkv_spec: assert!(third_party_logs_dir.exists()); } + #[test] + fn large_file_paths_uses_all_usable_roots_for_kv_ssd_storage() { + let tempdir = new_test_dir("fluxon_large_paths_uses_all_usable_roots_for_kv_ssd_storage"); + let first_root = tempdir.join("first_root"); + let second_root = tempdir.join("second_root"); + + let large_file_paths = LargeFilePaths { + paths: vec![ + first_root.to_string_lossy().into_owned(), + second_root.to_string_lossy().into_owned(), + ], + }; + + let dirs = large_file_paths + .kv_ssd_storage_dirs("test_cluster", "owner/a:b") + .unwrap(); + assert_eq!( + dirs, + vec![ + first_root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"), + second_root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"), + ] + ); + assert!(dirs.iter().all(|dir| dir.exists())); + } + #[test] fn 
client_test_spec_config_accepts_explicit_rdma_device_names() { let cfg = ClientConfigYaml::from_str( diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs index da701cd..630a8ea 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs @@ -89,6 +89,7 @@ fn new_client_config( large_file_paths: LargeFilePaths { paths: vec![format!("{}_large", shm_path)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } @@ -130,6 +131,7 @@ fn new_zero_contribution_client_config( }, share_mem_path: shm_path.to_string(), large_file_paths: LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs index 9cb291f..b7715dd 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs @@ -865,8 +865,7 @@ impl ExternalInner { return Ok(false); } - self.finish_owner_recover(&share_mem_path, payload) - .await?; + self.finish_owner_recover(&share_mem_path, payload).await?; Ok(true) } diff --git a/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs new file mode 100644 index 0000000..26d711e --- /dev/null +++ b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs @@ -0,0 +1,2159 @@ +use crate::master_kv_router::put::PutIDForAKey; +use crate::rpcresp_kvresult_convert::msg_and_error::{ApiError, KvError, KvResult}; +use ::tokio::{ + sync::{Notify, mpsc as tokio_mpsc, oneshot}, + task, +}; +use futures::stream::{FuturesUnordered, StreamExt}; +use io_uring::{IoUring, opcode, types::Fd}; +use parking_lot::Mutex; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fs::{self, OpenOptions}; +use std::io; +use std::os::fd::{AsRawFd, 
RawFd}; +use std::os::unix::fs::MetadataExt; +use std::os::unix::fs::OpenOptionsExt; +use std::path::{Path, PathBuf}; +use std::ptr::NonNull; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::thread::JoinHandle; + +pub(crate) const SSD_ALIGNMENT: usize = 512; +const DEFAULT_SHARDS_PER_OWNER: usize = 4; +const DEFAULT_URING_THREADS: usize = 16; +const DEFAULT_URING_IO_DEPTH: usize = 128; +const DEFAULT_URING_READ_WEIGHT: usize = 2; +const DEFAULT_WRITE_QUEUE_DEPTH: usize = 8; +const DEFAULT_READ_QUEUE_DEPTH: usize = 16; +const DEFAULT_WRITE_INFLIGHT: usize = 2; +const DEFAULT_READ_INFLIGHT: usize = 16; +pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES: u64 = 4 * 1024 * 1024; +pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT: usize = 4; + +#[derive(Clone, Debug)] +pub struct KvSsdStorageInit { + pub root_dirs: Vec, + pub max_bytes: u64, +} + +#[derive(Debug)] +pub struct KvSsdStorage { + root_dirs: Vec, + devices: Vec, + shard_to_device: Vec, + next_write_device: AtomicUsize, + inner: Arc>, + space_notify: Arc, +} + +#[derive(Debug)] +struct SsdDeviceWorker { + device_id: u64, + root_dir: PathBuf, + shard_ids: Vec, + _files: Vec, + _io: Arc, + write_tx: tokio_mpsc::Sender, + read_tx: tokio_mpsc::Sender, +} + +#[derive(Clone, Debug)] +struct SsdDeviceRoot { + device_id: u64, + root_dir: PathBuf, +} + +struct OpenedSsdShard { + shard_id: usize, + device_idx: usize, + file: std::fs::File, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct SsdLoadedChunk { + pub offset: u64, + pub stage_addr: u64, + pub len: u64, +} + +#[derive(Debug)] +struct KvSsdStorageInner { + ring: SsdRingBuffer, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct KvSsdKey { + key: String, + put_id: PutIDForAKey, +} + +#[derive(Clone, Debug)] +struct SsdIndexEntry { + shard_id: usize, + begin: u64, + len: u64, + aligned_len: u64, + file_offset: u64, +} + +#[derive(Clone, Debug)] +struct SsdReadPinInfo { + entry: SsdIndexEntry, + count: 
usize, +} + +#[derive(Clone, Debug)] +enum SsdEntryState { + Writing(SsdIndexEntry), + Committed(SsdIndexEntry), +} + +impl SsdEntryState { + fn entry(&self) -> &SsdIndexEntry { + match self { + Self::Writing(entry) | Self::Committed(entry) => entry, + } + } +} + +#[derive(Debug)] +struct SsdShardRing { + capacity: u64, + head: u64, + tail: u64, + order: VecDeque, +} + +#[derive(Debug)] +struct SsdRingBuffer { + shards: Vec, + next_shard: usize, + entries: HashMap, + read_pins: HashMap, +} + +#[derive(Debug)] +enum SsdPreparedWrite { + Ready(SsdIndexEntry), + Existing, + BlockedByBusyIo, +} + +#[derive(Debug)] +enum SsdAllocation { + Ready { begin: u64, file_offset: u64 }, + BlockedByBusyIo, + TooLarge, +} + +impl SsdRingBuffer { + fn new(shard_capacities: Vec) -> Self { + assert!(!shard_capacities.is_empty()); + Self { + shards: shard_capacities + .into_iter() + .map(|capacity| SsdShardRing { + capacity, + head: 0, + tail: 0, + order: VecDeque::new(), + }) + .collect(), + next_shard: 0, + entries: HashMap::new(), + read_pins: HashMap::new(), + } + } + + #[cfg(test)] + fn get(&self, key: &KvSsdKey) -> Option { + match self.entries.get(key) { + Some(SsdEntryState::Committed(entry)) if self.is_offset_valid(entry) => { + Some(entry.clone()) + } + _ => None, + } + } + + fn pin_read(&mut self, key: &KvSsdKey) -> Option { + let entry = match self.entries.get(key) { + Some(SsdEntryState::Committed(entry)) if self.is_offset_valid(entry) => entry.clone(), + _ => return None, + }; + let pin = self + .read_pins + .entry(key.clone()) + .or_insert_with(|| SsdReadPinInfo { + entry: entry.clone(), + count: 0, + }); + pin.count += 1; + Some(entry) + } + + fn unpin_read(&mut self, key: &KvSsdKey) { + match self.read_pins.get_mut(key) { + Some(pin) if pin.count > 1 => pin.count -= 1, + Some(_) => { + self.read_pins.remove(key); + } + None => debug_assert!(false, "missing kv ssd read pin for key={key:?}"), + } + } + + #[cfg(test)] + fn prepare_write(&mut self, key: KvSsdKey, len: 
u64) -> KvResult { + let allowed_shards = (0..self.shards.len()).collect::>(); + self.prepare_write_on_shards(key, len, &allowed_shards) + } + + fn prepare_write_on_shards( + &mut self, + key: KvSsdKey, + len: u64, + allowed_shards: &[usize], + ) -> KvResult { + if allowed_shards.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd device has no shards".to_string(), + })); + } + if self.entries.contains_key(&key) { + return Ok(SsdPreparedWrite::Existing); + } + let aligned_len = align_up_u64(len, SSD_ALIGNMENT as u64)?; + let max_capacity = self + .shards + .iter() + .enumerate() + .filter(|(idx, _)| allowed_shards.contains(idx)) + .map(|(_, shard)| shard.capacity) + .max() + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd device has invalid shard set: {allowed_shards:?}"), + }) + })?; + if aligned_len > max_capacity { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd value len={} aligned_len={} exceeds shard capacity={}", + len, aligned_len, max_capacity + ), + })); + } + + let shard_count = self.shards.len(); + for offset in 0..shard_count { + let shard_id = (self.next_shard + offset) % shard_count; + if !allowed_shards.contains(&shard_id) { + continue; + } + let (begin, file_offset) = match self.allocate_contiguous(shard_id, aligned_len) { + SsdAllocation::Ready { begin, file_offset } => (begin, file_offset), + SsdAllocation::BlockedByBusyIo => continue, + SsdAllocation::TooLarge => unreachable!("aligned_len was checked against capacity"), + }; + self.next_shard = (shard_id + 1) % shard_count; + + let entry = SsdIndexEntry { + shard_id, + begin, + len, + aligned_len, + file_offset, + }; + self.entries + .insert(key.clone(), SsdEntryState::Writing(entry.clone())); + self.shards[shard_id].order.push_back(key); + return Ok(SsdPreparedWrite::Ready(entry)); + } + + Ok(SsdPreparedWrite::BlockedByBusyIo) + } + + fn allocate_contiguous(&mut self, shard_id: usize, size: 
u64) -> SsdAllocation { + let shard = &self.shards[shard_id]; + if size > shard.capacity { + return SsdAllocation::TooLarge; + } + let capacity = shard.capacity; + let mut head = shard.head; + let phys = head % capacity; + let space_until_end = capacity - phys; + if size > space_until_end { + head += space_until_end; + } + let begin = head; + let new_head = head + size; + let new_tail = new_head.saturating_sub(capacity); + if self.has_busy_entries_before(shard_id, new_tail) { + return SsdAllocation::BlockedByBusyIo; + } + + self.shards[shard_id].head = new_head; + self.advance_tail(shard_id, new_tail); + SsdAllocation::Ready { + begin, + file_offset: begin % capacity, + } + } + + fn advance_tail(&mut self, shard_id: usize, new_tail: u64) { + if new_tail <= self.shards[shard_id].tail { + return; + } + debug_assert!(!self.has_busy_entries_before(shard_id, new_tail)); + self.shards[shard_id].tail = new_tail; + + while let Some(key) = self.shards[shard_id].order.front() { + match self.entries.get(key) { + Some(state) if state.entry().begin >= new_tail => break, + _ => { + let key = self.shards[shard_id] + .order + .pop_front() + .expect("front key exists"); + self.entries.remove(&key); + } + } + } + } + + fn commit(&mut self, key: &KvSsdKey, success: bool) -> bool { + let Some(state) = self.entries.get(key) else { + return false; + }; + let entry = match state { + SsdEntryState::Writing(entry) => entry.clone(), + SsdEntryState::Committed(_) => return true, + }; + if !self.is_offset_valid(&entry) || !success { + self.entries.remove(key); + return false; + } + self.entries + .insert(key.clone(), SsdEntryState::Committed(entry)); + true + } + + fn remove(&mut self, key: &KvSsdKey) { + self.entries.remove(key); + } + + fn is_offset_valid(&self, entry: &SsdIndexEntry) -> bool { + self.shards + .get(entry.shard_id) + .is_some_and(|shard| entry.begin >= shard.tail) + } + + fn has_busy_entries_before(&self, shard_id: usize, new_tail: u64) -> bool { + if new_tail <= 
self.shards[shard_id].tail { + return false; + } + let writing_busy = self.entries.values().any(|state| match state { + SsdEntryState::Writing(entry) => entry.shard_id == shard_id && entry.begin < new_tail, + SsdEntryState::Committed(_) => false, + }); + if writing_busy { + return true; + } + self.read_pins + .values() + .any(|pin| pin.entry.shard_id == shard_id && pin.entry.begin < new_tail) + } +} + +struct SsdReadPin { + inner: Arc>, + space_notify: Arc, + key: KvSsdKey, +} + +impl Drop for SsdReadPin { + fn drop(&mut self) { + self.inner.lock().ring.unpin_read(&self.key); + self.space_notify.notify_one(); + } +} + +struct WriteCommand { + key: KvSsdKey, + entry_len: u64, + data: AlignedBuffer, + done_tx: oneshot::Sender>, +} + +struct ReadCommand { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +struct WriteTask { + key: KvSsdKey, + entry: SsdIndexEntry, + data: AlignedBuffer, + done_tx: oneshot::Sender>, +} + +struct ReadTask { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +struct WriteCompletion { + key: KvSsdKey, + success: bool, + result: KvResult<()>, + done_tx: oneshot::Sender>, +} + +struct ReadCompletion { + key: KvSsdKey, + entry: SsdIndexEntry, + result: KvResult, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +enum ReadTarget { + Scratch(AlignedBuffer), + Direct { target_addr: u64, len: usize }, +} + +enum ReadOutput { + Scratch(AlignedBuffer), + Direct, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum SsdReadPath { + Scratch, + Direct, +} + +pub fn safe_path_component(raw: &str) -> String { + let mut out = String::with_capacity(raw.len().max(1)); + for ch in raw.chars() { + if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.') { + out.push(ch); + } else { + out.push('_'); + } + } + if out.is_empty() { + "unnamed".to_string() + } else { + out + } 
+} + +impl KvSsdStorage { + pub fn new(init: KvSsdStorageInit) -> KvResult { + if init.max_bytes < SSD_ALIGNMENT as u64 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd storage max_bytes must be >= {}", SSD_ALIGNMENT), + })); + } + if init.root_dirs.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + + let device_roots = deduplicate_device_roots(&init.root_dirs)?; + let effective_root_dirs = device_roots + .iter() + .map(|root| root.root_dir.clone()) + .collect::>(); + let shard_count = choose_shard_count(init.max_bytes, device_roots.len()); + let shard_capacity = aligned_shard_capacity(init.max_bytes, shard_count)?; + let opened_shards = open_cache_files(&device_roots, shard_count, shard_capacity)?; + let inner = Arc::new(Mutex::new(KvSsdStorageInner { + ring: SsdRingBuffer::new(vec![shard_capacity; shard_count]), + })); + let space_notify = Arc::new(Notify::new()); + let mut shard_to_device = vec![0usize; shard_count]; + let mut device_shards = device_roots + .iter() + .map(|root| (root.clone(), Vec::<(usize, std::fs::File)>::new())) + .collect::>(); + for opened in opened_shards { + shard_to_device[opened.shard_id] = opened.device_idx; + device_shards[opened.device_idx] + .1 + .push((opened.shard_id, opened.file)); + } + + let mut devices = Vec::with_capacity(device_shards.len()); + for (device_root, shard_files) in device_shards { + let shard_ids = shard_files + .iter() + .map(|(shard_id, _)| *shard_id) + .collect::>(); + let fds = shard_files + .iter() + .map(|(shard_id, file)| (*shard_id, file.as_raw_fd())) + .collect::>(); + let io = Arc::new(UringIoEngine::new_multi( + fds, + UringConfig { + threads: DEFAULT_URING_THREADS, + io_depth: DEFAULT_URING_IO_DEPTH, + }, + )?); + let (write_tx, write_rx) = tokio_mpsc::channel(DEFAULT_WRITE_QUEUE_DEPTH); + let (read_tx, read_rx) = tokio_mpsc::channel(DEFAULT_READ_QUEUE_DEPTH); 
+ + task::spawn(ssd_writer_loop( + Arc::clone(&inner), + write_rx, + Arc::clone(&io), + Arc::clone(&space_notify), + DEFAULT_WRITE_INFLIGHT, + shard_ids.clone(), + )); + task::spawn(ssd_reader_loop( + Arc::clone(&inner), + read_rx, + Arc::clone(&io), + DEFAULT_READ_INFLIGHT, + )); + + devices.push(SsdDeviceWorker { + device_id: device_root.device_id, + root_dir: device_root.root_dir, + shard_ids, + _files: shard_files + .into_iter() + .map(|(_, file)| file) + .collect::>(), + _io: io, + write_tx, + read_tx, + }); + } + + Ok(Self { + root_dirs: effective_root_dirs, + devices, + shard_to_device, + next_write_device: AtomicUsize::new(0), + inner, + space_notify, + }) + } + + pub fn root_dirs(&self) -> &[PathBuf] { + &self.root_dirs + } + + fn next_write_tx(&self) -> KvResult> { + if self.devices.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage has no active device".to_string(), + })); + } + let idx = self.next_write_device.fetch_add(1, Ordering::Relaxed) % self.devices.len(); + Ok(self.devices[idx].write_tx.clone()) + } + + fn read_tx_for_shard(&self, shard_id: usize) -> KvResult> { + let Some(device_idx) = self.shard_to_device.get(shard_id).copied() else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd invalid shard id for read: {}", shard_id), + })); + }; + let Some(device) = self.devices.get(device_idx) else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd invalid device index for read: shard_id={} device_idx={}", + shard_id, device_idx + ), + })); + }; + if !device.shard_ids.contains(&shard_id) { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd shard/device route mismatch: shard_id={} device_idx={} device_id={} root_dir={}", + shard_id, + device_idx, + device.device_id, + device.root_dir.display() + ), + })); + } + Ok(device.read_tx.clone()) + } + + pub async fn persist_from_addr( + &self, + key: &str, + put_id: 
PutIDForAKey, + addr: u64, + len: u64, + ) -> KvResult<()> { + validate_key(key)?; + let len_usize = usize::try_from(len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd persist len does not fit usize: {}", len), + }) + })?; + let aligned_len = align_up_usize(len_usize, SSD_ALIGNMENT)?; + let data = unsafe { AlignedBuffer::copy_from_addr(addr, len_usize, aligned_len)? }; + self.persist_buffer(key, put_id, len, data).await + } + + pub async fn persist(&self, key: &str, put_id: PutIDForAKey, data: &[u8]) -> KvResult<()> { + validate_key(key)?; + let aligned_len = align_up_usize(data.len(), SSD_ALIGNMENT)?; + let mut buffer = AlignedBuffer::zeroed(aligned_len)?; + unsafe { + std::ptr::copy_nonoverlapping(data.as_ptr(), buffer.as_mut_ptr(), data.len()); + } + self.persist_buffer(key, put_id, data.len() as u64, buffer) + .await + } + + async fn persist_buffer( + &self, + key: &str, + put_id: PutIDForAKey, + entry_len: u64, + data: AlignedBuffer, + ) -> KvResult<()> { + let (done_tx, done_rx) = oneshot::channel(); + let write_tx = self.next_write_tx()?; + write_tx + .send(WriteCommand { + key: KvSsdKey { + key: key.to_string(), + put_id, + }, + entry_len, + data, + done_tx, + }) + .await + .map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd write queue closed: {}", err), + }) + })?; + done_rx.await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd write completion closed: {}", err), + }) + })? 
+ } + + pub async fn load_into_addr( + &self, + key: &str, + put_id: PutIDForAKey, + target_addr: u64, + len: u64, + target_len: u64, + ) -> KvResult<()> { + validate_key(key)?; + if target_len < len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd target capacity too small for key={} put_id=({},{}) len={} target_len={}", + key, put_id.0, put_id.1, len, target_len + ), + })); + } + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + let (entry, read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { + key: key.key.clone(), + })); + }; + ( + entry, + SsdReadPin { + inner: Arc::clone(&self.inner), + space_notify: Arc::clone(&self.space_notify), + key: key.clone(), + }, + ) + }; + if entry.len != len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd length mismatch for key={} put_id=({},{}) expected={} actual={}", + key.key, put_id.0, put_id.1, len, entry.len + ), + })); + } + + let len_usize = usize::try_from(len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd load len does not fit usize: {}", len), + }) + })?; + let aligned_len_usize = usize::try_from(entry.aligned_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd aligned load len does not fit usize: {}", + entry.aligned_len + ), + }) + })?; + let target = match choose_read_path(&entry, target_addr, len, target_len) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr, + len: aligned_len_usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(aligned_len_usize)?), + }; + let output = self + .submit_read_command( + key, + entry.clone(), + entry.file_offset, + target, + Some(read_pin), + ) + .await?; + if let ReadOutput::Scratch(buffer) = output { + unsafe { + std::ptr::copy_nonoverlapping(buffer.as_ptr(), target_addr as *mut u8, 
len_usize); + } + } + Ok(()) + } + + pub(crate) async fn load_into_addr_chunks( + &self, + key: &str, + put_id: PutIDForAKey, + target_addr: u64, + len: u64, + target_len: u64, + chunk_bytes: u64, + max_read_inflight: usize, + ready_tx: tokio_mpsc::Sender, + ) -> KvResult<()> { + validate_key(key)?; + if target_len < len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd target capacity too small for chunked load: key={} put_id=({},{}) len={} target_len={}", + key, put_id.0, put_id.1, len, target_len + ), + })); + } + let chunk_bytes = align_up_u64(chunk_bytes.max(1), SSD_ALIGNMENT as u64)?; + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + let (entry, _read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { + key: key.key.clone(), + })); + }; + ( + entry, + SsdReadPin { + inner: Arc::clone(&self.inner), + space_notify: Arc::clone(&self.space_notify), + key: key.clone(), + }, + ) + }; + if entry.len != len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd length mismatch for chunked load: key={} put_id=({},{}) expected={} actual={}", + key.key, put_id.0, put_id.1, len, entry.len + ), + })); + } + + let mut next_offset = 0u64; + let mut inflight = FuturesUnordered::new(); + let max_read_inflight = max_read_inflight.max(1); + + loop { + while next_offset < len && inflight.len() < max_read_inflight { + let payload_len = chunk_bytes.min(len - next_offset); + let stage_addr = checked_add_u64(target_addr, next_offset, "chunk stage addr")?; + let remaining_target_len = target_len - next_offset; + inflight.push(self.load_entry_range_into_addr( + key.clone(), + entry.clone(), + next_offset, + payload_len, + stage_addr, + remaining_target_len, + )); + next_offset += payload_len; + } + + let Some(chunk) = inflight.next().await else { + break; + }; + let chunk = chunk?; + 
ready_tx.send(chunk).await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd chunk ready queue closed: {}", err), + }) + })?; + } + Ok(()) + } + + async fn load_entry_range_into_addr( + &self, + key: KvSsdKey, + entry: SsdIndexEntry, + offset: u64, + payload_len: u64, + target_addr: u64, + target_len: u64, + ) -> KvResult { + if payload_len == 0 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd chunk payload len must be positive".to_string(), + })); + } + let payload_end = checked_add_u64(offset, payload_len, "chunk payload end")?; + if payload_end > entry.len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk exceeds entry len: offset={} len={} entry_len={}", + offset, payload_len, entry.len + ), + })); + } + let read_len = align_up_u64(payload_len, SSD_ALIGNMENT as u64)?; + let read_end = checked_add_u64(offset, read_len, "chunk read end")?; + if read_end > entry.aligned_len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd aligned chunk exceeds entry aligned len: offset={} read_len={} aligned_len={}", + offset, read_len, entry.aligned_len + ), + })); + } + if target_len < read_len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk target capacity too small: offset={} read_len={} target_len={}", + offset, read_len, target_len + ), + })); + } + let file_offset = checked_add_u64(entry.file_offset, offset, "chunk file offset")?; + let read_len_usize = usize::try_from(read_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd chunk read len does not fit usize: {}", read_len), + }) + })?; + let payload_len_usize = usize::try_from(payload_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk payload len does not fit usize: {}", + payload_len + ), + }) + })?; + let target = match choose_chunk_read_path(target_addr, 
read_len, target_len, file_offset) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr, + len: read_len_usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len_usize)?), + }; + let output = self + .submit_read_command(key, entry, file_offset, target, None) + .await?; + if let ReadOutput::Scratch(buffer) = output { + unsafe { + std::ptr::copy_nonoverlapping( + buffer.as_ptr(), + target_addr as *mut u8, + payload_len_usize, + ); + } + } + Ok(SsdLoadedChunk { + offset, + stage_addr: target_addr, + len: payload_len, + }) + } + + async fn submit_read_command( + &self, + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + read_pin: Option, + ) -> KvResult { + let (done_tx, done_rx) = oneshot::channel(); + let read_tx = self.read_tx_for_shard(entry.shard_id)?; + read_tx + .send(ReadCommand { + key, + entry, + file_offset, + target, + _read_pin: read_pin, + done_tx, + }) + .await + .map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd read queue closed: {}", err), + }) + })?; + done_rx.await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd read completion closed: {}", err), + }) + })? 
+ } + + #[cfg(test)] + async fn has_entry(&self, key: &str, put_id: PutIDForAKey) -> bool { + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + self.inner.lock().ring.get(&key).is_some() + } +} + +async fn ssd_writer_loop( + inner: Arc>, + mut rx: tokio_mpsc::Receiver, + io: Arc, + space_notify: Arc, + write_inflight: usize, + shard_ids: Vec, +) { + let mut pending: VecDeque = VecDeque::new(); + let mut inflight = FuturesUnordered::new(); + let max_inflight = write_inflight.max(1); + + loop { + while inflight.len() < max_inflight { + let Some(cmd) = pending.pop_front() else { + break; + }; + let prepared = { + let mut inner = inner.lock(); + inner + .ring + .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids) + }; + match prepared { + Ok(SsdPreparedWrite::Ready(entry)) => { + inflight.push(execute_write( + WriteTask { + key: cmd.key, + entry, + data: cmd.data, + done_tx: cmd.done_tx, + }, + Arc::clone(&io), + )); + } + Ok(SsdPreparedWrite::Existing) => { + let _ = cmd.done_tx.send(Ok(())); + } + Ok(SsdPreparedWrite::BlockedByBusyIo) => { + pending.push_front(cmd); + break; + } + Err(err) => { + let _ = cmd.done_tx.send(Err(err)); + } + } + } + + tokio::select! { + Some(completion) = inflight.next(), if !inflight.is_empty() => { + finish_write_completion(&inner, &space_notify, completion); + } + Some(cmd) = rx.recv() => { + pending.push_back(cmd); + } + _ = space_notify.notified(), if !pending.is_empty() => { + // Retry pending commands after an active read/write releases a ring position. 
+ } + else => { + if pending.is_empty() && inflight.is_empty() { + break; + } + }, + } + } + + while !pending.is_empty() || !inflight.is_empty() { + while inflight.len() < max_inflight { + let Some(cmd) = pending.pop_front() else { + break; + }; + let prepared = { + let mut inner = inner.lock(); + inner + .ring + .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids) + }; + match prepared { + Ok(SsdPreparedWrite::Ready(entry)) => { + inflight.push(execute_write( + WriteTask { + key: cmd.key, + entry, + data: cmd.data, + done_tx: cmd.done_tx, + }, + Arc::clone(&io), + )); + } + Ok(SsdPreparedWrite::Existing) => { + let _ = cmd.done_tx.send(Ok(())); + } + Ok(SsdPreparedWrite::BlockedByBusyIo) => { + pending.push_front(cmd); + break; + } + Err(err) => { + let _ = cmd.done_tx.send(Err(err)); + } + } + } + + if let Some(completion) = inflight.next().await { + finish_write_completion(&inner, &space_notify, completion); + } else if !pending.is_empty() { + space_notify.notified().await; + } + } +} + +fn finish_write_completion( + inner: &Arc>, + space_notify: &Notify, + completion: WriteCompletion, +) { + let committed = inner + .lock() + .ring + .commit(&completion.key, completion.success); + space_notify.notify_one(); + let result = if completion.success && !committed { + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + } else { + completion.result + }; + let _ = completion.done_tx.send(result); +} + +async fn execute_write(task: WriteTask, io: Arc) -> WriteCompletion { + let WriteTask { + key, + entry, + data, + done_tx, + } = task; + let data_len = data.len(); + let shard_id = entry.shard_id; + let file_offset = entry.file_offset; + let result = async move { + let rx = { + let data_ptr = data.as_ptr(); + io.writev_at_async(shard_id, vec![(data_ptr, data_len)], file_offset)? 
+ }; + let written = rx + .await + .map_err(|_| io::Error::other("kv ssd write completion dropped"))??; + if written != data_len { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + format!("short kv ssd write: {} != {}", written, data_len), + ) + .into()); + } + Ok(()) + } + .await; + let result = result.map_err(|err| file_error_for_entry(&key, file_offset, err)); + WriteCompletion { + key, + success: result.is_ok(), + result, + done_tx, + } +} + +async fn ssd_reader_loop( + inner: Arc>, + mut rx: tokio_mpsc::Receiver, + io: Arc, + read_inflight: usize, +) { + let mut pending = VecDeque::new(); + let mut inflight = FuturesUnordered::new(); + let max_inflight = read_inflight.max(1); + + loop { + while inflight.len() < max_inflight { + let Some(task) = pending.pop_front() else { + break; + }; + inflight.push(execute_read(task, Arc::clone(&io))); + } + + tokio::select! { + Some(completion) = inflight.next(), if !inflight.is_empty() => { + let valid = inner.lock().ring.is_offset_valid(&completion.entry); + let result = if valid { + completion.result + } else { + inner.lock().ring.remove(&completion.key); + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + }; + let _ = completion.done_tx.send(result); + } + Some(cmd) = rx.recv() => { + pending.push_back(ReadTask { + key: cmd.key, + entry: cmd.entry, + file_offset: cmd.file_offset, + target: cmd.target, + _read_pin: cmd._read_pin, + done_tx: cmd.done_tx, + }); + } + else => break, + } + } + + while let Some(completion) = inflight.next().await { + let valid = inner.lock().ring.is_offset_valid(&completion.entry); + let result = if valid { + completion.result + } else { + inner.lock().ring.remove(&completion.key); + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + }; + let _ = completion.done_tx.send(result); + } +} + +async fn execute_read(task: ReadTask, io: Arc) -> ReadCompletion { + let ReadTask { + key, + entry, + file_offset, + target, + 
_read_pin, + done_tx, + } = task; + let shard_id = entry.shard_id; + let result = async move { + match target { + ReadTarget::Scratch(mut buffer) => { + let buffer_len = buffer.len(); + let rx = { + let buffer_ptr = buffer.as_mut_ptr(); + io.readv_at_async(shard_id, vec![(buffer_ptr, buffer_len)], file_offset)? + }; + let read = rx + .await + .map_err(|_| io::Error::other("kv ssd read completion dropped"))??; + if read != buffer_len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("short kv ssd read: {} != {}", read, buffer_len), + )); + } + Ok(ReadOutput::Scratch(buffer)) + } + ReadTarget::Direct { target_addr, len } => { + let rx = + io.readv_at_async(shard_id, vec![(target_addr as *mut u8, len)], file_offset)?; + let read = rx + .await + .map_err(|_| io::Error::other("kv ssd read completion dropped"))??; + if read != len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("short kv ssd direct read: {} != {}", read, len), + )); + } + Ok(ReadOutput::Direct) + } + } + } + .await + .map_err(|err| file_error_for_entry(&key, file_offset, err)); + ReadCompletion { + key, + entry, + result, + _read_pin, + done_tx, + } +} + +#[derive(Clone, Copy)] +struct UringConfig { + threads: usize, + io_depth: usize, +} + +#[derive(Clone, Copy)] +enum IoType { + Readv, + Writev, +} + +struct IoCtx { + io_type: IoType, + fd: RawFd, + len: usize, + offset: u64, + complete: oneshot::Sender>, + iovecs: Box<[libc::iovec]>, +} + +unsafe impl Send for IoCtx {} + +struct UringShard { + read_rx: crossbeam::channel::Receiver, + write_rx: crossbeam::channel::Receiver, + uring: IoUring, + io_depth: usize, + read_weight: usize, +} + +impl UringShard { + fn run(mut self) { + let mut read_inflight = 0usize; + let mut write_inflight = 0usize; + let mut read_closed = false; + let mut write_closed = false; + + loop { + let mut inflight = read_inflight + write_inflight; + while inflight < self.io_depth && !(read_closed && write_closed) { + let next = 
self.try_recv_weighted( + &mut read_closed, + &mut write_closed, + read_inflight, + write_inflight, + ); + let Some(ctx) = next else { + break; + }; + self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight); + inflight = read_inflight + write_inflight; + } + + if read_closed && write_closed && inflight == 0 { + return; + } + if inflight == 0 { + let Some(ctx) = self.recv_blocking(&mut read_closed, &mut write_closed) else { + continue; + }; + self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight); + continue; + } + if let Err(err) = self.uring.submit_and_wait(1) { + while let Some(cqe) = self.uring.completion().next() { + let data = cqe.user_data(); + if data != 0 { + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + let _ = ctx.complete.send(Err(io::Error::other(format!( + "io_uring submit failed: {err}" + )))); + } + } + return; + } + + for cqe in self.uring.completion() { + let data = cqe.user_data(); + if data == 0 { + continue; + } + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + match ctx.io_type { + IoType::Readv => read_inflight = read_inflight.saturating_sub(1), + IoType::Writev => write_inflight = write_inflight.saturating_sub(1), + } + let res = cqe.result(); + let send_res = if res < 0 { + Err(io::Error::from_raw_os_error(-res)) + } else { + Ok(res as usize) + }; + let _ = ctx.complete.send(send_res); + } + } + } + + fn try_recv_weighted( + &self, + read_closed: &mut bool, + write_closed: &mut bool, + read_inflight: usize, + write_inflight: usize, + ) -> Option { + let prefer_read = read_inflight <= write_inflight.saturating_mul(self.read_weight); + if prefer_read { + self.try_recv_read(read_closed) + .or_else(|| self.try_recv_write(write_closed)) + } else { + self.try_recv_write(write_closed) + .or_else(|| self.try_recv_read(read_closed)) + } + } + + fn try_recv_read(&self, read_closed: &mut bool) -> Option { + if *read_closed { + return None; + } + match self.read_rx.try_recv() { + Ok(ctx) => Some(ctx), + 
Err(crossbeam::channel::TryRecvError::Empty) => None, + Err(crossbeam::channel::TryRecvError::Disconnected) => { + *read_closed = true; + None + } + } + } + + fn try_recv_write(&self, write_closed: &mut bool) -> Option { + if *write_closed { + return None; + } + match self.write_rx.try_recv() { + Ok(ctx) => Some(ctx), + Err(crossbeam::channel::TryRecvError::Empty) => None, + Err(crossbeam::channel::TryRecvError::Disconnected) => { + *write_closed = true; + None + } + } + } + + fn recv_blocking(&self, read_closed: &mut bool, write_closed: &mut bool) -> Option { + loop { + match (!*read_closed, !*write_closed) { + (true, true) => { + crossbeam::channel::select! { + recv(self.read_rx) -> msg => match msg { + Ok(ctx) => return Some(ctx), + Err(_) => *read_closed = true, + }, + recv(self.write_rx) -> msg => match msg { + Ok(ctx) => return Some(ctx), + Err(_) => *write_closed = true, + }, + } + } + (true, false) => match self.read_rx.recv() { + Ok(ctx) => return Some(ctx), + Err(_) => *read_closed = true, + }, + (false, true) => match self.write_rx.recv() { + Ok(ctx) => return Some(ctx), + Err(_) => *write_closed = true, + }, + (false, false) => return None, + } + } + } + + fn submit_ctx(&mut self, ctx: IoCtx, read_inflight: &mut usize, write_inflight: &mut usize) { + let fd = Fd(ctx.fd); + let iovecs_ptr = ctx.iovecs.as_ptr(); + let sqe = match ctx.io_type { + IoType::Readv => opcode::Readv::new(fd, iovecs_ptr, ctx.len as _) + .offset(ctx.offset) + .build(), + IoType::Writev => opcode::Writev::new(fd, iovecs_ptr, ctx.len as _) + .offset(ctx.offset) + .build(), + }; + let io_type = ctx.io_type; + let data = Box::into_raw(Box::new(ctx)) as u64; + let sqe = sqe.user_data(data); + let push_result = unsafe { self.uring.submission().push(&sqe) }; + if push_result.is_err() { + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + let _ = ctx + .complete + .send(Err(io::Error::other("submission queue full"))); + return; + } + match io_type { + IoType::Readv => 
*read_inflight += 1, + IoType::Writev => *write_inflight += 1, + } + } +} + +#[derive(Debug)] +struct UringIoEngine { + fds: HashMap, + read_txs: Vec>, + write_txs: Vec>, + handles: Vec>, +} + +impl UringIoEngine { + fn new_multi(shard_fds: Vec<(usize, RawFd)>, cfg: UringConfig) -> io::Result { + if cfg.threads == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "threads must be > 0", + )); + } + if shard_fds.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "at least one fd is required", + )); + } + let fds = shard_fds.into_iter().collect::>(); + let mut read_txs = Vec::with_capacity(cfg.threads); + let mut write_txs = Vec::with_capacity(cfg.threads); + let mut handles = Vec::with_capacity(cfg.threads); + for idx in 0..cfg.threads { + let (read_tx, read_rx) = crossbeam::channel::bounded(cfg.io_depth * 2); + let (write_tx, write_rx) = crossbeam::channel::bounded(cfg.io_depth * 2); + let uring = IoUring::builder().build(cfg.io_depth as u32)?; + let handle = std::thread::Builder::new() + .name(format!("fluxon-kv-ssd-uring-{idx}")) + .spawn(move || { + UringShard { + read_rx, + write_rx, + uring, + io_depth: cfg.io_depth, + read_weight: DEFAULT_URING_READ_WEIGHT, + } + .run() + })?; + read_txs.push(read_tx); + write_txs.push(write_tx); + handles.push(handle); + } + Ok(Self { + fds, + read_txs, + write_txs, + handles, + }) + } + + fn readv_at_async( + &self, + shard_id: usize, + iovecs: Vec<(*mut u8, usize)>, + offset: u64, + ) -> io::Result>> { + self.submit_iovecs(IoType::Readv, shard_id, iovecs, offset) + } + + fn writev_at_async( + &self, + shard_id: usize, + iovecs: Vec<(*const u8, usize)>, + offset: u64, + ) -> io::Result>> { + let iovecs = iovecs + .into_iter() + .map(|(ptr, len)| (ptr as *mut u8, len)) + .collect(); + self.submit_iovecs(IoType::Writev, shard_id, iovecs, offset) + } + + fn submit_iovecs( + &self, + io_type: IoType, + shard_id: usize, + iovecs: Vec<(*mut u8, usize)>, + offset: u64, + ) -> io::Result>> { + 
if iovecs.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "readv/writev requires at least one iovec", + )); + } + validate_direct_io( + iovecs.iter().map(|(ptr, len)| (*ptr as usize, *len)), + offset, + )?; + let iovecs_libc = iovecs + .iter() + .map(|(ptr, len)| libc::iovec { + iov_base: *ptr as *mut libc::c_void, + iov_len: *len, + }) + .collect::>() + .into_boxed_slice(); + let (tx, rx) = oneshot::channel(); + let ctx = IoCtx { + io_type, + fd: self.fd(shard_id)?, + len: iovecs_libc.len(), + offset, + complete: tx, + iovecs: iovecs_libc, + }; + self.pick_tx(io_type, shard_id).send(ctx).map_err(|err| { + io::Error::new( + io::ErrorKind::BrokenPipe, + format!("io_uring send failed: {}", err), + ) + })?; + Ok(rx) + } + + fn fd(&self, shard_id: usize) -> io::Result { + self.fds.get(&shard_id).copied().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid SSD shard id {shard_id}"), + ) + }) + } + + fn pick_tx(&self, io_type: IoType, shard_id: usize) -> &crossbeam::channel::Sender { + match io_type { + IoType::Readv => &self.read_txs[shard_id % self.read_txs.len()], + IoType::Writev => &self.write_txs[shard_id % self.write_txs.len()], + } + } +} + +impl Drop for UringIoEngine { + fn drop(&mut self) { + self.read_txs.clear(); + self.write_txs.clear(); + for handle in self.handles.drain(..) 
{ + let _ = handle.join(); + } + } +} + +struct AlignedBuffer { + ptr: NonNull, + len: usize, +} + +unsafe impl Send for AlignedBuffer {} + +impl AlignedBuffer { + fn zeroed(len: usize) -> KvResult { + if len == 0 || !len.is_multiple_of(SSD_ALIGNMENT) { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "aligned buffer len must be positive and {}-byte aligned: {}", + SSD_ALIGNMENT, len + ), + })); + } + let mut raw = std::ptr::null_mut(); + let rc = unsafe { libc::posix_memalign(&mut raw, SSD_ALIGNMENT, len) }; + if rc != 0 || raw.is_null() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("posix_memalign failed with rc={}", rc), + })); + } + unsafe { + std::ptr::write_bytes(raw as *mut u8, 0, len); + } + Ok(Self { + ptr: NonNull::new(raw as *mut u8).expect("posix_memalign returned non-null"), + len, + }) + } + + unsafe fn copy_from_addr(addr: u64, actual_len: usize, aligned_len: usize) -> KvResult { + let mut buffer = Self::zeroed(aligned_len)?; + unsafe { + std::ptr::copy_nonoverlapping(addr as *const u8, buffer.as_mut_ptr(), actual_len); + } + Ok(buffer) + } + + fn as_ptr(&self) -> *const u8 { + self.ptr.as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.as_ptr() + } + + fn len(&self) -> usize { + self.len + } +} + +impl Drop for AlignedBuffer { + fn drop(&mut self) { + unsafe { + libc::free(self.ptr.as_ptr() as *mut libc::c_void); + } + } +} + +fn validate_key(key: &str) -> KvResult<()> { + if key.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage key must be non-empty".to_string(), + })); + } + Ok(()) +} + +fn choose_read_path( + entry: &SsdIndexEntry, + target_addr: u64, + len: u64, + target_len: u64, +) -> SsdReadPath { + if len == 0 || entry.len != len { + return SsdReadPath::Scratch; + } + if target_addr.is_multiple_of(SSD_ALIGNMENT as u64) + && target_len >= entry.aligned_len + && entry.file_offset.is_multiple_of(SSD_ALIGNMENT as u64) + { + 
SsdReadPath::Direct + } else { + SsdReadPath::Scratch + } +} + +fn choose_chunk_read_path( + target_addr: u64, + read_len: u64, + target_len: u64, + file_offset: u64, +) -> SsdReadPath { + if read_len != 0 + && target_addr.is_multiple_of(SSD_ALIGNMENT as u64) + && read_len.is_multiple_of(SSD_ALIGNMENT as u64) + && target_len >= read_len + && file_offset.is_multiple_of(SSD_ALIGNMENT as u64) + { + SsdReadPath::Direct + } else { + SsdReadPath::Scratch + } +} + +fn choose_shard_count(max_bytes: u64, root_count: usize) -> usize { + let max_aligned_shards = (max_bytes / SSD_ALIGNMENT as u64).max(1) as usize; + DEFAULT_SHARDS_PER_OWNER + .max(root_count) + .min(max_aligned_shards) + .max(1) +} + +fn aligned_shard_capacity(capacity_bytes: u64, shard_count: usize) -> KvResult { + let raw = capacity_bytes / shard_count as u64; + let capacity = raw / SSD_ALIGNMENT as u64 * SSD_ALIGNMENT as u64; + if capacity == 0 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage capacity is too small for shard count".to_string(), + })); + } + Ok(capacity) +} + +fn deduplicate_device_roots(root_dirs: &[PathBuf]) -> KvResult> { + if root_dirs.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + let mut seen_devices = HashSet::new(); + let mut device_roots = Vec::new(); + for root_dir in root_dirs { + fs::create_dir_all(root_dir).map_err(|err| file_error(root_dir, 0, err))?; + let metadata = fs::metadata(root_dir).map_err(|err| file_error(root_dir, 0, err))?; + let device_id = metadata.dev(); + if seen_devices.insert(device_id) { + device_roots.push(SsdDeviceRoot { + device_id, + root_dir: root_dir.clone(), + }); + } + } + if device_roots.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs contains no usable device".to_string(), + })); + } + Ok(device_roots) +} + +fn open_cache_files( + device_roots: 
&[SsdDeviceRoot], + shard_count: usize, + shard_capacity: u64, +) -> KvResult> { + if device_roots.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + let mut files = Vec::with_capacity(shard_count); + for shard_id in 0..shard_count { + let device_idx = shard_id % device_roots.len(); + let root_dir = &device_roots[device_idx].root_dir; + let shards_dir = root_dir.join("shards"); + fs::create_dir_all(&shards_dir).map_err(|err| file_error(&shards_dir, 0, err))?; + let path = shards_dir.join(format!("shard-{shard_id:06}.dat")); + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(&path) + .map_err(|err| file_error(&path, 0, err))?; + file.set_len(shard_capacity) + .map_err(|err| file_error(&path, 0, err))?; + files.push(OpenedSsdShard { + shard_id, + device_idx, + file, + }); + } + Ok(files) +} + +fn align_up_usize(value: usize, alignment: usize) -> KvResult { + value + .checked_add(alignment - 1) + .map(|v| v / alignment * alignment) + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("alignment overflow for value={}", value), + }) + }) +} + +fn align_up_u64(value: u64, alignment: u64) -> KvResult { + value + .checked_add(alignment - 1) + .map(|v| v / alignment * alignment) + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("alignment overflow for value={}", value), + }) + }) +} + +pub(crate) fn align_ssd_io_len(len: u64) -> KvResult { + align_up_u64(len, SSD_ALIGNMENT as u64) +} + +fn checked_add_u64(lhs: u64, rhs: u64, label: &str) -> KvResult { + lhs.checked_add(rhs).ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd {label} overflow: {lhs} + {rhs}"), + }) + }) +} + +fn validate_direct_io( + iovecs: impl IntoIterator, + offset: u64, +) -> io::Result<()> { + ensure_aligned("offset", 
offset as usize)?; + for (addr, len) in iovecs { + ensure_aligned("buffer address", addr)?; + ensure_aligned("iovec length", len)?; + } + Ok(()) +} + +fn ensure_aligned(name: &str, value: usize) -> io::Result<()> { + if value.is_multiple_of(SSD_ALIGNMENT) { + Ok(()) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("O_DIRECT {name} {value:#x} is not {SSD_ALIGNMENT}-byte aligned"), + )) + } +} + +fn file_error_for_entry(key: &KvSsdKey, offset: u64, err: io::Error) -> KvError { + KvError::Api(ApiError::FileWriteError { + path: format!("kv-ssd://{}@({},{})", key.key, key.put_id.0, key.put_id.1), + offset, + detail: err.to_string(), + }) +} + +fn file_error(path: &Path, offset: u64, err: io::Error) -> KvError { + KvError::Api(ApiError::FileWriteError { + path: path.to_string_lossy().to_string(), + offset, + detail: err.to_string(), + }) +} + +impl From for KvError { + fn from(err: io::Error) -> Self { + KvError::Api(ApiError::FileWriteError { + path: "kv-ssd://io".to_string(), + offset: 0, + detail: err.to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use uuid::Uuid; + + fn new_root() -> PathBuf { + std::env::current_dir() + .unwrap() + .join("target") + .join("fluxon_kv_ssd_tests") + .join(Uuid::new_v4().to_string()) + } + + async fn new_store(max_bytes: u64) -> KvSsdStorage { + KvSsdStorage::new(KvSsdStorageInit { + root_dirs: vec![new_root()], + max_bytes, + }) + .unwrap() + } + + fn test_key(key: &str, version: u64) -> KvSsdKey { + KvSsdKey { + key: key.to_string(), + put_id: (version, 0), + } + } + + fn prepare_ready(ring: &mut SsdRingBuffer, key: &KvSsdKey) -> SsdIndexEntry { + match ring.prepare_write(key.clone(), 500).unwrap() { + SsdPreparedWrite::Ready(entry) => entry, + other => panic!("expected ready SSD write, got {other:?}"), + } + } + + #[::tokio::test] + async fn persist_and_load_roundtrip() { + let store = new_store(1024 * 1024).await; + let data = b"hello from ssd"; + let put_id = (10, 1); + 
store.persist("k", put_id, data).await.unwrap(); + + let mut out = vec![0u8; data.len()]; + store + .load_into_addr( + "k", + put_id, + out.as_mut_ptr() as u64, + out.len() as u64, + out.len() as u64, + ) + .await + .unwrap(); + assert_eq!(out, data); + } + + #[::tokio::test] + async fn aligned_load_roundtrip_uses_direct_target() { + let store = new_store(1024 * 1024).await; + let data = (0..4096).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (11, 1); + store.persist("aligned", put_id, &data).await.unwrap(); + + let mut out = AlignedBuffer::zeroed(data.len()).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let entry = { + let key = KvSsdKey { + key: "aligned".to_string(), + put_id, + }; + store.inner.lock().ring.get(&key).unwrap() + }; + assert_eq!( + choose_read_path(&entry, target_addr, data.len() as u64, data.len() as u64), + SsdReadPath::Direct + ); + + store + .load_into_addr( + "aligned", + put_id, + target_addr, + data.len() as u64, + data.len() as u64, + ) + .await + .unwrap(); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[::tokio::test] + async fn chunked_load_roundtrip_streams_ready_chunks() { + let store = new_store(1024 * 1024).await; + let data = (0..2500).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (13, 1); + store.persist("chunked", put_id, &data).await.unwrap(); + + let mut out = + AlignedBuffer::zeroed(align_ssd_io_len(data.len() as u64).unwrap() as usize).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let (tx, mut rx) = ::tokio::sync::mpsc::channel(2); + let producer = store.load_into_addr_chunks( + "chunked", + put_id, + target_addr, + data.len() as u64, + out.len() as u64, + 1024, + 2, + tx, + ); + let consumer = async { + let mut chunks = Vec::new(); + while let Some(chunk) = rx.recv().await { + chunks.push((chunk.offset, chunk.len)); + } + chunks + }; + let (producer_res, mut chunks) = ::tokio::join!(producer, 
consumer); + producer_res.unwrap(); + chunks.sort_unstable(); + assert_eq!(chunks, vec![(0, 1024), (1024, 1024), (2048, 452)]); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[test] + fn read_path_uses_direct_for_aligned_target_with_enough_capacity() { + let aligned = SsdIndexEntry { + shard_id: 0, + begin: 0, + len: 4096, + aligned_len: 4096, + file_offset: 0, + }; + assert_eq!( + choose_read_path(&aligned, 4096, 4096, 4096), + SsdReadPath::Direct + ); + assert_eq!( + choose_read_path(&aligned, 4097, 4096, 4096), + SsdReadPath::Scratch + ); + + let unaligned_len = SsdIndexEntry { + len: 500, + aligned_len: 512, + ..aligned + }; + assert_eq!( + choose_read_path(&unaligned_len, 4096, 500, 512), + SsdReadPath::Direct + ); + assert_eq!( + choose_read_path(&unaligned_len, 4096, 500, 500), + SsdReadPath::Scratch + ); + } + + #[::tokio::test] + async fn unaligned_payload_loads_direct_when_stage_capacity_is_aligned() { + let store = new_store(1024 * 1024).await; + let data = (0..500).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (12, 1); + store.persist("unaligned", put_id, &data).await.unwrap(); + + let mut out = AlignedBuffer::zeroed(SSD_ALIGNMENT).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let entry = { + let key = KvSsdKey { + key: "unaligned".to_string(), + put_id, + }; + store.inner.lock().ring.get(&key).unwrap() + }; + assert_eq!(entry.len, data.len() as u64); + assert_eq!(entry.aligned_len, SSD_ALIGNMENT as u64); + assert_eq!( + choose_read_path(&entry, target_addr, data.len() as u64, SSD_ALIGNMENT as u64), + SsdReadPath::Direct + ); + + store + .load_into_addr( + "unaligned", + put_id, + target_addr, + data.len() as u64, + SSD_ALIGNMENT as u64, + ) + .await + .unwrap(); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[::tokio::test] + async fn 
storage_deduplicates_root_dirs_on_same_device() { + let root_a = new_root(); + let root_b = new_root(); + let store = KvSsdStorage::new(KvSsdStorageInit { + root_dirs: vec![root_a.clone(), root_b.clone()], + max_bytes: 4 * SSD_ALIGNMENT as u64, + }) + .unwrap(); + + assert_eq!( + fs::metadata(&root_a).unwrap().dev(), + fs::metadata(&root_b).unwrap().dev() + ); + assert_eq!(store.root_dirs(), &[root_a.clone()]); + assert_eq!(store.devices.len(), 1); + assert_eq!(store.shard_to_device, vec![0, 0, 0, 0]); + assert!(root_a.join("shards/shard-000000.dat").exists()); + assert!(root_a.join("shards/shard-000001.dat").exists()); + assert!(root_a.join("shards/shard-000002.dat").exists()); + assert!(root_a.join("shards/shard-000003.dat").exists()); + assert!(!root_b.join("shards").exists()); + } + + #[test] + fn ring_prepare_write_on_shards_uses_only_allowed_shards() { + let mut ring = SsdRingBuffer::new(vec![1024, 1024, 1024, 1024]); + let mut allocated_shards = Vec::new(); + + for version in 0..4 { + let key = test_key("per-device", version); + let entry = match ring + .prepare_write_on_shards(key.clone(), 500, &[1, 3]) + .unwrap() + { + SsdPreparedWrite::Ready(entry) => entry, + other => panic!("expected ready SSD write, got {other:?}"), + }; + allocated_shards.push(entry.shard_id); + assert!(ring.commit(&key, true)); + } + + assert_eq!(allocated_shards, vec![1, 3, 1, 3]); + } + + #[::tokio::test] + async fn ring_keeps_new_entry_and_expires_old() { + let store = new_store(1024).await; + store.persist("old", (1, 0), &[1u8; 500]).await.unwrap(); + store.persist("filler", (2, 0), &[2u8; 500]).await.unwrap(); + store.persist("new", (3, 0), &[3u8; 500]).await.unwrap(); + + assert!(!store.has_entry("old", (1, 0)).await); + assert!(store.has_entry("filler", (2, 0)).await); + assert!(store.has_entry("new", (3, 0)).await); + } + + #[test] + fn ring_read_pin_blocks_overwrite_until_unpinned() { + let mut ring = SsdRingBuffer::new(vec![1024]); + let old = test_key("old", 1); + let 
filler = test_key("filler", 2); + let new = test_key("new", 3); + + let old_entry = prepare_ready(&mut ring, &old); + assert_eq!(old_entry.begin, 0); + assert!(ring.commit(&old, true)); + prepare_ready(&mut ring, &filler); + assert!(ring.commit(&filler, true)); + + let pinned = ring.pin_read(&old).unwrap(); + assert_eq!(pinned.begin, old_entry.begin); + assert!(matches!( + ring.prepare_write(new.clone(), 500).unwrap(), + SsdPreparedWrite::BlockedByBusyIo + )); + assert!(ring.get(&old).is_some()); + + ring.unpin_read(&old); + let new_entry = prepare_ready(&mut ring, &new); + assert_eq!(new_entry.file_offset, 0); + assert!(ring.commit(&new, true)); + assert!(ring.get(&old).is_none()); + } + + #[test] + fn ring_writing_entry_blocks_overwrite_until_write_finishes() { + let mut ring = SsdRingBuffer::new(vec![1024]); + let old = test_key("old", 1); + let filler = test_key("filler", 2); + let new = test_key("new", 3); + + let old_entry = prepare_ready(&mut ring, &old); + assert_eq!(old_entry.begin, 0); + prepare_ready(&mut ring, &filler); + + assert!(matches!( + ring.prepare_write(new.clone(), 500).unwrap(), + SsdPreparedWrite::BlockedByBusyIo + )); + + assert!(ring.commit(&old, true)); + let new_entry = prepare_ready(&mut ring, &new); + assert_eq!(new_entry.file_offset, 0); + } + + #[test] + fn safe_component_replaces_path_separators() { + assert_eq!(safe_path_component("owner/a:b"), "owner_a_b"); + } +} diff --git a/fluxon_rs/fluxon_kv/src/kv_test.rs b/fluxon_rs/fluxon_kv/src/kv_test.rs index 5f0a9e2..94d8ebe 100644 --- a/fluxon_rs/fluxon_kv/src/kv_test.rs +++ b/fluxon_rs/fluxon_kv/src/kv_test.rs @@ -11,9 +11,11 @@ use crate::cluster_manager::ClusterManagerRdmaControlInit; use crate::config::{ - ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, LargeFilePaths, MasterConfig, MonitoringConfig, - ProtocolConfig, ProtocolType, TestSpecConfig, TestSpecTransportMode, TransferEngineType, + ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, KvSsdStorageConfig, 
LargeFilePaths, + MasterConfig, MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig, + TestSpecTransportMode, TransferEngineType, }; +use crate::master_kv_router::msg_pack::GetSourceKind; use crate::run_master_with_test_overrides; use crate::{ClientRunTestOverrides, MasterRunTestOverrides, run_client_with_test_overrides}; // external client runs via run_client when contribution is zero @@ -38,6 +40,8 @@ const CLIENT_COMMUNICATION_VALUE: &[u8] = b"message_from_client1_to_client2"; const TRANSFER_DATA_PROBE_VALUE_LEN: usize = 256 * 1024; const KV_TEST_TRANSFER_PROBE_IO_TIMEOUT_SECS: u64 = 10; const KV_TEST_SHUTDOWN_TIMEOUT_SECS: u64 = 60; +const KV_TEST_SSD_STORAGE_BYTES: u64 = 64 * 1024 * 1024; +const KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS: u64 = 30; fn kv_test_run_scope() -> &'static str { static RUN_SCOPE: OnceLock = OnceLock::new(); @@ -610,6 +614,7 @@ struct KvTestClientOptions { enable_transfer_rpc_fast_path: Option, contribute_to_cluster_pool_size: Option, share_mem_path: Option, + ssd_storage: Option, etcd_mode: Option, } @@ -642,6 +647,10 @@ impl KvTestClientOptions { .share_mem_path .clone() .or_else(|| self.share_mem_path.clone()), + ssd_storage: overrides + .ssd_storage + .clone() + .or_else(|| self.ssd_storage.clone()), etcd_mode: overrides .etcd_mode .clone() @@ -650,6 +659,40 @@ impl KvTestClientOptions { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum KvTestStorageProfile { + Memory, + Ssd, + MemorySsd, +} + +impl KvTestStorageProfile { + fn round_suffix(self) -> &'static str { + match self { + Self::Memory => "", + Self::Ssd => "_ssd", + Self::MemorySsd => "_memory_ssd", + } + } + + fn ssd_storage(self) -> Option { + match self { + Self::Memory => None, + Self::Ssd | Self::MemorySsd => Some(KvSsdStorageConfig { + max_bytes: KV_TEST_SSD_STORAGE_BYTES, + }), + } + } + + fn requires_memory_source(self) -> bool { + matches!(self, Self::Memory | Self::MemorySsd) + } + + fn requires_ssd_source(self) -> bool { + matches!(self, 
Self::Ssd | Self::MemorySsd) + } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum KvTestRoundProfile { P2pOnly, @@ -760,6 +803,7 @@ fn kv_test_round_test_spec_config(round_profile: KvTestRoundProfile) -> TestSpec #[derive(Clone, Debug)] struct KvTestRoundOptions { round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, round_name: String, cluster_name: String, master_port: Option, @@ -803,6 +847,9 @@ impl KvTestRoundOptions { ) } + fn owner_sub_cluster(&self) -> String { + format!("{}_owners", self.round_name) + } } #[derive(Clone, Debug)] @@ -842,8 +889,7 @@ fn default_client_large_file_paths( instance_key: &str, contribute_to_cluster_pool_size: &ContributeToClusterPoolSize, ) -> LargeFilePaths { - if contribute_to_cluster_pool_size.dram == 0 - && contribute_to_cluster_pool_size.vram.is_empty() + if contribute_to_cluster_pool_size.dram == 0 && contribute_to_cluster_pool_size.vram.is_empty() { return LargeFilePaths { paths: Vec::new() }; } @@ -852,7 +898,10 @@ fn default_client_large_file_paths( } } -fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTestClientOptions { +fn default_owner_test_client_options( + round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, +) -> KvTestClientOptions { KvTestClientOptions { protocol_config: Some(round_profile.protocol_config()), transfer_engine: Some(round_profile.owner_transfer_engine()), @@ -861,6 +910,7 @@ fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTes enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), contribute_to_cluster_pool_size: Some(default_owner_contribute_to_cluster_pool_size()), share_mem_path: None, + ssd_storage: storage_profile.ssd_storage(), etcd_mode: Some(KvTestEtcdMode::Enabled), } } @@ -874,6 +924,7 @@ fn default_master_test_client_options(round_profile: KvTestRoundProfile) -> KvTe enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), 
contribute_to_cluster_pool_size: None, share_mem_path: None, + ssd_storage: None, etcd_mode: None, } } @@ -887,22 +938,31 @@ fn default_external_test_client_options() -> KvTestClientOptions { enable_transfer_rpc_fast_path: Some(false), contribute_to_cluster_pool_size: Some(default_external_contribute_to_cluster_pool_size()), share_mem_path: None, + ssd_storage: None, etcd_mode: Some(KvTestEtcdMode::Disabled), } } -fn new_kv_test_round(round_profile: KvTestRoundProfile) -> KvTestRoundOptions { - let round_name = round_profile.round_name(); +fn new_kv_test_round( + round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, +) -> KvTestRoundOptions { + let round_name = format!( + "{}{}", + round_profile.round_name(), + storage_profile.round_suffix() + ); KvTestRoundOptions { round_profile, - round_name: round_name.to_string(), + storage_profile, + round_name: round_name.clone(), // Keep each process run on its own cluster namespace so a crashed/aborted previous run // cannot poison the next rerun with stale members. 
cluster_name: format!("test_cluster_{}_{}", round_name, kv_test_run_scope()), master_port: None, step8_master_port: None, master_options: default_master_test_client_options(round_profile), - owner_client_options: default_owner_test_client_options(round_profile), + owner_client_options: default_owner_test_client_options(round_profile, storage_profile), external_client_options: default_external_test_client_options(), } } @@ -919,15 +979,35 @@ fn default_kv_test_run_options() -> KvTestRunOptions { .filter(|item| !item.is_empty()) { let profile = match round_name { - "p2p_only" => KvTestRoundProfile::P2pOnly, + "p2p_only" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::Memory, + )); + continue; + } + "p2p_only_ssd" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::Ssd, + )); + continue; + } + "p2p_only_memory_ssd" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::MemorySsd, + )); + continue; + } "rdma_transfer_only" => KvTestRoundProfile::RdmaTransferOnly, "rdma_transfer_with_rpc" => KvTestRoundProfile::RdmaTransferWithRpc, other => panic!( - "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, rdma_transfer_only, rdma_transfer_with_rpc", + "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, p2p_only_ssd, p2p_only_memory_ssd, rdma_transfer_only, rdma_transfer_with_rpc", other ), }; - rounds.push(new_kv_test_round(profile)); + rounds.push(new_kv_test_round(profile, KvTestStorageProfile::Memory)); } if rounds.is_empty() { panic!("FLUXON_KV_TEST_ROUNDS was set but produced no valid rounds"); @@ -937,9 +1017,17 @@ fn default_kv_test_run_options() -> KvTestRunOptions { KvTestRunOptions { rounds: vec![ - new_kv_test_round(KvTestRoundProfile::P2pOnly), - new_kv_test_round(KvTestRoundProfile::RdmaTransferOnly), - new_kv_test_round(KvTestRoundProfile::RdmaTransferWithRpc), + 
new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Memory), + new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Ssd), + new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::MemorySsd), + new_kv_test_round( + KvTestRoundProfile::RdmaTransferOnly, + KvTestStorageProfile::Memory, + ), + new_kv_test_round( + KvTestRoundProfile::RdmaTransferWithRpc, + KvTestStorageProfile::Memory, + ), ], } } @@ -1022,6 +1110,8 @@ fn build_client_launch( let contribute_to_cluster_pool_size = options .contribute_to_cluster_pool_size .unwrap_or(default_owner_contribute_to_cluster_pool_size()); + let is_external = contribute_to_cluster_pool_size.dram == 0 + && contribute_to_cluster_pool_size.vram.is_empty(); let share_mem_path = options .share_mem_path .unwrap_or_else(|| format!("/tmp/kvcache_shared_memory/{}", instance_key)); @@ -1043,7 +1133,11 @@ fn build_client_launch( enable_transfer_rpc_fast_path: options .enable_transfer_rpc_fast_path .expect("kv_test requires enable_transfer_rpc_fast_path to be set explicitly"), - sub_cluster: None, + sub_cluster: if is_external { + None + } else { + Some(round.owner_sub_cluster()) + }, }, // English note: // kv_test uses a per-instance shared memory path by default so each owner/external share @@ -1054,6 +1148,7 @@ fn build_client_launch( &instance_key, &contribute_to_cluster_pool_size, ), + ssd_storage: options.ssd_storage, // Mirror round intent into the generated config so logs and runtime behavior // agree on whether this launch is transfer_only vs transfer_with_rpc. 
test_spec_config: kv_test_round_test_spec_config(round.round_profile), @@ -1381,7 +1476,10 @@ async fn key_meta_cache_check( } } - tracing::info!("🔍 Starting PUT and GET in parallel: {}", parallel_unique_key); + tracing::info!( + "🔍 Starting PUT and GET in parallel: {}", + parallel_unique_key + ); for i in 0..10 { let (put_client, other_client) = if i % 2 == 0 { (client, client2) @@ -1420,7 +1518,9 @@ async fn key_meta_cache_check( } assert!( - put_client.client_kv_api().has_cached_key(parallel_unique_key), + put_client + .client_kv_api() + .has_cached_key(parallel_unique_key), "put client should have immediate local cache metadata for key {} after put time {}", parallel_unique_key, i @@ -1577,6 +1677,208 @@ async fn shutdown_framework_with_timeout(label: &str, framework: &crate::Framewo } } +fn build_storage_profile_probe_value(tag: &str) -> Vec { + const STORAGE_PROFILE_PROBE_VALUE_LEN: usize = 64 * 1024; + build_storage_profile_probe_value_with_len(tag, STORAGE_PROFILE_PROBE_VALUE_LEN) +} + +fn build_storage_profile_probe_value_with_len(tag: &str, len: usize) -> Vec { + let pattern = format!("kv_test_storage_profile:{tag}:").into_bytes(); + let mut value = Vec::with_capacity(len); + while value.len() < len { + value.extend_from_slice(pattern.as_slice()); + } + value.truncate(len); + value +} + +async fn force_evict_memory_replicas_for_storage_probe( + master_framework: &crate::Framework, + key: &str, +) { + let master_view = master_framework.master_kv_router_view(); + let deadline = + Instant::now() + Duration::from_secs(KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS); + let (put_id, memory_replica_nodes) = loop { + if let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) { + let put_id = route.put_id; + let memory_replica_nodes = route + .nodes_replicas + .read() + .keys() + .cloned() + .collect::>(); + let ssd_replica_count = route.ssd_replicas.read().len(); + if ssd_replica_count > 0 { + break (put_id, memory_replica_nodes); + } + } + + 
if Instant::now() >= deadline { + panic!( + "storage profile probe expected at least one SSD replica before memory eviction: key={} timeout={}s", + key, KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS + ); + } + sleep(Duration::from_millis(50)).await; + }; + + for node_id in memory_replica_nodes { + crate::master_kv_router::delete::evict_one_kv_replica_for_node( + &master_view, + key.to_string(), + node_id.clone(), + put_id, + ) + .unwrap_or_else(|code| { + panic!( + "storage profile probe failed to evict memory replica: key={} node={} put_id=({},{}) code={}", + key, node_id, put_id.0, put_id.1, code + ) + }); + } + + let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) else { + panic!("storage profile probe route disappeared after memory replicas eviction: key={key}"); + }; + assert!( + route.nodes_replicas.read().is_empty(), + "storage profile probe memory replicas still exist after eviction: key={}", + key + ); + assert!( + !route.ssd_replicas.read().is_empty(), + "storage profile probe SSD replica disappeared after memory replicas eviction: key={}", + key + ); +} + +async fn assert_owner_get_source_kind( + reader_framework: &crate::Framework, + key: &str, + expected_value: &[u8], + expected_source_kind: GetSourceKind, +) { + let reader_view = reader_framework.client_kv_api_view().clone(); + let reader_api = reader_view.client_kv_api(); + let (mem_holder, get_info) = reader_api + .inner() + .get(key) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile probe get failed: key={} expected_source={:?} err={}", + key, expected_source_kind, err + ) + }) + .unwrap_or_else(|| { + panic!( + "storage profile probe get returned None: key={} expected_source={:?}", + key, expected_source_kind + ) + }); + assert_eq!( + mem_holder.bytes(), + expected_value, + "storage profile probe value mismatch for key={key}" + ); + let Some(get_info) = get_info else { + panic!( + "storage profile probe expected remote get info for key={} source={:?}", + key, 
expected_source_kind + ); + }; + assert_eq!( + get_info.source_kind(), + expected_source_kind, + "storage profile probe source kind mismatch for key={key}" + ); +} + +async fn run_non_rdma_storage_profile_coverage( + round: &KvTestRoundOptions, + master_framework: &crate::Framework, + writer_framework: &crate::Framework, +) -> Option> { + if round.round_profile != KvTestRoundProfile::P2pOnly { + return None; + } + + info!( + "📋 Storage profile coverage: round={} storage={:?}", + round.round_name, round.storage_profile + ); + + let writer_view = writer_framework.client_kv_api_view().clone(); + let writer_api = writer_view.client_kv_api(); + let storage_probe_put_opts = || { + crate::client_kv_api::PutOptionalArgs(vec![ + crate::client_kv_api::PutOptionalArg::PreferredSubCluster(round.owner_sub_cluster()), + ]) + }; + + let memory_key = format!("storage_profile_memory_key_{}", round.round_name); + let memory_value = build_storage_profile_probe_value(&format!("{}:memory", round.round_name)); + if round.storage_profile.requires_memory_source() { + writer_api + .inner() + .put(&memory_key, &memory_value, storage_probe_put_opts()) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile memory probe put failed: key={} err={}", + memory_key, err + ) + }); + } + + let ssd_key = format!("storage_profile_ssd_key_{}", round.round_name); + let ssd_value = build_storage_profile_probe_value_with_len( + &format!("{}:ssd", round.round_name), + 64 * 1024 + 123, + ); + if round.storage_profile.requires_ssd_source() { + writer_api + .inner() + .put(&ssd_key, &ssd_value, storage_probe_put_opts()) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile SSD probe put failed: key={} err={}", + ssd_key, err + ) + }); + force_evict_memory_replicas_for_storage_probe(master_framework, &ssd_key).await; + } + + let reader_launch = new_client_launch(round, "test_storage_profile_reader", None); + let (reader_framework, _) = run_kv_test_client(reader_launch) + .await + 
.expect("Failed to start storage profile reader"); + + sleep(Duration::from_secs(10)).await; + + if round.storage_profile.requires_memory_source() { + assert_owner_get_source_kind( + &reader_framework, + &memory_key, + &memory_value, + GetSourceKind::Memory, + ) + .await; + } + if round.storage_profile.requires_ssd_source() { + assert_owner_get_source_kind(&reader_framework, &ssd_key, &ssd_value, GetSourceKind::Ssd) + .await; + } + + info!( + "✅ Storage profile coverage passed: round={} storage={:?}", + round.round_name, round.storage_profile + ); + Some(reader_framework) +} + async fn run_kv_step8(round: &KvTestRoundOptions) { info!("📋 Step 8: Verifying external client blocking and recovery behavior"); @@ -2720,6 +3022,9 @@ async fn run_kv_round(round: &KvTestRoundOptions) { info!("✅ Key meta cache testing completed"); } + let storage_profile_reader_framework = + run_non_rdma_storage_profile_coverage(round, &master_framework, &client1_framework).await; + // 清理旧资源 { info!("🧹 Cleaning up resources"); @@ -2743,6 +3048,14 @@ async fn run_kv_round(round: &KvTestRoundOptions) { .unwrap_or_else(|e| panic!("Client 1 framework shutdown failed: {}", e)); info!("✅ Client 1 framework shutdown successfully"); + if let Some(storage_profile_reader_framework) = storage_profile_reader_framework { + shutdown_framework_with_timeout( + "storage profile reader", + &storage_profile_reader_framework, + ) + .await; + } + master_framework .shutdown() .await diff --git a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs index c74b64a..43d3c09 100644 --- a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs +++ b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs @@ -148,6 +148,7 @@ fn new_client_config_with_cluster_and_dram( large_file_paths: crate::config::LargeFilePaths { paths: vec![format!("{}/large/{}", base, instance_key)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), }; println!("fluxonkv core created client config for test: {:?}", conf); 
diff --git a/fluxon_rs/fluxon_kv/src/lib.rs b/fluxon_rs/fluxon_kv/src/lib.rs index edaa386..3b1116d 100644 --- a/fluxon_rs/fluxon_kv/src/lib.rs +++ b/fluxon_rs/fluxon_kv/src/lib.rs @@ -7,6 +7,7 @@ pub mod external_client_api; pub mod panel_proxy; // #[cfg(test)] pub mod key_prefix; +pub mod kv_ssd_storage; #[cfg(feature = "test_bins")] pub mod kv_test; pub mod kvlease; @@ -797,6 +798,7 @@ fn build_side_transfer_worker_config( }, share_mem_path: owner_config.share_mem_path.clone(), large_file_paths: owner_config.large_file_paths.clone(), + ssd_storage: None, test_spec_config, }) } @@ -841,6 +843,7 @@ fn build_side_transfer_worker_config_yaml( cluster_name: side_config.cluster_name, share_mem_path: side_config.share_mem_path, large_file_paths: None, + ssd_storage: None, p2p_listen_port: side_config.fluxonkv_spec.p2p_listen_port, redis_compat: None, sub_cluster: None, @@ -1915,6 +1918,9 @@ async fn run_client_impl( if is_side_transfer_worker { metadata.insert("side_transfer_worker".to_string(), "true".to_string()); } + if !is_external && !is_side_transfer_worker && config.ssd_storage.is_some() { + metadata.insert("kv_ssd_storage".to_string(), "true".to_string()); + } // Local IPC routing requires both share-group owner id and the local IPC root. 
// The owner id is also published via a dedicated share-group key; we denormalize it into @@ -2004,6 +2010,20 @@ async fn run_client_impl( .await .map_err(|e| anyhow::anyhow!("Failed to initialize framework: {:#}", e))?; } else { + let ssd_storage = if is_side_transfer_worker { + None + } else if let Some(ssd_cfg) = config.ssd_storage.as_ref() { + let root_dirs = config + .large_file_paths + .kv_ssd_storage_dirs(&config.cluster_name, &config.instance_key) + .map_err(|err| anyhow::anyhow!("invalid kv ssd storage dirs: {}", err))?; + Some(crate::kv_ssd_storage::KvSsdStorageInit { + root_dirs, + max_bytes: ssd_cfg.max_bytes, + }) + } else { + None + }; let init_args = InitArgsOwner { cluster_manager_arg: ClusterManagerNewArg { etcd_endpoints: config.fluxonkv_spec.etcd_addresses.clone(), @@ -2036,6 +2056,7 @@ async fn run_client_impl( }, client_kv_api_arg: ClientKvApiNewArg { test_spec_config: config.test_spec_config.clone(), + ssd_storage, }, client_seg_pool_arg: ClientSegPoolNewArg { contribute_size: config.contribute_to_cluster_pool_size.clone(), @@ -2468,6 +2489,7 @@ mod tests { large_file_paths: crate::config::LargeFilePaths { paths: vec!["/tmp/fluxon_side_transfer_test_large".to_string()], }, + ssd_storage: None, test_spec_config: TestSpecConfig { enable_side_transfer: true, side_transfer_worker_count: 4, @@ -2736,8 +2758,8 @@ mod tests { large_file_paths: crate::config::LargeFilePaths { paths: vec![owner_large_root.to_string_lossy().into_owned()], }, - protocol_version: - fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(), + protocol_version: fluxon_util::git_version_build_record::get_current_git_commitid() + .unwrap(), write_ts: Some(chrono::Utc::now().timestamp_micros()), }; let shared_meta_json = serde_json::to_string(&shared_meta).unwrap(); @@ -2773,6 +2795,7 @@ mod tests { }, share_mem_path: share_mem_root.to_string_lossy().into_owned(), large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, 
test_spec_config: TestSpecConfig::default(), }; diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs index 12a55ee..52ac76e 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs @@ -130,7 +130,7 @@ pub fn evict_one_kv_replica_for_node( return Ok(()); } - let last_replica_gone = route.nodes_replicas.read().is_empty(); + let last_replica_gone = !route.has_live_replica(); if last_replica_gone { let removed = view .master_kv_router() diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs index 8c17155..346df40 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs @@ -2,9 +2,10 @@ use super::{ InflightGetInfo, KvRouteInfo, MasterKvRouterView, NodeValueReplicaDesc, OwnerHoldingGetInfo, msg_pack::{ GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq, - GetRevokeResp, GetStartReq, GetStartResp, + GetRevokeResp, GetSourceKind, GetStartReq, GetStartResp, }, }; +use crate::kv_ssd_storage::{SSD_ALIGNMENT, align_ssd_io_len}; use crate::master_kv_router::OneKvNodesRoutes; use crate::master_kv_router::put::PutIDForAKey; use crate::memholder::MemholderManagerTrait; @@ -82,7 +83,7 @@ pub async fn handle_get_start( let mut remove_in_kv_routes = false; if let Some(one_kv_nodes_routes) = view.master_kv_router().inner().kv_routes.get(key) { one_kv_nodes_routes.clean_up_tomb_nodes_replicas(put_id, tombs, view); - if one_kv_nodes_routes.nodes_replicas.read().is_empty() { + if !one_kv_nodes_routes.has_live_replica() { remove_in_kv_routes = true; } } @@ -113,6 +114,67 @@ pub async fn handle_get_start( }, ) } + fn allocate_get_buffer_on_node( + view: &MasterKvRouterView, + node_id: &NodeID, + len: u64, + get_id: u64, + purpose: &str, + ) -> Result, msg_and_error::KvError> { + let node_allocators = 
view.master_seg_manager().get_node_allocators(node_id); + if node_allocators.is_empty() { + tracing::info!( + "No allocators found for {} during get: {}, node is not ready", + purpose, + node_id + ); + return Err(msg_and_error::KvError::Unreachable( + msg_and_error::UnreachableError::OwnerNoSeg { detail: "config=0 initializes as external; non-zero initializes as owner; the owner must have memory space (segment)".to_string() } + )); + } + + let allocator = node_allocators.choose(&mut rand::thread_rng()).unwrap(); + let mut allocated_addr: Option = None; + for attempt in 1..=3 { + if let Ok(allocation) = allocator.allocate(len) { + allocated_addr = Some(allocation); + break; + } else { + tracing::info!( + "{} allocation attempt {}/3 failed for get_id {} on node {}", + purpose, + attempt, + get_id, + node_id + ); + } + } + if let Some(allocation) = allocated_addr { + return Ok(Arc::new(allocation)); + } + + let total = allocator.total_size_bytes(); + let used = allocator.used_size_bytes(); + let free = total.saturating_sub(used); + Err(msg_and_error::KvError::Api( + msg_and_error::ApiError::NoSpace { + node: node_id.as_ref().to_string(), + segment: allocator.seg_device_id.clone(), + total_capacity: total, + free_capacity: free, + }, + )) + } + fn align_ssd_stage_addr(raw_addr: u64) -> Result { + raw_addr + .checked_add(SSD_ALIGNMENT as u64 - 1) + .map(|addr| addr / SSD_ALIGNMENT as u64 * SSD_ALIGNMENT as u64) + .ok_or_else(|| { + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!("ssd source staging address alignment overflow: {raw_addr}"), + }) + }) + } tracing::debug!("Handling GetStartReq: {:?}", req.serialize_part); @@ -253,11 +315,13 @@ pub async fn handle_get_start( put_id: one_kv_nodes_routes.put_id, get_id, node_id: resp_node_id.clone().into(), + source_kind: GetSourceKind::Memory, src_addr: resp_src_addr, target_addr: resp_target_addr, src_base_addr: resp_src_base, target_base_addr: resp_target_base, len: 
src_allocation.size(), + ssd_stage_len: 0, error_code: msg_and_error::OK, error_json: String::new(), server_process_us: 0, @@ -270,8 +334,10 @@ pub async fn handle_get_start( req_node_id, len: src_allocation.size(), allocation: target_allocation, // 存储target allocation + source_allocation: None, route: one_kv_nodes_routes.clone(), allocation_mode, + source_kind: GetSourceKind::Memory, }; view.master_kv_router() @@ -308,6 +374,167 @@ pub async fn handle_get_start( }, ); } + + let ssd_replicas = one_kv_nodes_routes.ssd_replicas.read().clone(); + let mut ssd_replica_keys = ssd_replicas.keys().collect::>(); + while !ssd_replica_keys.is_empty() { + let to_remove_idx = rand::thread_rng().gen_range(0..ssd_replica_keys.len()); + let selected_ssd_key = ssd_replica_keys.remove(to_remove_idx); + let ssd_replica = ssd_replicas + .get(&*selected_ssd_key) + .expect("selected SSD replica key must exist"); + if ssd_replica.tomb_tag.is_tomb() { + tombs.insert(selected_ssd_key.to_owned()); + } else { + let ssd_stage_len = match align_ssd_io_len(ssd_replica.len) { + Ok(len) => len, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_alloc_len = match ssd_stage_len.checked_add(SSD_ALIGNMENT as u64 - 1) { + Some(len) => len, + None => { + let err = + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!( + "ssd source staging allocation length overflow: {ssd_stage_len}" + ), + }); + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_allocation = match allocate_get_buffer_on_node( + &view, + &ssd_replica.node_id, + source_alloc_len, + get_id, + "ssd source staging", + ) { + Ok(allocation) => allocation, + Err(err) => { + tracing::info!( + "Skipping SSD source for get_id {} on node {}: {}", + get_id, + ssd_replica.node_id, + err + ); + continue; + } + }; + let 
target_allocation = match allocate_get_buffer_on_node( + &view, + &req_node_id, + ssd_replica.len, + get_id, + "requesting target", + ) { + Ok(allocation) => allocation, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_base = source_allocation.base_addr(); + let source_raw_addr = match source_base.checked_add(source_allocation.addr()) { + Some(addr) => addr, + None => { + let err = + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!( + "ssd source staging raw address overflow: base={} offset={}", + source_base, + source_allocation.addr() + ), + }); + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_addr = match align_ssd_stage_addr(source_raw_addr) { + Ok(addr) => addr, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let target_base = target_allocation.base_addr(); + let target_addr = target_base + target_allocation.addr(); + let allocation_mode = if one_kv_nodes_routes.try_reserve_get_durable_slot() { + GetAllocationMode::DurableReplica + } else { + GetAllocationMode::Temporary + }; + let resp = GetStartResp { + put_id: one_kv_nodes_routes.put_id, + get_id, + node_id: ssd_replica.node_id.clone().into(), + source_kind: GetSourceKind::Ssd, + src_addr: source_addr, + target_addr, + src_base_addr: source_base, + target_base_addr: target_base, + len: ssd_replica.len, + ssd_stage_len, + error_code: msg_and_error::OK, + error_json: String::new(), + server_process_us: 0, + }; + let info = InflightGetInfo { + put_id: one_kv_nodes_routes.put_id, + src_node_id: ssd_replica.node_id.clone(), + key: req.serialize_part.key.clone(), + req_node_id, + len: ssd_replica.len, + allocation: target_allocation, + source_allocation: Some(source_allocation), +
route: one_kv_nodes_routes.clone(), + allocation_mode, + source_kind: GetSourceKind::Ssd, + }; + + view.master_kv_router() + .inner() + .inflight_gets + .insert(get_id, info) + .await; + + clean_up_tombs( + &view, + Some((tombs, one_kv_nodes_routes.put_id)), + &req.serialize_part.key, + ); + return ( + get_id, + MsgPack { + serialize_part: resp, + raw_bytes: Vec::new(), + }, + ); + } + } tracing::info!("Key not found: {}", req.serialize_part.key); { let err = msg_and_error::KvError::Api(msg_and_error::ApiError::KeyNotFound { @@ -322,6 +549,64 @@ pub async fn handle_get_start( } } +fn drop_failed_ssd_source(view: &MasterKvRouterView, inflight_info: &InflightGetInfo) { + if inflight_info.source_kind != GetSourceKind::Ssd { + tracing::warn!( + "Ignoring drop_ssd_source for non-SSD get: get_key={} put_id=({},{}) source_kind={:?}", + inflight_info.key, + inflight_info.put_id.0, + inflight_info.put_id.1, + inflight_info.source_kind + ); + return; + } + + let route = inflight_info.route.clone(); + if route.put_id != inflight_info.put_id { + return; + } + + let removed = route + .ssd_replicas + .write() + .remove(&inflight_info.src_node_id) + .is_some(); + if !removed { + return; + } + + tracing::warn!( + "Removed failed SSD replica: key={} node={} put_id=({},{})", + inflight_info.key, + inflight_info.src_node_id, + inflight_info.put_id.0, + inflight_info.put_id.1 + ); + + if route.has_live_replica() { + return; + } + + let route_for_compare = route.clone(); + let removed_route = view + .master_kv_router() + .inner() + .kv_routes + .remove_if(&inflight_info.key, |_, current| { + Arc::ptr_eq(current, &route_for_compare) && current.put_id == inflight_info.put_id + }) + .is_some(); + if removed_route && view.master_kv_router().prefix_index_enabled() { + let view_task = view.clone(); + let key_for_prefix = inflight_info.key.clone(); + let _ = view.spawn("ssd_failure_remove_prefix_index", async move { + let inner = view_task.master_kv_router().inner(); + let mut tree = 
inner.prefix_index.write().await; + tree.remove(&key_for_prefix); + }); + } +} + pub async fn handle_get_revoke( view: MasterKvRouterView, req: MsgPack, @@ -338,6 +623,9 @@ pub async fn handle_get_revoke( .remove(&get_id) .await { + if req.serialize_part.drop_ssd_source { + drop_failed_ssd_source(&view, &inflight_info); + } inflight_info.release_durable_slot_if_needed(); tracing::info!("Revoked get operation with get_id: {}", get_id); } else { @@ -381,7 +669,6 @@ pub async fn handle_get_done( .next_holder_id .fetch_add(1, Ordering::Relaxed); - let src_node_id = inflight_info.src_node_id; let key = inflight_info.key; // Create holding info @@ -404,7 +691,7 @@ pub async fn handle_get_done( if one_kv_nodes_routes.put_id == inflight_info.put_id { let mut nodes_replicas = one_kv_nodes_routes.nodes_replicas.write(); if let Some(tomb_tag) = - view.master_seg_manager().get_node_tomb_tag(&src_node_id) + view.master_seg_manager().get_node_tomb_tag(&req_node_id) { if !tomb_tag.is_tomb() { nodes_replicas.insert( @@ -632,6 +919,21 @@ pub async fn handle_get_meta( raw_bytes: Vec::new(), }; } + let ssd_replicas = (*one_kv_nodes_routes.ssd_replicas.read()).clone(); + for (_, kv_info) in ssd_replicas.iter() { + if kv_info.tomb_tag.is_tomb() { + continue; + } + return MsgPack { + serialize_part: GetMetaResp { + exists: true, + len: kv_info.len, + error_code: msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + }; + } // if let Some((_, kv_info)) = replicas.iter().next() { // let len = kv_info.allocation.size(); diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs index ee4ca2b..afbfc41 100644 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs @@ -15,13 +15,14 @@ use self::{ msg_pack::{ BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, CountPrefixReq, CountPrefixResp, DeleteAckReq, DeleteReq, GetAllocationMode, GetDoneReq, GetMetaReq, 
GetRevokeReq, - GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, + GetSourceKind, GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, SsdReplicaCommitReq, }, placement::{PlacementDefault, PlacementPolicy}, - put::{handle_put_done, handle_put_revoke, handle_put_start}, + put::{handle_put_done, handle_put_revoke, handle_put_start, handle_ssd_replica_commit}, }; use crate::ClientKvApiAccessTrait; use crate::client_kv_api::ClientKvApi; +use crate::client_kv_api::msg_pack::SsdReplicaPersistReq; use crate::cluster_manager::{ ClusterEvent, ClusterManager, ClusterManagerAccessTrait, NodeID, NodeIDString, }; @@ -116,8 +117,10 @@ pub struct InflightGetInfo { pub req_node_id: NodeID, pub len: u64, pub allocation: Arc, + pub source_allocation: Option>, pub route: Arc, pub allocation_mode: GetAllocationMode, + pub source_kind: GetSourceKind, } impl InflightGetInfo { @@ -201,6 +204,13 @@ pub struct KvRouteInfo { pub tomb_tag: NodeTombTag, } +#[derive(Clone, Debug)] +pub struct KvSsdRouteInfo { + pub node_id: NodeID, + pub len: u64, + pub tomb_tag: NodeTombTag, +} + #[derive(Debug)] pub struct OneKvNodesRoutes { /// the version id for a kv put operation @@ -230,6 +240,8 @@ pub struct OneKvNodesRoutes { /// node_id -> KvRouteInfo pub nodes_replicas: RwLock>, + /// node_id -> SSD replica metadata for the same key-version. 
+ pub ssd_replicas: RwLock>, pub get_durable_slots_used: AtomicU32, } @@ -247,9 +259,16 @@ impl OneKvNodesRoutes { let mut nodes_replicas = self.nodes_replicas.write(); nodes_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id)); + let mut ssd_replicas = self.ssd_replicas.write(); + ssd_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id)); + return true; } + fn has_live_replica(&self) -> bool { + !self.nodes_replicas.read().is_empty() || !self.ssd_replicas.read().is_empty() + } + fn try_reserve_get_durable_slot(&self) -> bool { self.get_durable_slots_used .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { @@ -283,6 +302,7 @@ mod tests { put_id: (1, 0), lease_id: None, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }; @@ -607,6 +627,7 @@ impl MasterKvRouter { fn register_rpc_callers(&self) { RPCCaller::::new().regist(self.0.view().p2p_module()); + RPCCaller::::new().regist(self.0.view().p2p_module()); } fn register_rpc_handlers(&self) { @@ -766,6 +787,22 @@ impl MasterKvRouter { Ok(()) }); + let view = self.0.view().clone(); + RPCHandler::::new().regist(p2p, move |resp, msg| { + let view = view.clone(); + let view2 = view.clone(); + let view_task = view2.clone(); + let _ = view.spawn("rpc_ssd_replica_commit", async move { + let t0 = Utc::now().timestamp_micros(); + let mut ack = handle_ssd_replica_commit(view_task, msg).await; + ack.serialize_part.server_process_us = Utc::now().timestamp_micros() - t0; + if let Err(e) = resp.send_resp(ack).await { + error!("Failed to send SsdReplicaCommitResp: {:?}", e); + } + }); + Ok(()) + }); + // --- MemHolder Handlers --- // let view = inner.view.clone(); // RPCHandler::::new().regist(p2p, move |resp, msg| { diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs index 9d5eb1d..bdd85b6 100755 --- 
a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs @@ -18,6 +18,13 @@ pub enum GetAllocationMode { DurableReplica = 2, } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Encode, Decode)] +pub enum GetSourceKind { + #[default] + Memory = 0, + Ssd = 1, +} + #[derive(Default, Debug, Clone, Encode, Decode)] pub struct GetStartReq { pub key: String, @@ -32,6 +39,7 @@ pub struct GetStartResp { pub get_id: u64, pub node_id: NodeIDString, pub put_id: PutIDForAKey, + pub source_kind: GetSourceKind, // absolute addresses because Mooncake transfer engine requires absolute addresses (not offsets) pub target_addr: u64, pub src_addr: u64, @@ -39,6 +47,8 @@ pub struct GetStartResp { pub target_base_addr: u64, pub src_base_addr: u64, pub len: u64, + /// SSD source staging bytes available at src_addr. Zero for memory sources. + pub ssd_stage_len: u64, pub error_code: ErrorCode, pub error_json: String, /// Server-side processing time in microseconds for this RPC handler @@ -56,6 +66,8 @@ impl RPCReq for GetStartReq { #[derive(Default, Debug, Clone, Encode, Decode)] pub struct GetRevokeReq { pub get_id: u64, + /// True only when an SSD stage failed and the source must be removed from routing. 
+ pub drop_ssd_source: bool, } impl MsgPackSerializePart for GetRevokeReq { fn msg_id(&self) -> u32 { @@ -250,6 +262,34 @@ impl RPCReq for PutDoneReq { type Resp = PutDoneResp; } +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} +impl MsgPackSerializePart for SsdReplicaCommitReq { + fn msg_id(&self) -> u32 { + MsgId::SsdReplicaCommitReq as u32 + } +} +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaCommitResp { + pub error_code: ErrorCode, + pub error_json: String, + /// Server-side processing time in microseconds for this RPC handler + pub server_process_us: i64, +} +impl MsgPackSerializePart for SsdReplicaCommitResp { + fn msg_id(&self) -> u32 { + MsgId::SsdReplicaCommitResp as u32 + } +} +impl RPCReq for SsdReplicaCommitReq { + type Resp = SsdReplicaCommitResp; +} + // --- RPC for MemHolder KeepAlive --- #[derive(Default, Debug, Clone, Encode, Decode)] diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs index 70d8858..06e41cc 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs @@ -1,15 +1,19 @@ -use super::NodeValueReplicaDesc; use super::{ InflightPutAllocation, InflightPutInfo, KvRouteInfo, MasterKvRouterView, PutPlacementMode, - msg_pack::{PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp}, + msg_pack::{ + PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp, + SsdReplicaCommitReq, SsdReplicaCommitResp, + }, placement::PutPlacementTarget, }; +use super::{KvSsdRouteInfo, NodeValueReplicaDesc}; +use crate::client_kv_api::msg_pack::SsdReplicaPersistReq; use crate::master_kv_router::OneKvNodesRoutes; use crate::master_kv_router::delete::DeleteKeyInfo; use crate::{ cluster_manager::{META_KEY_LOCAL_IPC_ROOT, NodeID}, 
master_seg_manager::one_seg_allocator::Allocation, - p2p::msg_pack::MsgPack, + p2p::msg_pack::{MsgPack, RPCCaller}, rpcresp_kvresult_convert::msg_and_error, }; use fluxon_commu::{META_KEY_SHARED_STORAGE_NODE_ID, META_KEY_SHARED_STORAGE_NODE_START_TIME}; @@ -19,6 +23,7 @@ use rand::seq::SliceRandom; use std::{ collections::HashMap, sync::{Arc, atomic::AtomicU32}, + time::Duration, }; pub type PutIDForAKey = (u64, u32); @@ -474,6 +479,171 @@ pub async fn handle_put_revoke( } } +fn spawn_ssd_replica_persist_request( + view: &MasterKvRouterView, + key: String, + put_id: PutIDForAKey, + node_id: NodeID, + len: u64, + allocation: Arc, +) { + let target_addr = allocation.base_addr() + allocation.addr(); + let view = view.clone(); + let view_task = view.clone(); + let _ = view.spawn("post_put_ssd_replica_persist", async move { + let _allocation_guard = allocation; + let req = MsgPack { + serialize_part: SsdReplicaPersistReq { + key: key.clone(), + put_id, + target_addr, + len, + }, + raw_bytes: Vec::new(), + }; + let resp = RPCCaller::::new() + .call( + view_task.p2p_module(), + node_id.clone(), + req, + Some(Duration::from_secs(60)), + 2, + ) + .await; + match resp { + Ok(resp) => { + if let Err(err) = crate::rpcresp_kvresult_convert::try_from_code( + resp.serialize_part.error_code, + resp.serialize_part.error_json, + ) { + tracing::warn!( + "SSD replica persist failed: key={} put_id=({},{}) node={} err={}", + key, + put_id.0, + put_id.1, + node_id, + err + ); + } else if resp.serialize_part.persisted { + tracing::debug!( + "SSD replica persist completed: key={} put_id=({},{}) node={}", + key, + put_id.0, + put_id.1, + node_id + ); + } else { + tracing::debug!( + "SSD replica persist skipped because owner has no SSD store: key={} put_id=({},{}) node={}", + key, + put_id.0, + put_id.1, + node_id + ); + } + } + Err(err) => { + tracing::warn!( + "SSD replica persist RPC failed: key={} put_id=({},{}) node={} err={:?}", + key, + put_id.0, + put_id.1, + node_id, + err + ); + } 
+ } + }); +} + +fn ok_ssd_replica_commit_resp() -> MsgPack { + MsgPack { + serialize_part: SsdReplicaCommitResp { + error_code: msg_and_error::OK, + error_json: String::new(), + server_process_us: 0, + }, + raw_bytes: Vec::new(), + } +} + +pub async fn handle_ssd_replica_commit( + view: MasterKvRouterView, + req: MsgPack, +) -> MsgPack { + let req = req.serialize_part; + let node_id: NodeID = req.node_id.clone().into(); + let Some(route_ref) = view.master_kv_router().inner().kv_routes.get(&req.key) else { + tracing::debug!( + "Ignoring SSD replica commit for missing key: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + }; + let route = route_ref.value().clone(); + drop(route_ref); + + if route.put_id != req.put_id { + tracing::debug!( + "Ignoring stale SSD replica commit: key={} req_put_id=({},{}) current_put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + route.put_id.0, + route.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + } + + let tomb_tag = { + let replicas = route.nodes_replicas.read(); + let Some(memory_replica) = replicas.get(&node_id) else { + tracing::debug!( + "Ignoring SSD replica commit without matching memory replica: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + }; + memory_replica.tomb_tag.clone() + }; + + if tomb_tag.is_tomb() { + tracing::debug!( + "Ignoring SSD replica commit for tombed node: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + } + + route.ssd_replicas.write().insert( + node_id.clone(), + KvSsdRouteInfo { + node_id, + len: req.len, + tomb_tag, + }, + ); + tracing::debug!( + "Committed SSD replica route: key={} put_id=({},{}) node={} len={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id, + req.len + ); + 
ok_ssd_replica_commit_resp() +} + pub async fn handle_put_done( view: MasterKvRouterView, req: MsgPack, @@ -488,6 +658,7 @@ pub async fn handle_put_done( if let Some(InflightPutInfo { node_id, key, + len, src_target_allocation, .. }) = view @@ -631,8 +802,9 @@ pub async fn handle_put_done( let completed_info = KvRouteInfo { node_id: node_id.clone(), allocation: Arc::new(target_allocation), - tomb_tag, + tomb_tag: tomb_tag.clone(), }; + let target_allocation_for_ssd = Arc::clone(&completed_info.allocation); // Insert into kv_routes with replica support let mut old_one_kv_routes: Option> = None; @@ -649,6 +821,7 @@ pub async fn handle_put_done( put_id, lease_id: lease_id_opt, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }) }); @@ -659,6 +832,7 @@ pub async fn handle_put_done( put_id, lease_id: lease_id_opt, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }); } @@ -668,6 +842,15 @@ pub async fn handle_put_done( .insert(node_id.clone(), completed_info); } + spawn_ssd_replica_persist_request( + &view, + key.clone(), + put_id, + node_id.clone(), + len, + target_allocation_for_ssd, + ); + if let Some(old) = old_one_kv_routes { if let Err(err) = view .master_kv_router() diff --git a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs index 5c20cc1..5d344c9 100755 --- a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs +++ b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs @@ -22,7 +22,8 @@ async fn test1_lease_expire_removes_keys() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t1", "lease_client_t1").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t1", "lease_client_t1").await; 
let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -82,7 +83,8 @@ async fn test2_rebind_to_new_lease_preserves_until_new_expire() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t2", "lease_client_t2").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t2", "lease_client_t2").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -161,7 +163,8 @@ async fn test3_keepalive() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t3", "lease_client_t3").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t3", "lease_client_t3").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -236,7 +239,8 @@ async fn test4_delete_under_lease_then_get_fails() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t4", "lease_client_t4").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t4", "lease_client_t4").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; diff --git a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs index 692a9a0..cfd6d55 100644 --- a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs +++ b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs @@ -101,6 +101,7 @@ fn new_client_config_with_size( large_file_paths: crate::config::LargeFilePaths { paths: vec![format!("/tmp/kvcache_large/{}", instance_key)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } @@ -134,6 +135,7 @@ fn new_zero_contribution_client_config( }, share_mem_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key), large_file_paths: 
crate::config::LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs index 42a9cbc..def8b1c 100644 --- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs +++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs @@ -35,6 +35,8 @@ pub enum MsgId { DeleteAckResp = 3024, BatchDeleteAckReq = 3029, BatchDeleteAckResp = 3030, + SsdReplicaCommitReq = 3031, + SsdReplicaCommitResp = 3032, GetMetaReq = 3019, GetMetaResp = 3020, BatchDeleteClientKvMetaCacheReq = 3021, diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs index b6eb7d6..a5a18b4 100755 --- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs +++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs @@ -3,11 +3,12 @@ use super::msg_and_error::{ErrorCode, KvError, KvResult}; use crate::client_kv_api::msg_pack::{ ExternalDeleteAckResp, ExternalDeleteResp, ExternalGetResp, ExternalIsExistResp, ExternalPutCommitResp, ExternalPutRevokeResp, ExternalPutStartResp, ExternalPutTransferEndResp, + SsdReplicaPersistResp, SsdStageReadResp, }; use crate::master_kv_router::msg_pack::{ BatchDeleteAckResp, BatchDeleteClientKvMetaCacheResp, DeleteAckResp, DeleteResp, GetDoneResp, GetMasterOnlyMetricPartResp, GetMetaResp, GetRevokeResp, GetStartResp, MemHolderKeepAliveResp, - MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, + MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, SsdReplicaCommitResp, }; use crate::master_seg_manager::msg_pack::RequestSegmentRegistrationResp; use crate::memholder::ExternalMemHolderInfo; @@ -232,6 +233,26 @@ impl FromError for ExternalDeleteAckResp { } } } +impl FromError for 
SsdStageReadResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} +impl FromError for SsdReplicaPersistResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} // ---- FromError for Master KV Router Resps ---- impl FromError for GetStartResp { @@ -294,6 +315,16 @@ impl FromError for PutDoneResp { } } } +impl FromError for SsdReplicaCommitResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} impl FromError for MemHolderKeepAliveResp { fn from_error(e: &KvError) -> Self { let code = e.code(); From dec00166a896683955dc0bd381d2dd7969fa78c9 Mon Sep 17 00:00:00 2001 From: zTz01 <1773266173@qq.com> Date: Fri, 3 Jul 2026 20:18:44 +0800 Subject: [PATCH 2/4] docs: update SSD KV storage design --- ...30\345\202\250\350\256\276\350\256\241.md" | 53 ++++++++----------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" index d0da8a6..f4da5ff 100644 --- "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" +++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" @@ -1,12 +1,10 @@ # KV 设计 5 - SSD 存储 -## 稳定结论 +## 设计目标 -当前 KV 的 SSD 存储应当是分布式 owner SSD 副本层。每个 owner 可以持有本地 SSD backing tier,master 在同一条 key-version 路由里分开记录内存 owner 副本和 SSD owner 副本;它和 CPU segment 内存副本共享 owner placement、allocation、transfer engine 和 `MemHolder` 生命周期。SSD 只承担可回填的数据源,不暴露第二套用户读写 API。 +SSD 存储在 Fluxon KV 中作为 owner 本地 backing tier 接入通用 KV 链路。它不是一套独立的读写 API,也不改变用户侧 `put/get/delete` 语义;master 仍然以 key-version 为单位维护路由,内存副本是第一数据源,SSD 副本是内存副本不可用时的回填数据源。 -读路径按内存优先。master 
找不到可用内存副本时,可以选择任意 owner 上的 SSD 副本;SSD owner 把磁盘数据按 chunk 读入自己节点上的 CPU staging allocation,每个 chunk ready 后立即由 SSD owner 侧复用现有 transfer engine push 到请求方 target allocation 对应 offset。SSD source 路径由 SSD owner 在所有 chunk transfer 完成后直接向 master 发送 `GetDoneReq`,再把 holder 结果放进 `SsdStageReadResp` 回给请求方;请求方不再从 SSD owner staging 发起第二段 transfer,也不再在 SSD source 路径上单独发送 `GetDoneReq`。 - -IO 层吸收 Pegaflow SSD cache 的核心做法:分片环形文件、`O_DIRECT` 对齐 buffer、`io_uring` 后台读写线程、有界读写队列、`Writing/Committed` 两阶段索引,以及 ring tail 推进时的主动失效。进一步对照 3FS 和 foyer 后,当前实现把底层 uring 调度改成 read/write 独立队列,并在同一 shard 内按 inflight 比例优先补读队列,避免 kvcache 回填读被持续写入压住;同时按 3FS 的位置生命周期约束保护正在读或正在写的 ring 位置,tail 推进不能覆盖 active IO。对大 payload 高带宽场景,aligned SSD stage 可以直接 readv 到 source staging allocation,跳过中间 aligned buffer 到 staging 的额外内存拷贝;SSD read 和 transfer 之间使用 producer/consumer pipeline,chunk read 完成即可发起对应 chunk transfer。owner 启动时从 `large_file_paths` 派生 SSD root,先按 `metadata.dev()` 去重,再为每个有效 device 建独立 writer/reader queue 和 `UringIoEngine`;shard 只在所属 device worker 的 shard 集内分配。写路径必须把内存提交和 SSD 提交拆开:内存 `PutDone` 先让 key-version 可读,SSD 写入完成后再通过独立 commit 把同一版本加入 `ssd_replicas`。 +读取侧采用“内存优先、SSD 回填”的设计。`GetStart` 优先选择 live 内存副本;没有可用内存副本时,master 才选择 SSD owner,并分配 SSD owner 本机 source staging 和 requester target。SSD owner 从本地 SSD 读入 source staging,再复用现有 transfer engine 把数据推到 requester target,最后继续使用原有 `GetDone` 和 `MemHolder` 生命周期。 ## 公共契约 @@ -34,8 +32,8 @@ fluxonkv_spec: | 分布式 SSD 读取 | 已接入。`GetStart` 可以返回任意 SSD owner,source staging allocation 位于 SSD owner,target allocation 位于请求方 owner。 | | owner 内部多 SSD 路径 | 已接入。SSD root 来自 `large_file_paths`,先按 device 去重;每个有效 device 有独立 writer/reader queue、uring engine 和 shard 集,单 owner 可以利用多块真实本地 SSD。 | | 内存 KV 复用 | 已复用。SSD 回填由 SSD owner 侧调用 `transfer_data_no_copy` 按 chunk push 到 requester target,全部 chunk transfer 完成后由 SSD owner 调 master `get_done`;requester 只复用返回的 holder 结果构造 `MemHolder`。 | -| Pegaflow IO 模型 | 已接入核心形态:分片 ring、`O_DIRECT`、`io_uring`、有界队列、两阶段提交、tail 失效;写路径已经把内存 `PutDone` 和 SSD commit 拆开。 
| -| 3FS 位置生命周期 | 已接入到 SSD ring。读 IO 提交前 pin committed entry;未完成的 `Writing` entry 和 pinned read entry 都会阻止物理位置复用。 | +| SSD 写入 IO 模型 | 已接入。分片 ring、`O_DIRECT`、`io_uring`、有界队列、两阶段提交和 tail 失效都在 owner 本地 `KvSsdStorage` 内完成;写路径已经把内存 `PutDone` 和 SSD commit 拆开。 | +| ring 位置生命周期 | 已接入。读 IO 提交前 pin committed entry;未完成的 `Writing` entry 和 pinned read entry 都会阻止物理位置复用。 | | 大 payload direct stage | 已接入 aligned fast path 和 chunk pipeline。master 给 SSD source staging 多分配最多 511 bytes,并在 allocation 内返回 512-byte 对齐后的 `src_addr`;SSD read 按 chunk 对齐 IO 长度直接写入 staging,chunk ready 后立刻 transfer,`MemHolder` 仍只使用真实 payload 长度。 | | 冷启动恢复 | 当前不扫描 SSD shard 重建 master 路由;路由仍由本轮运行时的 `put/get/delete` 生命周期产生。 | | lease key 专门治理 | 当前没有单独的 lease SSD 生命周期策略;lease 与普通 key 共用 key-version 路由约束。 | @@ -49,9 +47,11 @@ flowchart TD B --> G["owner -> master PutDone(memory_ready)"] G --> H["master route: nodes_replicas"] B --> C["async KvSsdStorage.persist_from_addr(key, put_id, addr, len)"] - C --> D["SSD writer queue"] - D --> E["io_uring writev to sharded O_DIRECT ring"] - E --> F["commit index: Writing -> Committed"] + C --> D["copy payload to 512-byte aligned buffer"] + D --> E["per-device writer queue"] + E --> E2["shard ring entry: Writing"] + E2 --> E3["O_DIRECT + io_uring writev"] + E3 --> F["commit index: Writing -> Committed"] F --> I["owner -> master SsdReplicaCommit"] I --> J["master route: ssd_replicas"] @@ -63,7 +63,8 @@ flowchart TD N -->|yes| O["allocate source staging on SSD owner"] O --> P["allocate target on requester"] P --> Q["return GetSourceKind::Ssd"] - Q --> R["SSD owner chunk readv into source staging"] + Q --> R0["pin committed SSD entry"] + R0 --> R["chunk read: direct staging or scratch fallback"] R --> S["SsdLoadedChunk(offset,len)"] S --> W["SSD owner transfer chunk: staging+offset -> requester target+offset"] W --> T["all chunks done: SSD owner -> master GetDoneReq"] @@ -92,7 +93,7 @@ sequenceDiagram M-->>C: PutDoneResp M->>C: async SsdReplicaPersistReq(key, put_id, 
target_addr, len) C->>SSD: KvSsdStorage.persist_from_addr(...) - Note over SSD: writer queue -> io_uring writev -> Writing/Committed + Note over SSD: aligned buffer -> per-device writer queue\nshard ring Writing -> O_DIRECT + io_uring writev -> Committed C->>M: SsdReplicaCommitReq(key, put_id, node_id, len) Note right of M: ssd_replicas 写入 SSD 副本 @@ -104,6 +105,7 @@ sequenceDiagram M-->>C: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) C->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + Note over SSD: pin committed entry\nchunk read direct to source staging or scratch fallback loop each ready chunk SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) @@ -259,7 +261,7 @@ struct KvSsdStorageInner { owner 如果是最终 target,先完成原有 transfer 和 `PutDoneReq`,让内存副本进入 `nodes_replicas`。master 随后在后台 task 里把 `SsdReplicaPersistReq { key, put_id, target_addr, len }` 发回 target owner,并持有 target allocation 的 `Arc`,保证 owner 从内存复制到 SSD 期间 payload 不会被释放或复用。 -owner 的 `rpc_ssd_replica_persist` handler 收到请求后,从 target allocation 的绝对地址调用 `persist_local_kv_to_ssd(...)`,进入 `KvSsdStorage::persist_from_addr(key, put_id, addr, len)`。`persist_from_addr` 把真实 payload 拷到 512-byte 对齐的 `AlignedBuffer`,`persist_buffer` 通过 `next_write_device` round-robin 选择一个 `SsdDeviceWorker.write_tx` 并等待后台 writer 完成。每个 `ssd_writer_loop` 只拿自己的 `shard_ids` 调 `SsdRingBuffer::prepare_write_on_shards(...)`,在 `ring.entries` 中建立 `Writing(SsdIndexEntry)`;对应 device 的 `UringIoEngine` 对该 shard 文件执行 `writev`,成功后提交为 `Committed(SsdIndexEntry)`。这之后 owner 发送 `SsdReplicaCommitReq` 给 master,补交 SSD 副本。 +owner 的 `rpc_ssd_replica_persist` handler 收到请求后,从 target allocation 的绝对地址调用 `persist_local_kv_to_ssd(...)`,进入 `KvSsdStorage::persist_from_addr(key, put_id, addr, len)`。`persist_from_addr` 把真实 payload 
拷到 512-byte 对齐的 `AlignedBuffer`,`persist_buffer` 通过 `next_write_device` round-robin 选择一个 `SsdDeviceWorker.write_tx` 并等待后台 writer 完成。每个 `ssd_writer_loop` 只拿自己的 `shard_ids` 调 `SsdRingBuffer::prepare_write_on_shards(...)`,在 `ring.entries` 中建立 `Writing(SsdIndexEntry)`;对应 device 的 `UringIoEngine` 对该 shard 文件执行 `O_DIRECT + writev`,成功后提交为 `Committed(SsdIndexEntry)`。这之后 owner 发送 `SsdReplicaCommitReq` 给 master,补交 SSD 副本。写队列和底层 uring 队列都是有界队列;当 SSD 写入慢于提交速度时,背压停在 owner 本地 SSD persist 路径,不改变已经完成的内存 `PutDone` 语义。 #### external @@ -410,7 +412,7 @@ pub struct SsdStageReadResp { requester owner 收到 `GetSourceKind::Memory` 后走原有 transfer 分支,然后自己发送 `GetDoneReq`。requester owner 收到 `GetSourceKind::Ssd` 后调用 `stage_kv_from_ssd_source(...)`,该函数返回 `GetDoneResp` 对应字段;requester 跳过自己的 transfer,也跳过自己的 `get_done`,直接用返回的 done 结果构造 holder。 -SSD owner 的 `rpc_ssd_stage_read` task 调用 `load_and_push_kv_from_ssd(...)`。这个函数内部把 `KvSsdStorage::load_into_addr_chunks(...)` 作为生产者,把 `transfer_loaded_ssd_chunks(...)` 作为消费者:生产者 pin 当前 committed entry,按 chunk 把磁盘数据读入 master 分配的 `stage_addr + offset`;消费者每收到一个 `SsdLoadedChunk`,立即用 `transfer_data_no_copy(peer=target_node_id, peer_src_or_target=false, stage_addr + offset, target_addr + offset, chunk_len, None)` push 到 requester target。所有 chunk transfer 成功后,SSD owner 用 `SsdStageReadReq.get_id` 调 master `GetDoneReq`,并把 `GetDoneResp` 拆成 `SsdStageReadResp.done_*` 字段返回给 requester。 +SSD owner 的 `rpc_ssd_stage_read` task 调用 `load_and_push_kv_from_ssd(...)`。这个函数内部把 `KvSsdStorage::load_into_addr_chunks(...)` 作为生产者,把 `transfer_loaded_ssd_chunks(...)` 作为消费者:生产者 pin 当前 committed entry,按 chunk 把磁盘数据读入 master 分配的 `stage_addr + offset`;消费者每收到一个 `SsdLoadedChunk`,立即用 `transfer_data_no_copy(peer=target_node_id, peer_src_or_target=false, stage_addr + offset, target_addr + offset, chunk_len, None)` push 到 requester target。所有 chunk transfer 成功后,SSD owner 用 `SsdStageReadReq.get_id` 调 master `GetDoneReq`,并把 `GetDoneResp` 拆成 `SsdStageReadResp.done_*` 字段返回给 requester。读路径进入 per-device 
reader queue,底层 `UringIoEngine` 把 read/write 分成独立发送队列,并按 inflight 比例补读,避免回填读长期排在持续写入之后。 ```rust struct SsdRingBuffer { @@ -425,7 +427,7 @@ enum SsdEntryState { } ``` -`read_pins` 是 owner 本地 SSD ring 的生命周期保护,防止 writer 推进 tail 时覆盖 active read。chunk pipeline 在整个 producer 生命周期内持有同一个 read pin;每个 chunk 单独提交 read task。direct read 条件满足时,`readv` 直接写到 `SsdStageReadReq.stage_addr + offset`;否则先读 scratch aligned buffer,再复制当前 chunk 的真实 payload 长度到 staging。请求方 target 是否远端不影响 SSD direct read 的对齐判断。 +`read_pins` 是 owner 本地 SSD ring 的生命周期保护,防止 writer 推进 tail 时覆盖 active read。chunk pipeline 在整个 producer 生命周期内持有同一个 read pin;每个 chunk 单独提交 read task。direct read 条件满足时,`readv` 直接写到 `SsdStageReadReq.stage_addr + offset`;否则先读 scratch aligned buffer,再复制当前 chunk 的真实 payload 长度到 staging。direct read 省掉的是 scratch buffer 到 source staging 的本机 memcpy,不省掉 `source staging -> requester target` 的 transfer。请求方 target 是否远端不影响 SSD direct read 的对齐判断。 #### external @@ -758,11 +760,11 @@ flowchart TD | 组件 | 设计 | | --- | --- | | device root | owner 从 `large_file_paths` 派生 SSD root,创建目录后读取 `metadata.dev()`;同一 device 只保留第一个 root。 | -| shard 文件 | 每个 owner 将 `max_bytes` 切成少量 shard,文件位于有效 device root 的 `shards/` 下,`shard_to_device` 记录 shard 到 device worker 的映射。 | -| 对齐 | 数据写入前复制到 512-byte 对齐 buffer,实际 IO 长度按 512-byte 向上对齐。 | +| shard 文件 | `max_bytes` 是 owner 本地 SSD cache 的容量上限;owner 将它拆成多个本地 ring shard 文件,每个 shard 位于某个有效 device root 的 `shards/` 下,`shard_to_device` 用来把 shard 映射到对应 device worker。 | +| 对齐 | SSD shard 使用 `O_DIRECT` 绕过 page cache,减少大 payload 双重缓存;对应要求 buffer 地址、IO 长度和文件 offset 512-byte 对齐,由 `AlignedBuffer` / `align_ssd_io_len` 保证。 | | 写队列 | `persist_from_addr` 只把任务送入某个 device 的有界 writer queue;后台 writer 控制 inflight 数量,并只在本 device 的 `shard_ids` 内分配 ring 空间。 | | 读队列 | `load_into_addr_chunks` 先 pin committed 索引,再按 `entry.shard_id -> shard_to_device` 找到对应 device reader queue。只要 chunk staging 地址、文件 offset 和 staging 容量满足对齐约束,就直接读入目标 staging;否则读到 scratch aligned buffer 后只复制当前 chunk 的真实 payload 长度。 | -| io_uring 
| 每个有效 device 拥有自己的 `UringIoEngine`,engine 内多个后台线程持有 `IoUring`,使用 `readv/writev` 提交该 device 的 shard 文件 IO。底层每个 uring shard 有独立 read/write 发送队列,按 read/write inflight 比例调度,优先保护 kvcache 回填读延迟。 | +| io_uring | 每个有效 device 拥有自己的 `UringIoEngine`,engine 内多个后台线程持有 `IoUring`,使用 `readv/writev` 提交该 device 的 shard 文件 IO。底层每个 uring shard 有独立 read/write 发送队列,按 read/write inflight 比例调度,优先保护 KV 回填读延迟。 | | 索引状态 | 新写入先进入 `Writing`;只有 IO 完成且 offset 仍有效时才转为 `Committed`。 | | 位置保护 | `load_into_addr_chunks` 在 producer 生命周期内 pin committed entry;writer 分配新 ring 空间前检查 pinned read 和未完成 `Writing` entry,必要时等待 active IO 释放位置。 | | ring 失效 | shard head 推进超过容量时推进 tail,并移除被覆盖 key-version 的本地索引。 | @@ -805,19 +807,6 @@ std::thread::Builder::new() `KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 `_files` 和 `_io`,确保该 device 的 shard fd 与 uring 线程生命周期覆盖所有读写 task。`UringIoEngine::drop` 会关闭 read/write channel,并 join 所有 uring 线程。 -## 3FS 和 foyer 对照 - -| 参考点 | 对 kvcache SSD 的结论 | -| --- | --- | -| foyer read/write split queue | 已落地到底层 `UringIoEngine`。写入 flush 和回填读进入不同发送队列,同一 uring shard 内用 inflight 比例避免读饥饿。 | -| foyer multi-partition device | 已落地到 owner 内部 per-device worker。`large_file_paths` 仍是唯一配置来源;owner 按 `metadata.dev()` 去重后为每个有效 device 建独立 writer/read queue、uring engine 和 shard 集。 | -| foyer block buffer/reclaimer | 适合后续把小 key-version 合并成 blob,并用 blob index 加速恢复;当前 kvcache value 以较大连续 payload 为主,先保留单 key-version 连续写入。 | -| 3FS write-new-position then commit metadata | 当前 `Writing/Committed` 两阶段索引已经匹配这条原则:IO 成功前不暴露 SSD 副本。 | -| 3FS read holds chunk position reference | 已落地到 SSD ring 内部。读提交前 pin entry;tail 推进和物理 offset 复用必须避开 pinned read。 | -| 3FS aligned direct read | 已落地 aligned fast path。master 自己控制 SSD source staging allocation,因此可以在 allocation 内选择对齐后的 source 地址,并把 SSD IO 长度扩到 512-byte 对齐;真实 payload 长度仍用于 transfer 和用户可见 `MemHolder`。 | -| 3FS batch read/RDMA response | Fluxon 已复用现有 transfer engine,并已落地 read/transfer chunk pipeline;后续优化重点放在批量 SSD stage、批量 transfer 和小窗口 staging allocation。 | -| PegaFlow 
fire-and-forget SSD ingest | 已落地到 put 路径。master 在 `PutDone` 中先提交内存 route,再通过后台 `post_put_ssd_replica_persist` 触发 owner 本地 SSD persist;owner 落盘成功后用 `SsdReplicaCommitReq` 独立提交 SSD route。 | - ## 不变量 - `ssd_replicas` 和 `nodes_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`,不能跨版本复用。 @@ -833,4 +822,4 @@ std::thread::Builder::new() ## 关键结论 -这套实现把 SSD 做成和 CPU segment 同级的分布式数据源副本,但不新增并行的用户 API 或传输协议。Pegaflow 的优势被放在 owner 内部 IO 层:异步 direct IO、分片 ring、提交态隔离和队列背压;foyer 的 read/write 队列调度用于保护回填读延迟;3FS 的位置生命周期、aligned direct read 和 read/transfer chunk pipeline 用于保护 active IO 并减少大 payload 回填拷贝和串行等待。后续重点是批量 SSD stage、批量 transfer、小窗口 staging allocation 和 pipeline 观测指标。Fluxon 的优势继续由原有内存 KV 路由、allocation、transfer 和 holder 生命周期承接。 +这套实现把 SSD 做成和 CPU segment 同级的分布式数据源副本,但不新增并行的用户 API 或传输协议。写入侧先用 `PutDone` 提交内存 route,再由 target owner 本地异步写入 SSD,写成功后用 `SsdReplicaCommitReq` 补交 SSD route;读取侧先走内存副本,内存副本不可用时由 SSD owner 把本地 shard ring 中的 committed entry 按 chunk 读入 source staging,并边读边 transfer 到 requester target。owner 内部的分片 ring、`O_DIRECT` 对齐、`io_uring` 队列、`Writing/Committed` 两阶段索引、read pin、direct/scratch read 和 read/transfer pipeline 都服务于同一个目标:让 SSD 成为可回填的数据源,同时保持原有 KV 路由、allocation、transfer 和 holder 生命周期不变。后续重点是批量 SSD stage、批量 transfer、小窗口 staging allocation 和 pipeline 观测指标。 From 9bb1284d36532c5c7633cbe2cbbce3a53a4f9266 Mon Sep 17 00:00:00 2001 From: zTz01 <1773266173@qq.com> Date: Sun, 5 Jul 2026 17:37:28 +0800 Subject: [PATCH 3/4] docs: update SSD KV storage design --- ...30\345\202\250\350\256\276\350\256\241.md" | 309 ++++++++++-------- 1 file changed, 177 insertions(+), 132 deletions(-) diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" index f4da5ff..0f3ac5a 100644 --- "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" +++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" @@ -29,15 +29,15 @@ 
fluxonkv_spec: | 范围 | 当前结论 | | --- | --- | -| 分布式 SSD 读取 | 已接入。`GetStart` 可以返回任意 SSD owner,source staging allocation 位于 SSD owner,target allocation 位于请求方 owner。 | -| owner 内部多 SSD 路径 | 已接入。SSD root 来自 `large_file_paths`,先按 device 去重;每个有效 device 有独立 writer/reader queue、uring engine 和 shard 集,单 owner 可以利用多块真实本地 SSD。 | -| 内存 KV 复用 | 已复用。SSD 回填由 SSD owner 侧调用 `transfer_data_no_copy` 按 chunk push 到 requester target,全部 chunk transfer 完成后由 SSD owner 调 master `get_done`;requester 只复用返回的 holder 结果构造 `MemHolder`。 | -| SSD 写入 IO 模型 | 已接入。分片 ring、`O_DIRECT`、`io_uring`、有界队列、两阶段提交和 tail 失效都在 owner 本地 `KvSsdStorage` 内完成;写路径已经把内存 `PutDone` 和 SSD commit 拆开。 | -| ring 位置生命周期 | 已接入。读 IO 提交前 pin committed entry;未完成的 `Writing` entry 和 pinned read entry 都会阻止物理位置复用。 | +| 分布式 SSD 读取 | 已接入。读取 key 时,master 仍优先选择可用内存副本;没有可用内存副本时,才选择持有 SSD 副本的 owner。磁盘数据先读到 SSD owner 本机的 source staging,再传到请求方 owner 的 target allocation。 | +| owner 内部多 SSD 路径 | 已接入。owner 可通过多个 `large_file_paths` 使用多块本地 SSD;路径会先按实际 device 去重,只有落在不同 device 上的 SSD cache root 目录才会创建独立读写队列、`UringIoEngine` 和 shard 文件集。 | +| 内存 KV 复用 | 已复用。SSD 回填继续走现有 KV transfer 链路:SSD owner 按 chunk 读出数据后,通过 `transfer_data_no_copy` 写到请求方 target;全部 chunk 完成后,SSD owner 向 master 提交 `get_done`,用户侧仍通过普通 `get` 拿到 `MemHolder`,不需要调用 SSD 专用接口。 | +| SSD 写入 IO 模型 | 已接入。owner 完成内存 `PutDone` 后,再异步把同一份 payload 写入本地 SSD。SSD 写入在 `KvSsdStorage` 内完成,使用 shard ring、`O_DIRECT`、`io_uring`、有界队列和 `Writing -> Committed` 两阶段提交。 | +| ring 位置生命周期 | 已接入。SSD 读写会保护正在使用的物理位置:读 IO 提交前会 pin 已提交的 entry;未完成写入的 `Writing` entry 和正在读取的 pinned entry 都不会被新的写入覆盖。 | | 大 payload direct stage | 已接入 aligned fast path 和 chunk pipeline。master 给 SSD source staging 多分配最多 511 bytes,并在 allocation 内返回 512-byte 对齐后的 `src_addr`;SSD read 按 chunk 对齐 IO 长度直接写入 staging,chunk ready 后立刻 transfer,`MemHolder` 仍只使用真实 payload 长度。 | -| 冷启动恢复 | 当前不扫描 SSD shard 重建 master 路由;路由仍由本轮运行时的 `put/get/delete` 生命周期产生。 | -| lease key 专门治理 | 当前没有单独的 lease SSD 生命周期策略;lease 与普通 key 共用 key-version 路由约束。 | -| 独立 SSD 路径参数 | 不提供。SSD 
根目录从 `large_file_paths` 派生,避免和日志、共享 bundle、FS disk cache 混用。 | +| 冷启动恢复 | 当前不支持。owner 启动时不会扫描已有 SSD shard 来重建 master 路由;SSD 副本路由只来自本轮运行期间的 `put/get/delete` 生命周期。 | +| lease key 专门治理 | 当前没有专用策略。带 lease 的 key 和普通 key 使用同一套 key-version 路由与 SSD 副本生命周期,SSD 层不单独维护 lease 过期扫描或清理规则。 | +| 独立 SSD 路径参数 | 不提供。SSD cache 目录统一从 owner 的 `large_file_paths` 派生,不再增加单独的 SSD 路径配置,避免日志、共享 bundle、FS disk cache 和 KV SSD cache 出现多套路径来源。 | ## 数据流 @@ -49,9 +49,9 @@ flowchart TD B --> C["async KvSsdStorage.persist_from_addr(key, put_id, addr, len)"] C --> D["copy payload to 512-byte aligned buffer"] D --> E["per-device writer queue"] - E --> E2["shard ring entry: Writing"] - E2 --> E3["O_DIRECT + io_uring writev"] - E3 --> F["commit index: Writing -> Committed"] + E --> E2["SsdRingBuffer 分配 shard_id + file_offset,记录 Writing entry"] + E2 --> E3["O_DIRECT + io_uring writev 写入 SSD shard 文件"] + E3 --> F["提交索引:Writing -> Committed"] F --> I["owner -> master SsdReplicaCommit"] I --> J["master route: ssd_replicas"] @@ -63,8 +63,9 @@ flowchart TD N -->|yes| O["allocate source staging on SSD owner"] O --> P["allocate target on requester"] P --> Q["return GetSourceKind::Ssd"] - Q --> R0["pin committed SSD entry"] - R0 --> R["chunk read: direct staging or scratch fallback"] + Q --> R0["SsdRingBuffer pin committed entry"] + R0 --> R1["根据 entry.shard_id 找到 device reader queue"] + R1 --> R["从 SSD shard 文件按 file_offset 读取 chunk"] R --> S["SsdLoadedChunk(offset,len)"] S --> W["SSD owner transfer chunk: staging+offset -> requester target+offset"] W --> T["all chunks done: SSD owner -> master GetDoneReq"] @@ -75,7 +76,7 @@ flowchart TD ## 端到端调用时序 -SSD 路径只在两个位置扩展主链路:`put_done` 提交内存副本后,owner 异步把本地 target allocation 落到 SSD,并在完成后单独提交 SSD 副本;`get_start` 找不到可用内存副本时,master 为 SSD owner 分配 source staging,再由 SSD owner 按 chunk 把磁盘数据读入 staging 并 push 到 requester target。`get_done` 和 holder 生命周期继续走内存 KV 的原 master 逻辑,但 SSD source 路径的 `GetDoneReq` 由 SSD owner 在全部 chunk transfer 完成后发起,requester 只消费 `SsdStageReadResp` 里带回的 done 
结果。 +SSD 路径只在两个位置扩展主链路:`put_done` 提交内存副本后,owner 异步把本地 target allocation 落到 SSD,并在完成后单独提交 SSD 副本;`get_start` 找不到可用内存副本时,master 为 SSD owner 分配 source staging,再由 SSD owner 按 chunk 把磁盘数据读入 staging 并 push 到 requester target。`get_done` 和 `MemHolder` 生命周期仍复用原有内存 KV 逻辑。SSD 回填时,最终 holder 对应的是请求方 owner 上的 target allocation;SSD owner 只负责从本地 SSD 读出数据、把全部 chunk 传到请求方 target,并在传输完成后向 master 调用 `GetDoneReq`。master 返回的 holder 字段会由 SSD owner 放入 `SsdStageReadResp` 带回请求方,请求方再用这些字段构造普通 `MemHolder`。 ```mermaid sequenceDiagram @@ -83,7 +84,7 @@ sequenceDiagram participant M as master participant SO as SSD owner participant TE as transfer engine - participant SSD as owner SSD shard + participant SSD as SSD shard files / SsdRingBuffer C->>M: PutStartReq(key, len) M-->>C: PutStartResp(target allocation) @@ -93,7 +94,7 @@ sequenceDiagram M-->>C: PutDoneResp M->>C: async SsdReplicaPersistReq(key, put_id, target_addr, len) C->>SSD: KvSsdStorage.persist_from_addr(...) - Note over SSD: aligned buffer -> per-device writer queue\nshard ring Writing -> O_DIRECT + io_uring writev -> Committed + Note over SSD: aligned buffer -> per-device writer queue\nSsdRingBuffer 分配 shard_id + file_offset\nO_DIRECT + io_uring writev 写入 SSD shard 文件\nWriting -> Committed C->>M: SsdReplicaCommitReq(key, put_id, node_id, len) Note right of M: ssd_replicas 写入 SSD 副本 @@ -105,14 +106,14 @@ sequenceDiagram M-->>C: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) C->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) - Note over SSD: pin committed entry\nchunk read direct to source staging or scratch fallback + Note over SSD: SsdRingBuffer pin committed entry\n按 entry.shard_id 进入 device reader queue\n从 SSD shard 文件按 file_offset 读取 chunk loop each ready chunk SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) SO->>TE: transfer_data_no_copy(write, stage+offset -> 
target+offset, chunk_len) end SO->>M: GetDoneReq(get_id) Note right of M: target allocation 进入 get_holding\nsource_allocation 释放 - M-->>SO: GetDoneResp(holder_id) + M-->>SO: GetDoneResp(holder_id for requester target) SO-->>C: SsdStageReadResp(done_holder_id, done_allocation_mode) end opt source_kind=Memory @@ -123,6 +124,7 @@ sequenceDiagram end ``` + ## 当前实现 | 模块 | 职责 | @@ -177,36 +179,48 @@ master 持有 `put` 的权威控制面状态:`inflight_puts` 记录未完成 ```rust pub struct MasterKvRouterInner { + // PutStart 到 PutDone / PutRevoke 期间保留的 put 在途状态。 pub inflight_puts: moka::future::Cache<(String, u64, u32), InflightPutInfo>, + // 已提交 key-version 的权威路由表。 pub kv_routes: DashMap>, ... } pub struct InflightPutInfo { + // 放置策略最终选中的 target owner。 pub node_id: NodeID, pub key: String, + // 发起这次 put 的原始请求节点。 pub req_node_id: NodeID, pub len: u64, + // PutDone 前保留 source / target allocation,避免内存被提前释放。 pub src_target_allocation: Arc>>, } pub struct OneKvNodesRoutes { + // 当前已提交 value 的稳定版本号。 pub put_id: PutIDForAKey, + // 内存副本路由;PutDone 成功后立即写入。 pub nodes_replicas: RwLock>, + // SSD 副本路由;只记录 owner 和长度,不保存本地文件 offset。 pub ssd_replicas: RwLock>, ... 
} pub struct PutDoneReq { pub key: String, + // 和当前 route 版本匹配时,才提交内存副本。 pub put_id: PutIDForAKey, pub lease_id: Option, } pub struct SsdReplicaCommitReq { pub key: String, + // SSD late commit 必须用这个版本号防止污染新 route。 pub put_id: PutIDForAKey, + // 完成 SSD persist 的 owner 节点。 pub node_id: NodeIDString, + // 真实 payload 长度;SSD 文件 offset 只保存在 owner 本地。 pub len: u64, } ``` @@ -221,6 +235,7 @@ owner 持有数据面:本机 CPU segment、可选 SSD store、put transfer 和 ```rust pub struct ClientKvApiInner { + // owner 本地可选 SSD cache;external 不直接持有它。 ssd_storage: Option>, rpc_caller_put_start: RPCCaller, rpc_caller_put_done: RPCCaller, @@ -231,48 +246,65 @@ pub struct ClientKvApiInner { pub struct SsdReplicaPersistReq { pub key: String, pub put_id: PutIDForAKey, + // 已经 PutDone 的内存 target 绝对地址,owner 从这里复制 payload 到 SSD。 pub target_addr: u64, pub len: u64, } pub struct KvSsdStorage { + // 按 device 去重后的 SSD cache root 目录。 root_dirs: Vec, + // 每个有效 device 对应一个读写 worker。 devices: Vec, + // shard_id 到 device worker 的映射,读路径按它选择 reader queue。 shard_to_device: Vec, + // 写入按有效 device 做 round-robin。 next_write_device: AtomicUsize, + // 全部 shard ring 和 key-version 索引的共享状态。 inner: Arc>, + // ring 空间被 active IO 占住时,用它通知 writer 重试。 space_notify: Arc, } struct SsdDeviceWorker { + // Linux metadata.dev() 得到的实际 device 标识。 device_id: u64, root_dir: PathBuf, + // 这个 device 负责的 shard 文件编号。 shard_ids: Vec, + // 持有 shard 文件 fd,保证 uring IO 生命周期内 fd 有效。 _files: Vec, + // 这个 device 独立的 io_uring engine。 _io: Arc, + // per-device 写队列。 write_tx: tokio_mpsc::Sender, + // per-device 读队列。 read_tx: tokio_mpsc::Sender, } struct KvSsdStorageInner { + // 管理各 shard 文件内的环形 offset 空间和 key-version 索引。 ring: SsdRingBuffer, } ``` -owner 如果是最终 target,先完成原有 transfer 和 `PutDoneReq`,让内存副本进入 `nodes_replicas`。master 随后在后台 task 里把 `SsdReplicaPersistReq { key, put_id, target_addr, len }` 发回 target owner,并持有 target allocation 的 `Arc`,保证 owner 从内存复制到 SSD 期间 payload 不会被释放或复用。 +当 master 把这次 put 的最终 target allocation 放在某个 owner 上时,这个 owner 就是该 
key-version 的内存副本 owner。`PutDoneReq` 只把这个 target allocation 提交到 `nodes_replicas`;提交完成后,这个 key-version 已经可以被普通 `get` 读到。SSD 落盘不在 `PutDoneReq` 的同步路径里;master 会在后台 task 中向同一个 target owner 发送 `SsdReplicaPersistReq { key, put_id, target_addr, len }`。这个后台 task 会继续持有 target allocation 的 `Arc`,保证在 owner 把 payload 从内存复制到 SSD 完成之前,这块内存不会被释放或复用。 -owner 的 `rpc_ssd_replica_persist` handler 收到请求后,从 target allocation 的绝对地址调用 `persist_local_kv_to_ssd(...)`,进入 `KvSsdStorage::persist_from_addr(key, put_id, addr, len)`。`persist_from_addr` 把真实 payload 拷到 512-byte 对齐的 `AlignedBuffer`,`persist_buffer` 通过 `next_write_device` round-robin 选择一个 `SsdDeviceWorker.write_tx` 并等待后台 writer 完成。每个 `ssd_writer_loop` 只拿自己的 `shard_ids` 调 `SsdRingBuffer::prepare_write_on_shards(...)`,在 `ring.entries` 中建立 `Writing(SsdIndexEntry)`;对应 device 的 `UringIoEngine` 对该 shard 文件执行 `O_DIRECT + writev`,成功后提交为 `Committed(SsdIndexEntry)`。这之后 owner 发送 `SsdReplicaCommitReq` 给 master,补交 SSD 副本。写队列和底层 uring 队列都是有界队列;当 SSD 写入慢于提交速度时,背压停在 owner 本地 SSD persist 路径,不改变已经完成的内存 `PutDone` 语义。 +target owner 收到 `SsdReplicaPersistReq` 后,从 `target_addr` 指向的内存 target 复制完整 payload,并构造 512-byte 对齐的 `AlignedBuffer`。随后 `persist_buffer` 按 value 级别通过 `next_write_device` round-robin 选择一个有效 device 的 `write_tx`;当前实现不会把同一个 payload 拆到多个 device。该 device 的 `ssd_writer_loop` 只在自己的 `shard_ids` 中选择一个 shard,由 `SsdRingBuffer::prepare_write_on_shards(...)` 为整个 aligned payload 分配一段连续 `file_offset`,并先记录 `Writing(SsdIndexEntry)`。对应 device 的 `UringIoEngine` 对这个 shard 文件执行 `O_DIRECT + writev`;写入成功后,entry 才从 `Writing` 提交为 `Committed`。最后 owner 向 master 发送 `SsdReplicaCommitReq`;master 校验请求里的 `put_id` 与当前内存 route 的 `put_id` 相同后,才会把这个 key-version 的 SSD 副本补充进 `ssd_replicas`。写队列和底层 uring 队列都是有界队列;如果 SSD 变慢,背压只停在 owner 本地 SSD persist 路径,不会回头改变已经完成的内存 `PutDone` 语义。 #### external -external 只持有写入请求上下文和 owner 暴露的 mmap offset,不持有 SSD route 或 SSD index。 +external 的状态边界只到 owner mmap 写入:它保存本次 put 所需的 `key`、`len`、`put_id` 和 mmap offset。SSD route 由 master 管理,SSD 文件位置由 target
owner 本地 `SsdRingBuffer` 管理,external 不保存也不更新这些状态。 ```rust pub struct ExternalPutStartReq { pub key: String, pub len: u64, + // 透传给 master PutStart,用于拒绝同 key 并发 put。 pub reject_if_inflight_same_key: bool, + // 透传给 master 放置策略,影响 target owner 选择。 pub preferred_sub_cluster: Option, + // owner 代际校验,防止旧 external 请求提交到新 owner。 pub started_time: i64, pub test_observe_put_phases: bool, } @@ -280,10 +312,15 @@ pub struct ExternalPutStartReq { pub struct ExternalPutTransferEndReq { pub key: String, pub len: u64, + // external 实际写入的 owner mmap offset;远端 target 时它是本地 staging。 pub src_offset: u64, + // 本地 target 时等于最终 target;远端 target 时由 owner 内部上下文修正。 pub target_offset: u64, + // 远端 target owner;本地 target 时为空。 pub peer_id: Option, + // 远端 target owner 的 base addr;本地 target 时为空。 pub target_base_addr: Option, + // ExternalPutStart 返回的版本号,TransferEnd 用它完成 PutDone。 pub put_id: Option, pub lease_id: Option, pub started_time: i64, @@ -319,9 +356,9 @@ sequenceDiagram SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) end SO->>M: GetDoneReq(get_id) - Note right of M: target allocation -> get_holding\nsource_allocation 释放 - M-->>SO: GetDoneResp(holder_id) - SO-->>RO: SsdStageReadResp(done fields) + Note right of M: requester target allocation -> get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id for requester target) + SO-->>RO: SsdStageReadResp(done_holder_id, done_allocation_mode) end opt source_kind=Memory RO->>RO: transfer_data_no_copy(read, src_addr -> target_addr, len) @@ -334,45 +371,60 @@ sequenceDiagram #### master -master 持有 `get` 的权威路由、在途 allocation 和完成后的 holder authority。 +master 是 `get` 的控制面 authority:`kv_routes` 决定当前 key-version 可以从哪些内存或 SSD 副本读取,`inflight_gets` 记录本次 get 的 source/target allocation,`get_holding` 记录 `GetDone` 后仍被 holder 持有的 requester target allocation。 ```rust pub struct MasterKvRouterInner { + // GetStart 到 GetDone / GetRevoke 期间保留的 get 在途状态。 pub inflight_gets: moka::future::Cache, + // GetDone 后的 holder 
authority,键由 requester 节点和 holder_id 组成。 pub get_holding: MasterOwnerMemMgr, + // get_start 查询的当前稳定 key-version 路由。 pub kv_routes: DashMap>, ... } pub struct OneKvNodesRoutes { + // 当前稳定版本号,内存副本和 SSD 副本共享它。 pub put_id: PutIDForAKey, + // 内存副本优先作为 get source。 pub nodes_replicas: RwLock>, + // 内存副本不可用时才作为 SSD fallback source。 pub ssd_replicas: RwLock>, pub get_durable_slots_used: AtomicU32, } pub struct KvSsdRouteInfo { + // 持有本地 SSD 副本的 owner。 pub node_id: NodeID, + // 真实 payload 长度;SSD stage 和 transfer 对外只暴露这个长度。 pub len: u64, + // 和内存 route 对齐的节点代际,用于失效判断。 pub tomb_tag: NodeTombTag, } pub struct InflightGetInfo { + // 本次 get 命中的 key-version,用于拒绝过期完成。 pub put_id: PutIDForAKey, + // master 选中的 source 节点;SSD fallback 时是 SSD owner。 pub src_node_id: NodeID, + // 发起 get 的 requester owner,最终 holder 归属使用它。 pub req_node_id: NodeID, pub len: u64, + // requester target allocation,GetDone 后进入 get_holding。 pub allocation: Arc, + // SSD source staging allocation;memory source 路径为空。 pub source_allocation: Option>, pub route: Arc, pub allocation_mode: GetAllocationMode, + // 区分 memory source 和 SSD fallback source。 pub source_kind: GetSourceKind, } ``` -master 处理 `GetStartReq` 时,先查 `kv_routes`。有 live 内存副本时,返回 `GetSourceKind::Memory`。只有内存副本不可用时,master 才从 `ssd_replicas` 里选 SSD owner,并分配两块 CPU segment allocation:`source_allocation` 在 SSD owner 上,`allocation` 在 requester owner 上。`GetStartResp.src_addr` 是 SSD owner 本地对齐 staging 地址,`GetStartResp.target_addr` 是 requester target 地址,`GetStartResp.ssd_stage_len` 是对齐后的 source staging 容量,`GetStartResp.len` 始终是真实 payload 长度。 +master 处理 `GetStartReq` 时先查 `kv_routes`,并优先选择 live 内存副本。命中内存副本时,`GetStartResp` 返回 `GetSourceKind::Memory`,requester owner 按原有 transfer 路径把数据搬到 requester target。只有没有可用内存副本时,master 才从 `ssd_replicas` 里选择 SSD owner,并同时分配两块 allocation:`source_allocation` 位于 SSD owner,用作本地读盘 staging;`allocation` 位于 requester owner,是最终进入 holder 的 target。`GetStartResp.src_addr` 是 SSD owner 本地对齐后的 staging 地址,`target_addr` 是 requester target 
地址,`ssd_stage_len` 是对齐后的 source staging 容量,`len` 始终是真实 payload 长度。 -`GetDoneReq` 到达后,master 把 `InflightGetInfo.allocation` 转入 `get_holding`,返回 `holder_id`。memory source 路径的 `GetDoneReq` 由 requester owner 发送;SSD source 路径的 `GetDoneReq` 由 SSD owner 在全部 chunk transfer 完成后发送。master 不依赖 RPC 调用者身份决定 holder 归属,而是使用 `InflightGetInfo.req_node_id` 作为 holder 节点。`InflightGetInfo.source_allocation` 只服务 SSD owner 本地读盘 staging 和 owner-side push,不进入 `get_holding`。 +`GetDoneReq` 到达后,master 从 `inflight_gets` 取出本次 get,把 requester target allocation 转入 `get_holding`,并返回 `holder_id`。memory source 路径由 requester owner 调用 `GetDoneReq`;SSD source 路径由 SSD owner 在全部 chunk transfer 完成后调用。无论谁发起 `GetDoneReq`,holder 都归属 `InflightGetInfo.req_node_id` 对应的 requester owner,SSD owner 的 `source_allocation` 只作为读盘 staging,不进入 `get_holding`。 #### owner @@ -380,7 +432,9 @@ owner 在 `get` 里有两个可能角色:requester owner 负责调用 master ```rust pub struct ClientKvApiInner { + // requester owner 和 SSD owner 都通过它访问本地 SSD cache。 ssd_storage: Option>, + // external get 返回的 holder 在 owner 侧的借用表。 pub external_get_holding: OwnerExternalMemMgr, rpc_caller_get_start: RPCCaller, rpc_caller_get_done: RPCCaller, @@ -391,38 +445,52 @@ pub struct ClientKvApiInner { pub struct SsdStageReadReq { pub key: String, pub put_id: PutIDForAKey, + // SSD owner 用它在全部 chunk transfer 完成后调用 master GetDoneReq。 pub get_id: u64, + // master 在 SSD owner 上分配的 source staging 对齐地址。 pub stage_addr: u64, + // source staging 容量,包含 O_DIRECT 对齐需要的空间。 pub stage_len: u64, + // 最终接收数据的 requester owner。 pub target_node_id: NodeIDString, + // requester target allocation 的绝对地址。 pub target_addr: u64, + // 真实 payload 长度。 pub len: u64, } pub struct SsdStageReadResp { + // master GetDoneResp 的 holder_id 投影。 pub done_holder_id: u64, + // master GetDoneResp 的 allocation_mode 投影。 pub done_allocation_mode: GetAllocationMode, + // master GetDoneResp 的状态字段投影。 pub done_error_code: ErrorCode, pub done_error_json: String, pub done_server_process_us: i64, + // SsdStageRead RPC 
自身的状态字段。 pub error_code: ErrorCode, pub error_json: String, } ``` -requester owner 收到 `GetSourceKind::Memory` 后走原有 transfer 分支,然后自己发送 `GetDoneReq`。requester owner 收到 `GetSourceKind::Ssd` 后调用 `stage_kv_from_ssd_source(...)`,该函数返回 `GetDoneResp` 对应字段;requester 跳过自己的 transfer,也跳过自己的 `get_done`,直接用返回的 done 结果构造 holder。 +requester owner 收到 `GetSourceKind::Memory` 时,继续走原有内存 transfer:从 `src_addr` 读,把数据写到 `target_addr`,传输完成后由 requester owner 自己调用 master `GetDoneReq`。收到 `GetSourceKind::Ssd` 时,requester owner 不自己读 SSD,也不自己调用 `get_done`;它向 SSD owner 发起 `SsdStageReadReq`,等待 `SsdStageReadResp` 带回 master `GetDoneResp` 的 holder 字段。 -SSD owner 的 `rpc_ssd_stage_read` task 调用 `load_and_push_kv_from_ssd(...)`。这个函数内部把 `KvSsdStorage::load_into_addr_chunks(...)` 作为生产者,把 `transfer_loaded_ssd_chunks(...)` 作为消费者:生产者 pin 当前 committed entry,按 chunk 把磁盘数据读入 master 分配的 `stage_addr + offset`;消费者每收到一个 `SsdLoadedChunk`,立即用 `transfer_data_no_copy(peer=target_node_id, peer_src_or_target=false, stage_addr + offset, target_addr + offset, chunk_len, None)` push 到 requester target。所有 chunk transfer 成功后,SSD owner 用 `SsdStageReadReq.get_id` 调 master `GetDoneReq`,并把 `GetDoneResp` 拆成 `SsdStageReadResp.done_*` 字段返回给 requester。读路径进入 per-device reader queue,底层 `UringIoEngine` 把 read/write 分成独立发送队列,并按 inflight 比例补读,避免回填读长期排在持续写入之后。 +SSD owner 收到 `SsdStageReadReq` 后,在本地执行 `load_and_push_kv_from_ssd(...)`。read producer 先 pin 当前 committed entry,再按 chunk 从 SSD shard 文件读到 `stage_addr + offset`;transfer consumer 每收到一个 `SsdLoadedChunk`,就把 `stage_addr + offset` 推到 requester 的 `target_addr + offset`。全部 chunk transfer 成功后,SSD owner 用 `get_id` 向 master 调 `GetDoneReq`,再把返回的 `holder_id` 和 `allocation_mode` 填入 `SsdStageReadResp.done_*` 返回 requester。读路径进入 per-device reader queue,底层 `UringIoEngine` 把 read/write 分成独立发送队列,并按 inflight 比例补读,避免回填读长期排在持续写入之后。 ```rust struct SsdRingBuffer { + // key-version 到 Writing/Committed SSD 位置的全局索引。 entries: HashMap, + // active read pin,防止 writer 推进 tail 覆盖正在读取的位置。 read_pins: HashMap, ... 
} enum SsdEntryState { + // 已分配 offset 但 writev 尚未完成。 Writing(SsdIndexEntry), + // writev 成功后才允许 get_start 作为 SSD source 命中。 Committed(SsdIndexEntry), } ``` @@ -431,40 +499,48 @@ enum SsdEntryState { #### external -external 只发 `ExternalGetReq` 给 owner,并接收 owner 返回的 holder metadata。SSD route、SSD index、source staging allocation 都不会进入 external 进程。 +external 的状态边界只到 owner 返回的 mmap holder:它发 `ExternalGetReq` 给 requester owner,并接收 `ExternalMemHolderInfo { offset, len, holder_id }`。SSD route 由 master 管理,SSD 文件位置和 source staging 由 SSD owner 管理,external 不保存也不更新这些状态。 ```rust pub struct ExternalGetReq { pub key: String, + // external 通过 owner 发起 get,req_node_id 仍指向请求方身份。 pub req_node_id: String, + // owner 代际校验,防止过期 external 请求继续使用旧 owner。 pub started_time: i64, } pub struct ExternalGetResp { pub error_code: ErrorCode, pub error_json: String, + // 成功时返回 external 可见的 holder 元数据。 pub external_memholder_info: Option, } pub struct ExternalMemHolderInfo { + // external attach 到 owner mmap 后可见的 offset。 pub offset: u64, + // 真实 payload 长度。 pub len: u32, + // 后续 release ack 使用的 holder id。 pub holder_id: u64, } pub struct ExternalMemHolder { pub offset: u64, + // 当前 external 进程内 mmap 后的绝对地址。 pub addr: u64, pub len: u32, pub holder_id: u64, pub key: String, pub external_client_id: String, + // drop/release 时校验 owner 代际。 pub owner_start_time: i64, ... 
} ``` -owner 内部普通 `get` 完成后,会把 external 借用关系写入 `external_get_holding`,再返回 `ExternalMemHolderInfo { offset, len, holder_id }`。external 构造 `ExternalMemHolder` 后只通过 mmap offset/addr 读取结果。holder drop 时,external 发 `ExternalDeleteAckReq` 给 owner;owner 再释放 external 借用,并通过原有 owner -> master holder ack 链路释放 `get_holding`。 +owner 内部完成普通 `get` 后,会把 `MemoryInfo` 写入 `external_get_holding`,用这条 owner 侧引用代表 external 当前仍在借用该 holder;随后 owner 只把 `ExternalMemHolderInfo { offset, len, holder_id }` 返回给 external。external 构造 `ExternalMemHolder` 后,通过 owner mmap 的 `offset/addr` 读取结果。external holder drop 时,会向 owner 发送 `ExternalDeleteAckReq`;owner 删除 `external_get_holding` 中对应记录,释放 external 这一份引用。只有当 owner 侧不再有其它 `Arc` 引用时,`MemoryInfo` drop 才会沿用原有 owner -> master holder ack 链路释放 master `get_holding`。 ### stage 失败和释放 @@ -485,49 +561,16 @@ sequenceDiagram ```rust pub struct GetRevokeReq { + // 要撤销的在途 get。 pub get_id: u64, + // 只有 SSD stage 失败时才置 true,用来删除失败的 SSD source route。 pub drop_ssd_source: bool, } ``` -SSD stage 失败时,请求方调用 `get_revoke_ssd_source(...)`,也就是 `GetRevokeReq { drop_ssd_source: true }`。master 从 `inflight_gets` 找到 `InflightGetInfo`,只有 `source_kind == GetSourceKind::Ssd` 时才会删除 `route.ssd_replicas[src_node_id]`。如果同一 `OneKvNodesRoutes` 下已经没有 live 内存副本和 SSD 副本,master 再删除 `kv_routes` 并异步清理 prefix index。 - -RPC 字段里,`len` 始终是真实 payload 长度;`ssd_stage_len` / `stage_len` 是 SSD direct IO 需要的 staging 容量,通常是 512-byte 对齐后的长度。`target_addr` 只表示 requester target,不再表示 SSD owner 本地 staging。`SsdStageReadReq.get_id` 让 SSD owner 在全部 chunk transfer 完成后替 requester 完成 master `GetDoneReq`;`SsdStageReadResp.done_*` 是 master `GetDoneResp` 的字段投影,供 requester 复用原有 holder 构造逻辑。 +SSD stage 失败时,请求方调用 `get_revoke_ssd_source(...)`,也就是 `GetRevokeReq { drop_ssd_source: true }`。master 从 `inflight_gets` 找到本次 get,只有 `source_kind == GetSourceKind::Ssd` 时才会删除 `route.ssd_replicas[src_node_id]`,避免后续 get 继续选择同一个失败 SSD source。如果同一个 `OneKvNodesRoutes` 下已经没有 live 内存副本和 SSD 副本,master 再删除 `kv_routes` 并异步清理 prefix index。 -```rust 
-pub struct GetStartResp { - pub get_id: u64, - pub node_id: NodeIDString, - pub put_id: PutIDForAKey, - pub source_kind: GetSourceKind, - pub target_addr: u64, - pub src_addr: u64, - pub len: u64, - pub ssd_stage_len: u64, - ... -} - -pub struct SsdStageReadReq { - pub key: String, - pub put_id: PutIDForAKey, - pub get_id: u64, - pub stage_addr: u64, - pub stage_len: u64, - pub target_node_id: NodeIDString, - pub target_addr: u64, - pub len: u64, -} - -pub struct SsdStageReadResp { - pub done_holder_id: u64, - pub done_allocation_mode: GetAllocationMode, - pub done_error_code: ErrorCode, - pub done_error_json: String, - pub done_server_process_us: i64, - pub error_code: ErrorCode, - pub error_json: String, -} -``` +这里的释放边界是:SSD owner 上的 `source_allocation` 只服务本次 stage,失败后随 `inflight_gets` 清理释放;requester target allocation 没有进入 `get_holding`,因此不会生成用户可见 holder。 ## 关键代码片段 @@ -538,21 +581,19 @@ pub struct SsdStageReadResp { ```rust pub struct PutDoneReq { pub key: String, + // 只提交这个 key-version 的内存副本。 pub put_id: PutIDForAKey, pub lease_id: Option, } +// 这里只把内存 target 写入 nodes_replicas;SSD 副本稍后独立 commit。 one_kv_routes .nodes_replicas .write() .insert(node_id.clone(), completed_info); ``` -这段逻辑用到的字段边界是: - -- `put_id` 由 `OneKvNodesRoutes` 承载,SSD 副本和内存副本共享同一个版本。 -- `nodes_replicas` 代表内存副本 ready;`get_start` 可以立即从这里返回内存 source。 -- `ssd_replicas` 不能在这一步写入,否则 `PutDone` 会被 SSD 延迟拖住。 +这段边界是:`nodes_replicas` 代表内存副本 ready,`get_start` 可以立即从这里返回 memory source;`ssd_replicas` 不能在这一步写入,否则 `PutDone` 会被 SSD persist 延迟拖住。SSD 副本后续用同一个 `put_id` 独立提交。 ### SSD replica 独立 commit @@ -561,13 +602,18 @@ SSD owner 后台 persist 成功后,单独向 master 提交同一个 key-versio ```rust pub struct SsdReplicaCommitReq { pub key: String, + // 必须匹配当前 route 版本,避免 SSD late commit 污染新版本。 pub put_id: PutIDForAKey, + // 完成落盘的 SSD owner。 pub node_id: NodeIDString, + // 真实 payload 长度。 pub len: u64, } if let Some(route) = kv_routes.get(&req.key) { + // 过期 put_id 直接丢弃,不 resurrect 旧版本。 if route.put_id == req.put_id { + // master 
只保存 SSD owner 和长度;文件 offset 留在 owner 本地 ring index。 route.ssd_replicas.write().insert( node_id.clone(), KvSsdRouteInfo { @@ -580,22 +626,19 @@ if let Some(route) = kv_routes.get(&req.key) { } ``` -这段逻辑用到的字段边界是: - -- `SsdReplicaCommitReq.put_id` 必须等于当前 `OneKvNodesRoutes.put_id`。 -- `SsdReplicaCommitReq.node_id` 必须对应当前 route 内已经 ready 的内存副本;master 用同一节点的 `KvRouteInfo.tomb_tag` 作为 SSD route 的 tomb 代际。 -- `SsdReplicaCommitReq.len` 记录真实 payload 长度,后续 SSD stage 和 transfer 都按这个长度对外可见。 -- `KvSsdRouteInfo` 不保存 SSD 文件 offset;offset 只在 owner 本地 SSD ring index 中。 -- late commit 命中过期 `put_id` 时直接丢弃,不能 resurrect 旧版本。 +master 只在 `req.put_id == route.put_id` 时写 `ssd_replicas`;过期 `put_id` 的 late commit 会被丢弃,不能 resurrect 旧版本。`SsdReplicaCommitReq.len` 是真实 payload 长度;SSD shard 文件 offset 不进入 master route,只留在 target owner 本地 `SsdRingBuffer`。 ### get_start 分配分布式 SSD staging SSD fallback 发生在 master 已经没有可用 `nodes_replicas` 之后。source staging 一定分配在 SSD owner 的 CPU segment 上,target allocation 一定分配在 requester 的 CPU segment 上。 ```rust +// SSD read 使用 O_DIRECT,读长度先按 512 bytes 对齐。 let ssd_stage_len = align_ssd_io_len(ssd_replica.len)?; +// 额外预留 511 bytes,确保 allocation 内能找到 512-byte 对齐地址。 let source_alloc_len = ssd_stage_len + SSD_ALIGNMENT as u64 - 1; +// source staging 放在 SSD owner 上,只服务本次读盘和 push。 let source_allocation = allocate_get_buffer_on_node( &view, &ssd_replica.node_id, @@ -603,6 +646,7 @@ let source_allocation = allocate_get_buffer_on_node( get_id, "ssd source staging", )?; +// target allocation 放在 requester 上,GetDone 后转成最终 holder。 let target_allocation = allocate_get_buffer_on_node( &view, &req_node_id, @@ -611,16 +655,11 @@ let target_allocation = allocate_get_buffer_on_node( "requesting target", )?; +// 返回给 SSD owner 的是对齐后的 staging 地址,不一定等于 allocation 起点。 let source_addr = align_ssd_stage_addr(source_base + source_allocation.addr())?; ``` -这段逻辑的关键字段关系是: - -- `KvSsdRouteInfo.node_id` 决定 source staging 的 owner。 -- `source_alloc_len = align_up(len, 512) + 511`,保证 allocation 内总能找到 
512-byte 对齐的 `src_addr`。 -- `GetStartResp.src_addr` 返回对齐后的绝对地址,不一定等于 `source_allocation` 的起始地址。 -- `InflightGetInfo.source_allocation` 持有原始 allocation,保证对齐后的 `src_addr` 在整个 SSD read/push 期间有效。 -- `InflightGetInfo.allocation` 持有 requester target;memory source 由 requester `get_done` 转成 holder,SSD source 由 SSD owner `get_done` 转成 holder。 +这里的关键边界是:`source_allocation` 在 SSD owner 上,只用于读盘 staging;`target_allocation` 在 requester owner 上,成功 `GetDone` 后进入 `get_holding`。`source_alloc_len = align_up(len, 512) + 511`,保证 allocation 内总能找到 512-byte 对齐的 `src_addr`;`src_addr` 是对齐后的 staging 地址,不一定等于 `source_allocation` 起点。 ### requester 触发 SSD owner stage/push/done @@ -629,6 +668,7 @@ let source_addr = align_ssd_stage_addr(source_base + source_allocation.addr())?; ```rust let mut ssd_done_resp = None; if resp.source_kind == GetSourceKind::Ssd { + // SSD owner 负责读盘、push chunk,并在完成后调用 master GetDoneReq。 let done_resp = self.stage_kv_from_ssd_source( &resp.node_id, key, @@ -644,53 +684,62 @@ if resp.source_kind == GetSourceKind::Ssd { } if resp.source_kind == GetSourceKind::Ssd { - // SSD owner already pushed all chunks to target_addr and called get_done. + // SSD owner 已经把全部 chunk push 到 target_addr,并完成 get_done。 } else { + // memory source 路径仍由 requester 自己做 transfer。 self.view.client_transfer_engine() .transfer_data_no_copy(peer_id, true, resp.src_addr, resp.target_addr, len, None) .await?; } let done_resp = if let Some(done_resp) = ssd_done_resp { + // SSD source 路径复用 SsdStageReadResp 带回的 GetDoneResp 字段。 done_resp } else { + // memory source 路径的 GetDoneReq 仍由 requester 发送。 self.get_done(get_id).await? 
}; ``` -`stage_kv_from_ssd_source(...)` 的分支只有两个: - -- `source_node_id == self`:本地调用 `load_and_push_kv_from_ssd(...)`,SSD read 生产 chunk,transfer consumer 把每个 chunk 写到本地 `target_addr + offset`,随后直接调用 `get_done(get_id)`。 -- 远端 SSD owner:发送 `SsdStageReadReq`,由 `rpc_ssd_stage_read` task 执行 `load_and_push_kv_from_ssd(...)`,SSD owner 每读出一个 chunk 就 push 到 requester target,全部 chunk transfer 完成后再调 `get_done(get_id)` 并通过 `SsdStageReadResp.done_*` 返回。 +SSD source 路径里,`stage_kv_from_ssd_source(...)` 成功返回时,SSD owner 已经完成读盘、chunk transfer 和 master `GetDoneReq`。requester 因此跳过自己的 transfer 和 `get_done`,直接复用 `SsdStageReadResp.done_*` 构造 holder。memory source 路径仍由 requester 自己 transfer 并调用 `get_done`。 ### SSD chunk read 与 direct/scratch fallback -SSD owner 侧的核心结构是 `SsdLoadedChunk` 和 `ReadCommand`。`SsdLoadedChunk` 是 read producer 交给 transfer consumer 的最小就绪单元;`ReadCommand.file_offset` 让同一个 committed entry 可以按 chunk 提交不同文件偏移的读。 +当前实现只有 SSD 回填读路径会把 payload 切成 chunk;SSD 写入按 value 级别一次写入一个 device 的一个 shard 连续 offset。`SsdLoadedChunk` 是 read producer 交给 transfer consumer 的最小就绪单元;`ReadCommand` 记录本次 chunk 要读的 committed entry、shard 文件 offset 和读入目标。 ```rust pub(crate) struct SsdLoadedChunk { + // 当前 chunk 在完整 payload 中的偏移。 pub offset: u64, + // 当前 chunk 在 SSD owner source staging 中的起始地址。 pub stage_addr: u64, + // 当前 chunk 的真实 payload 长度,不包含 O_DIRECT padding。 pub len: u64, } struct ReadCommand { key: KvSsdKey, + // 已 pin 的 committed entry,里面包含 shard_id、file_offset 和长度。 entry: SsdIndexEntry, + // 这次 chunk read 在 SSD shard 文件内的起始 offset。 file_offset: u64, + // Direct 表示直接读入 staging,Scratch 表示先读入 aligned buffer。 target: ReadTarget, + // 持有 read pin,防止 writer 在读完成前覆盖该位置。 _read_pin: Option, done_tx: oneshot::Sender>, } ``` -`load_and_push_kv_from_ssd(...)` 把 SSD read 和 transfer 并行起来。producer 最多保留 `DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT` 个读 IO;consumer 最多保留同样数量的 transfer future。这样大 payload 场景里,前一个 chunk 还在网络传输时,后续 chunk 可以继续从 SSD 读入 staging。 +`load_and_push_kv_from_ssd(...)` 把 SSD read 和 transfer 
做成流水线:producer 按 chunk 提交 SSD read,并把读好的 chunk 放入 ready queue;consumer 收到 ready chunk 后立即 push 到 requester target。读和传输可以重叠,多个 chunk 可以同时处于 read inflight 或 transfer inflight 状态。 ```rust +// ready queue 让 read producer 和 transfer consumer 解耦。 let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT.saturating_mul(2).max(1), ); +// producer 按 chunk 从 SSD shard 文件读入 source staging。 let producer = store.load_into_addr_chunks( key, put_id, @@ -701,13 +750,16 @@ let producer = store.load_into_addr_chunks( DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, chunk_tx, ); +// consumer 收到 ready chunk 后立即 push 到 requester target。 let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); +// 两个 future 并发执行,形成 read-transfer pipeline。 let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); ``` -`load_into_addr_chunks(...)` 先 pin 当前 committed entry,pin 生命周期覆盖整个 producer。每个 chunk 根据 `stage_addr + offset`、`entry.file_offset + offset` 和剩余 staging 容量选择 direct 或 scratch;chunk read 完成后立即发送 `SsdLoadedChunk`。 +`load_into_addr_chunks(...)` 先 pin 当前 committed entry,pin 生命周期覆盖整个 producer。每个 chunk 用 `entry.file_offset + offset` 定位 SSD shard 文件中的读取位置,并根据 staging 地址、文件 offset 和 staging 容量选择 direct 或 scratch;chunk read 完成后立即发送 `SsdLoadedChunk`。 ```rust +// pin 生命周期覆盖整个 producer,writer 不能覆盖 active read 位置。 let (entry, _read_pin) = { let mut inner = self.inner.lock(); let Some(entry) = inner.ring.pin_read(&key) else { @@ -716,23 +768,29 @@ let (entry, _read_pin) = { (entry, SsdReadPin { ... 
}) }; +// 每个 chunk 在同一个 committed entry 内推进文件 offset。 let file_offset = entry.file_offset + offset; let target = match choose_chunk_read_path(stage_addr, read_len, target_len, file_offset) { + // staging 地址、文件 offset 和 IO 长度都满足对齐时走 direct read。 SsdReadPath::Direct => ReadTarget::Direct { target_addr: stage_addr, len: read_len as usize, }, + // 否则先读到 aligned scratch buffer。 SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len as usize)?), }; +// submit_read_command 根据 entry.shard_id 进入对应 device reader queue。 let output = submit_read_command(key, entry, file_offset, target, None).await?; if let ReadOutput::Scratch(buffer) = output { + // scratch 路径只把真实 payload 长度复制到 staging。 copy_payload_to_stage(buffer, stage_addr, payload_len)?; } +// 下游 transfer 只看到真实 payload 长度。 ready_tx.send(SsdLoadedChunk { offset, stage_addr, len: payload_len }).await?; ``` -direct 路径把 `readv` 的目标直接设为当前 chunk 的 source staging;scratch 路径先读入 aligned buffer,再只复制当前 chunk 的真实 payload 长度到 staging。两条路径最后都只把真实 payload 长度暴露给 transfer 和 `MemHolder`。 +direct 路径把 `readv` 的目标直接设为当前 chunk 的 source staging;scratch 路径先读入 aligned buffer,再只复制当前 chunk 的真实 payload 长度到 staging。两条路径最后都只把真实 payload 长度暴露给 transfer 和 `MemHolder`,不会把 `O_DIRECT` padding 暴露给用户。 ## IO 模型 @@ -759,30 +817,36 @@ flowchart TD | 组件 | 设计 | | --- | --- | -| device root | owner 从 `large_file_paths` 派生 SSD root,创建目录后读取 `metadata.dev()`;同一 device 只保留第一个 root。 | -| shard 文件 | `max_bytes` 是 owner 本地 SSD cache 的容量上限;owner 将它拆成多个本地 ring shard 文件,每个 shard 位于某个有效 device root 的 `shards/` 下,`shard_to_device` 用来把 shard 映射到对应 device worker。 | -| 对齐 | SSD shard 使用 `O_DIRECT` 绕过 page cache,减少大 payload 双重缓存;对应要求 buffer 地址、IO 长度和文件 offset 512-byte 对齐,由 `AlignedBuffer` / `align_ssd_io_len` 保证。 | -| 写队列 | `persist_from_addr` 只把任务送入某个 device 的有界 writer queue;后台 writer 控制 inflight 数量,并只在本 device 的 `shard_ids` 内分配 ring 空间。 | -| 读队列 | `load_into_addr_chunks` 先 pin committed 索引,再按 `entry.shard_id -> shard_to_device` 找到对应 device reader queue。只要 
chunk staging 地址、文件 offset 和 staging 容量满足对齐约束,就直接读入目标 staging;否则读到 scratch aligned buffer 后只复制当前 chunk 的真实 payload 长度。 | -| io_uring | 每个有效 device 拥有自己的 `UringIoEngine`,engine 内多个后台线程持有 `IoUring`,使用 `readv/writev` 提交该 device 的 shard 文件 IO。底层每个 uring shard 有独立 read/write 发送队列,按 read/write inflight 比例调度,优先保护 KV 回填读延迟。 | -| 索引状态 | 新写入先进入 `Writing`;只有 IO 完成且 offset 仍有效时才转为 `Committed`。 | -| 位置保护 | `load_into_addr_chunks` 在 producer 生命周期内 pin committed entry;writer 分配新 ring 空间前检查 pinned read 和未完成 `Writing` entry,必要时等待 active IO 释放位置。 | -| ring 失效 | shard head 推进超过容量时推进 tail,并移除被覆盖 key-version 的本地索引。 | +| device root | owner 从 `large_file_paths` 派生 SSD cache root;创建目录后用 `metadata.dev()` 判断真实 device,同一 device 只保留一个有效 root。 | +| shard 文件 | `max_bytes` 是 owner 本地 SSD cache 的容量上限;容量被拆成多个 shard 文件,分布到有效 device root 的 `shards/` 下。`shard_to_device` 记录每个 shard 属于哪个 device。 | +| 写入选路 | `persist_buffer` 用 `next_write_device` round-robin 选择一个 device;一个 payload 只进入这个 device 的 writer queue,并在该 device 的某个 shard 中分配一段连续 `file_offset`。 | +| 读取选路 | committed entry 保存 `shard_id` 和 `file_offset`;读 chunk 时通过 `entry.shard_id -> shard_to_device` 找到 device reader queue,再从对应 shard 文件的 `file_offset + offset` 读取。 | +| per-device worker | 每个有效 device 有独立 writer queue、reader queue 和 `UringIoEngine`;这些 worker 只处理本 device 的 shard 文件 IO。 | +| 对齐与回收 | SSD shard 使用 `O_DIRECT`,要求地址、长度和文件 offset 512-byte 对齐;不满足 direct 条件的读 chunk 走 scratch buffer。ring head/tail 和 read pin 只在 owner 本地保护 shard 文件位置,不进入 master route。 | ## Task / Actor / 独立线程 -SSD 路径里有三层异步执行单元。控制面仍复用 KV 原有 actor;SSD 只为 owner 本地磁盘 IO 增加后台 task 和独立 uring 线程。owner 内部的 SSD task 按去重后的 effective device 创建;多个 `large_file_paths` 如果落在同一个 `metadata.dev()` 上,只创建一组 device worker。 +这一节只列运行时执行单元,不再重复 device/shard 选路。SSD 没有新增独立的 master route actor;控制面仍由原有 master/owner RPC handler 承载。新增的后台执行主要在 owner 本地:每个有效 device 有 writer task、reader task 和对应的 `UringIoEngine` 后台线程。 + +### owner 本地 SSD IO 执行单元 | 执行单元 | 创建位置 | 类型 | 输入 | 职责 | | --- | --- | --- | --- | --- | | 
`ssd_writer_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.write_tx` | 从 `persist_from_addr` 接收写任务,只在本 device 的 `shard_ids` 内调用 `SsdRingBuffer::prepare_write_on_shards`,提交 `writev`,完成后 `commit(Writing -> Committed)`。 | | `ssd_reader_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.read_tx` | 从 `load_into_addr_chunks` 接收属于本 device shard 的 chunk 读任务,提交 direct/scratch `readv`,校验 offset 仍有效,完成后回传 chunk 读结果;整条 producer 完成后释放 `SsdReadPin`。 | | `fluxon-kv-ssd-uring-{idx}` | 每个 device 的 `UringIoEngine::new_multi` | `std::thread::spawn` | `read_rx/write_rx: crossbeam::channel` | 每个线程持有一个 `IoUring`,只提交本 device shard 文件的 `Readv/Writev` SQE,并按 read/write inflight 比例调度后回传 CQE。 | + +`KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 shard fd 和 `UringIoEngine`,保证 fd 与 uring 线程生命周期覆盖 writer/reader task;drop 时关闭 channel 并 join uring 线程。 + +### 控制面 RPC / 清理任务 + +| 执行单元 | 创建位置 | 类型 | 输入 | 职责 | +| --- | --- | --- | --- | --- | | `rpc_ssd_replica_commit` | `MasterKvRouter` RPC handler 注册 | `view.spawn(...)` | `SsdReplicaCommitReq` | owner SSD persist 成功后提交 SSD 副本,master 校验 `put_id` 后写 `ssd_replicas`。 | | `rpc_ssd_stage_read` | `ClientKvApi` RPC handler 注册 | `view.spawn(...)` | `SsdStageReadReq` | 远端 SSD owner 收到 stage 请求后,在 owner 进程内调用 `load_and_push_kv_from_ssd(...)`;SSD read producer 和 transfer consumer 流水线完成后,再调用 master `get_done` 并回传 done fields。 | | `ssd_failure_remove_prefix_index` | `get_revoke(drop_ssd_source=true)` | `view.spawn(...)` | 失败 SSD source 的 key | 当失败 SSD source 是最后一个 live replica 时,异步删除 prefix index。 | -没有单独的 SSD master route actor。SSD route 的权威更新点仍是原有 master RPC handler: +SSD route 的权威更新点仍是原有 master RPC handler: - `PutDone`:同步更新 `nodes_replicas`,让内存副本立即可读。 - `SsdReplicaCommit`:SSD persist 完成后同步更新 `ssd_replicas`,并拒绝过期 `put_id`。 @@ -790,36 +854,17 @@ SSD 路径里有三层异步执行单元。控制面仍复用 KV 原有 actor; - `GetRevoke`:同步删除失败 SSD source;必要时触发 prefix index 小任务。 - `Delete` / 覆盖写失效:复用原有 `delete_broadcast` 管线。 
-后台 task 的生命周期绑定 `KvSsdStorage`: - -```rust -for device in deduplicated_device_roots { - let io = Arc::new(UringIoEngine::new_multi(device_shard_fds, cfg)?); - task::spawn(ssd_writer_loop(..., shard_ids.clone())); - task::spawn(ssd_reader_loop(...)); - devices.push(SsdDeviceWorker { shard_ids, _io: io, ... }); -} - -std::thread::Builder::new() - .name(format!("fluxon-kv-ssd-uring-{idx}")) - .spawn(move || UringShard { ... }.run())?; -``` - -`KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 `_files` 和 `_io`,确保该 device 的 shard fd 与 uring 线程生命周期覆盖所有读写 task。`UringIoEngine::drop` 会关闭 read/write channel,并 join 所有 uring 线程。 - ## 不变量 -- `ssd_replicas` 和 `nodes_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`,不能跨版本复用。 -- `PutDoneReq` 只表示内存副本 ready,不能记录 SSD 副本。 -- master 只有在收到匹配当前 `put_id` 的 `SsdReplicaCommitReq` 后才能记录 SSD 副本。 +- `nodes_replicas` 和 `ssd_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`,不能跨版本复用。 +- `PutDoneReq` 只表示内存副本 ready;SSD 副本只能由匹配当前 `put_id` 的 `SsdReplicaCommitReq` 补充进 `ssd_replicas`。 - `SsdReplicaCommitReq` 是内部控制面 RPC,不改变用户侧 `put/get/delete` API。 -- `GetSourceKind::Ssd` 必须有 source staging allocation,并由 master 持有到 SSD owner 发起的 `get_done` 或 requester 发起的 `get_revoke`。 +- `GetSourceKind::Ssd` 必须同时有 SSD owner source staging 和 requester target allocation;成功后只有 requester target allocation 进入 `get_holding`。 - SSD 回填失败必须通过 `get_revoke(drop_ssd_source=true)` 清理 in-flight get,并从 master 路由里移除失败的 SSD 副本。 -- SSD ring 本地失效后,master 可能短暂保留旧 SSD 路由;下一次 stage 失败会触发主动路由失效。 -- SSD ring tail 推进不能覆盖 active IO:未完成的 `Writing` entry 和 pinned read entry 必须先释放。 -- SSD direct stage 只在目标地址、SSD 内部对齐长度和文件 offset 都满足 512-byte 对齐,且 staging 容量覆盖对齐长度时启用;transfer 和用户可见 `MemHolder` 长度始终保持真实 payload 长度。 - master 路由被删除后,旧 SSD bytes 即使还在 shard 文件里,也不能被公共 `get` 命中。 ## 关键结论 -这套实现把 SSD 做成和 CPU segment 同级的分布式数据源副本,但不新增并行的用户 API 或传输协议。写入侧先用 `PutDone` 提交内存 route,再由 target owner 本地异步写入 SSD,写成功后用 `SsdReplicaCommitReq` 补交 SSD route;读取侧先走内存副本,内存副本不可用时由 SSD owner 把本地 shard ring 中的 committed entry 按 chunk 读入 source 
staging,并边读边 transfer 到 requester target。owner 内部的分片 ring、`O_DIRECT` 对齐、`io_uring` 队列、`Writing/Committed` 两阶段索引、read pin、direct/scratch read 和 read/transfer pipeline 都服务于同一个目标:让 SSD 成为可回填的数据源,同时保持原有 KV 路由、allocation、transfer 和 holder 生命周期不变。后续重点是批量 SSD stage、批量 transfer、小窗口 staging allocation 和 pipeline 观测指标。 +这套实现把 SSD 作为内存 KV 之外的可回填数据源副本,而不是新增一套用户 API。写入侧先完成内存 `PutDone`,再由 target owner 异步落 SSD,并通过 `SsdReplicaCommitReq` 补充 SSD route;读取侧优先使用内存副本,内存副本不可用时由 SSD owner 从本地 shard 文件读出数据,按 chunk push 到 requester target,再复用原有 `get_done` 和 holder 生命周期。 + +因此,SSD 相关的 shard ring、`O_DIRECT`、`io_uring`、read pin 和 read/transfer pipeline 都限制在 owner 本地实现内;master 只保存这个 key-version 有哪些 owner 持有 SSD 副本,以及 value 的真实 payload 长度,不保存 SSD 文件 offset、shard_id 或本地 ring 状态。 From 59999727b20157674faf46766fe8694bcfc7c6bb Mon Sep 17 00:00:00 2001 From: zTz01 <1773266173@qq.com> Date: Sun, 5 Jul 2026 17:49:13 +0800 Subject: [PATCH 4/4] docs: update SSD KV storage design --- ...\255\230\345\202\250\350\256\276\350\256\241.md" | 13 ------------- 1 file changed, 13 deletions(-) diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" index 0f3ac5a..77175cc 100644 --- "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" +++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" @@ -125,19 +125,6 @@ sequenceDiagram ``` -## 当前实现 - -| 模块 | 职责 | -| --- | --- | -| `fluxon_kv/src/config.rs` | 解析 `fluxonkv_spec.ssd_storage.max_bytes`,禁止 external 声明该字段,派生 SSD 根目录。 | -| `fluxon_kv/src/kv_ssd_storage.rs` | owner 内部 SSD cache。使用 shard 文件、`O_DIRECT`、`io_uring`、有界读写队列和两阶段索引管理 key-version bytes。 | -| `client_kv_api/put.rs` | owner 是最终 target 时,先通过 `PutDoneReq` 提交内存副本;SSD persist 由 master 的后台 `SsdReplicaPersistReq` 触发,owner 完成本地落盘后再通过独立 SSD commit 上报。 | -| `client_kv_api/get.rs` | `GetSourceKind::Ssd` 时,请求方让 SSD owner stage、push 
并完成 `get_done`;stage RPC 成功后跳过请求方 transfer,也跳过请求方 `get_done`。 | -| `client_kv_api/msg_pack.rs` | 定义 `SsdStageReadReq/SsdStageReadResp` 和 `SsdReplicaPersistReq/SsdReplicaPersistResp`,分别用于 SSD stage 读、回传 done 结果,以及 master 触发 owner 本地 SSD persist。 | -| `master_kv_router/put.rs` | `put_done` 只提交内存副本,随后异步发起 `SsdReplicaPersistReq`;`SsdReplicaCommitReq` 单独写 `ssd_replicas`。 | -| `master_kv_router/get.rs` | 内存副本优先;无内存副本时从 `ssd_replicas` 中选择可用 owner,分配 source staging 和 requester target。 | -| `master_kv_router/delete.rs` | 内存副本被驱逐时,如果同 key-version 仍有 SSD 副本,保留 `kv_routes`。 | - ## 接口里的角色分工 SSD 逻辑按接口看最清楚:`put` 先让一个 key-version 的内存副本 ready,再异步补交 SSD 副本;`get` 决定读请求先走内存副本还是 SSD fallback。每个接口里再分 master、owner、external 三个角色看状态归属。