diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" new file mode 100644 index 0000000..77175cc --- /dev/null +++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" @@ -0,0 +1,857 @@ +# KV 设计 5 - SSD 存储 + +## 设计目标 + +SSD 存储在 Fluxon KV 中作为 owner 本地 backing tier 接入通用 KV 链路。它不是一套独立的读写 API,也不改变用户侧 `put/get/delete` 语义;master 仍然以 key-version 为单位维护路由,内存副本是第一数据源,SSD 副本是内存副本不可用时的回填数据源。 + +读取侧采用“内存优先、SSD 回填”的设计。`GetStart` 优先选择 live 内存副本;没有可用内存副本时,master 才选择 SSD owner,并分配 SSD owner 本机 source staging 和 requester target。SSD owner 从本地 SSD 读入 source staging,再复用现有 transfer engine 把数据推到 requester target,最后继续使用原有 `GetDone` 和 `MemHolder` 生命周期。 + +## 公共契约 + +公共配置只有一个 owner-only 字段: + +```yaml +fluxonkv_spec: + large_file_paths: [/data/fluxon_large] + ssd_storage: + max_bytes: 4294967296 +``` + +规则: + +- `ssd_storage` 缺省或为 `null` 时不启用 SSD。 +- `max_bytes` 必须大于或等于 512 bytes,满足当前 `O_DIRECT` 对齐约束。 +- zero-contribution external 禁止声明 `ssd_storage`;external 只能通过 owner 的 mmap、RPC 和 transfer surface 访问 SSD 回填结果。 +- 实际目录为每个可用 `large_file_root` 下的 `_cluster_kv_ssd_storage//`;owner 启动时创建目录并读取 `metadata.dev()`,同一个 device 只保留第一个 root,避免多个路径指向同一块盘时制造虚假的 IO 并行度。 +- 用户侧 `put/get/delete` API 不因 SSD 增加新入口;SSD 副本是 master 路由内部能力。 + +## 范围边界 + +| 范围 | 当前结论 | +| --- | --- | +| 分布式 SSD 读取 | 已接入。读取 key 时,master 仍优先选择可用内存副本;没有可用内存副本时,才选择持有 SSD 副本的 owner。磁盘数据先读到 SSD owner 本机的 source staging,再传到请求方 owner 的 target allocation。 | +| owner 内部多 SSD 路径 | 已接入。owner 可通过多个 `large_file_paths` 使用多块本地 SSD;路径会先按实际 device 去重,只有落在不同 device 上的 SSD cache root 目录才会创建独立读写队列、`UringIoEngine` 和 shard 文件集。 | +| 内存 KV 复用 | 已复用。SSD 回填继续走现有 KV transfer 链路:SSD owner 按 chunk 读出数据后,通过 `transfer_data_no_copy` 写到请求方 target;全部 chunk 完成后,SSD owner 向 master 提交 `get_done`,用户侧仍通过普通 `get` 拿到 `MemHolder`,不需要调用 SSD 专用接口。 | +| SSD 写入 IO 模型 | 已接入。owner 完成内存 `PutDone` 后,再异步把同一份 payload 写入本地 
SSD。SSD 写入在 `KvSsdStorage` 内完成,使用 shard ring、`O_DIRECT`、`io_uring`、有界队列和 `Writing -> Committed` 两阶段提交。 | +| ring 位置生命周期 | 已接入。SSD 读写会保护正在使用的物理位置:读 IO 提交前会 pin 已提交的 entry;未完成写入的 `Writing` entry 和正在读取的 pinned entry 都不会被新的写入覆盖。 | +| 大 payload direct stage | 已接入 aligned fast path 和 chunk pipeline。master 给 SSD source staging 多分配最多 511 bytes,并在 allocation 内返回 512-byte 对齐后的 `src_addr`;SSD read 按 chunk 对齐 IO 长度直接写入 staging,chunk ready 后立刻 transfer,`MemHolder` 仍只使用真实 payload 长度。 | +| 冷启动恢复 | 当前不支持。owner 启动时不会扫描已有 SSD shard 来重建 master 路由;SSD 副本路由只来自本轮运行期间的 `put/get/delete` 生命周期。 | +| lease key 专门治理 | 当前没有专用策略。带 lease 的 key 和普通 key 使用同一套 key-version 路由与 SSD 副本生命周期,SSD 层不单独维护 lease 过期扫描或清理规则。 | +| 独立 SSD 路径参数 | 不提供。SSD cache 目录统一从 owner 的 `large_file_paths` 派生,不再增加单独的 SSD 路径配置,避免日志、共享 bundle、FS disk cache 和 KV SSD cache 出现多套路径来源。 | + +## 数据流 + +```mermaid +flowchart TD + A["owner put target allocation"] --> B["write bytes into owner mmap"] + B --> G["owner -> master PutDone(memory_ready)"] + G --> H["master route: nodes_replicas"] + B --> C["async KvSsdStorage.persist_from_addr(key, put_id, addr, len)"] + C --> D["copy payload to 512-byte aligned buffer"] + D --> E["per-device writer queue"] + E --> E2["SsdRingBuffer 分配 shard_id + file_offset,记录 Writing entry"] + E2 --> E3["O_DIRECT + io_uring writev 写入 SSD shard 文件"] + E3 --> F["提交索引:Writing -> Committed"] + F --> I["owner -> master SsdReplicaCommit"] + I --> J["master route: ssd_replicas"] + + GS["get_start"] --> K{"live memory replica?"} + K -->|yes| L["return GetSourceKind::Memory"] + L --> M["existing transfer path"] + + K -->|no| N{"live SSD replica?"} + N -->|yes| O["allocate source staging on SSD owner"] + O --> P["allocate target on requester"] + P --> Q["return GetSourceKind::Ssd"] + Q --> R0["SsdRingBuffer pin committed entry"] + R0 --> R1["根据 entry.shard_id 找到 device reader queue"] + R1 --> R["从 SSD shard 文件按 file_offset 读取 chunk"] + R --> S["SsdLoadedChunk(offset,len)"] + S --> W["SSD owner transfer chunk: 
staging+offset -> requester target+offset"] + W --> T["all chunks done: SSD owner -> master GetDoneReq"] + T --> V["SsdStageReadResp carries GetDoneResp fields"] + + N -->|no| U["KeyNotFound"] +``` + +## 端到端调用时序 + +SSD 路径只在两个位置扩展主链路:`put_done` 提交内存副本后,owner 异步把本地 target allocation 落到 SSD,并在完成后单独提交 SSD 副本;`get_start` 找不到可用内存副本时,master 为 SSD owner 分配 source staging,再由 SSD owner 按 chunk 把磁盘数据读入 staging 并 push 到 requester target。`get_done` 和 `MemHolder` 生命周期仍复用原有内存 KV 逻辑。SSD 回填时,最终 holder 对应的是请求方 owner 上的 target allocation;SSD owner 只负责从本地 SSD 读出数据、把全部 chunk 传到请求方 target,并在传输完成后向 master 调用 `GetDoneReq`。master 返回的 holder 字段会由 SSD owner 放入 `SsdStageReadResp` 带回请求方,请求方再用这些字段构造普通 `MemHolder`。 + +```mermaid +sequenceDiagram + participant C as requester owner + participant M as master + participant SO as SSD owner + participant TE as transfer engine + participant SSD as SSD shard files / SsdRingBuffer + + C->>M: PutStartReq(key, len) + M-->>C: PutStartResp(target allocation) + Note over C: payload 写入 target allocation + C->>M: PutDoneReq(memory_ready) + Note right of M: nodes_replicas 写入内存副本\nkey-version 立即可读\nspawn post_put_ssd_replica_persist + M-->>C: PutDoneResp + M->>C: async SsdReplicaPersistReq(key, put_id, target_addr, len) + C->>SSD: KvSsdStorage.persist_from_addr(...) 
+ Note over SSD: aligned buffer -> per-device writer queue\nSsdRingBuffer 分配 shard_id + file_offset\nO_DIRECT + io_uring writev 写入 SSD shard 文件\nWriting -> Committed + C->>M: SsdReplicaCommitReq(key, put_id, node_id, len) + Note right of M: ssd_replicas 写入 SSD 副本 + + C->>M: GetStartReq(key) + alt live memory replica exists + M-->>C: GetStartResp(source_kind=Memory, src_addr, target_addr) + else only SSD replica exists + Note right of M: 在 SSD owner CPU segment 分配 source_allocation\n在 requester CPU segment 分配 target allocation + M-->>C: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) + C->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) + SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + Note over SSD: SsdRingBuffer pin committed entry\n按 entry.shard_id 进入 device reader queue\n从 SSD shard 文件按 file_offset 读取 chunk + loop each ready chunk + SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) + SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) + end + SO->>M: GetDoneReq(get_id) + Note right of M: target allocation 进入 get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id for requester target) + SO-->>C: SsdStageReadResp(done_holder_id, done_allocation_mode) + end + opt source_kind=Memory + C->>TE: transfer_data_no_copy(read, src_addr -> target_addr, len) + C->>M: GetDoneReq(get_id) + Note right of M: target allocation 进入 get_holding + M-->>C: GetDoneResp(holder_id) + end +``` + + +## 接口里的角色分工 + +SSD 逻辑按接口看最清楚:`put` 先让一个 key-version 的内存副本 ready,再异步补交 SSD 副本;`get` 决定读请求先走内存副本还是 SSD fallback。每个接口里再分 master、owner、external 三个角色看状态归属。 + +### put + +```mermaid +sequenceDiagram + participant E as external + participant O as owner + participant M as master + participant SSD as owner SSD store + + E->>O: ExternalPutStartReq(key, len) + O->>M: PutStartReq(key, len) + Note right of M: 分配 put_id 和 src/target allocation\n记录 
inflight_puts + M-->>O: PutStartResp(put_id, src_addr, target_addr) + O-->>E: ExternalPutStartResp(offsets, put_id) + + Note over E,O: external 写 owner mmap/staging + E->>O: ExternalPutTransferEndReq(put_id) + O->>O: transfer_data_no_copy if remote target + O->>M: PutDoneReq(memory_ready) + Note right of M: 写 nodes_replicas\nkey-version 立即可读\nspawn post_put_ssd_replica_persist + M-->>O: PutDoneResp + O-->>E: ExternalPutTransferEndResp + M->>O: async SsdReplicaPersistReq(key, put_id, target_addr, len) + O->>SSD: persist_from_addr(key, put_id, target_addr, len) + Note over SSD: device write_tx -> per-device ssd_writer_loop -> io_uring writev\nWriting -> Committed + O->>M: SsdReplicaCommitReq(key, put_id, node_id, len) + Note right of M: 写 ssd_replicas +``` + +#### master + +master 持有 `put` 的权威控制面状态:`inflight_puts` 记录未完成写入,`kv_routes` 记录提交后的当前版本。当前实现里 `PutDoneReq` 只表示内存副本 ready;SSD 副本通过独立 `SsdReplicaCommitReq` 进入 route。 + +当前协议结构如下。 + +```rust +pub struct MasterKvRouterInner { + // PutStart 到 PutDone / PutRevoke 期间保留的 put 在途状态。 + pub inflight_puts: moka::future::Cache<(String, u64, u32), InflightPutInfo>, + // 已提交 key-version 的权威路由表。 + pub kv_routes: DashMap>, + ... +} + +pub struct InflightPutInfo { + // 放置策略最终选中的 target owner。 + pub node_id: NodeID, + pub key: String, + // 发起这次 put 的原始请求节点。 + pub req_node_id: NodeID, + pub len: u64, + // PutDone 前保留 source / target allocation,避免内存被提前释放。 + pub src_target_allocation: Arc>>, +} + +pub struct OneKvNodesRoutes { + // 当前已提交 value 的稳定版本号。 + pub put_id: PutIDForAKey, + // 内存副本路由;PutDone 成功后立即写入。 + pub nodes_replicas: RwLock>, + // SSD 副本路由;只记录 owner 和长度,不保存本地文件 offset。 + pub ssd_replicas: RwLock>, + ... 
+} + +pub struct PutDoneReq { + pub key: String, + // 和当前 route 版本匹配时,才提交内存副本。 + pub put_id: PutIDForAKey, + pub lease_id: Option, +} + +pub struct SsdReplicaCommitReq { + pub key: String, + // SSD late commit 必须用这个版本号防止污染新 route。 + pub put_id: PutIDForAKey, + // 完成 SSD persist 的 owner 节点。 + pub node_id: NodeIDString, + // 真实 payload 长度;SSD 文件 offset 只保存在 owner 本地。 + pub len: u64, +} +``` + +`PutStartReq` 到达 master 后,master 分配 `put_id` 和源/目标 allocation,并把 allocation 放进 `InflightPutInfo.src_target_allocation`。`PutDoneReq` 到达时,master 只把 target allocation 写入 `nodes_replicas`,此时 key-version 已经可被 `get` 命中。SSD owner 后续完成落盘后再发 `SsdReplicaCommitReq`,master 校验 `kv_routes[key].put_id == put_id` 后,把 `KvSsdRouteInfo { node_id, len, tomb_tag }` 写入同一个 `OneKvNodesRoutes.ssd_replicas`。master 不保存 SSD 文件 offset,也不保存 owner 本地 ring index。 + +#### owner + +owner 持有数据面:本机 CPU segment、可选 SSD store、put transfer 和 SSD persist。当前实现里,SSD persist 发生在 master 收到 `PutDoneReq` 并提交内存路由之后,不能阻塞内存副本 ready。 + +当前 owner 字段如下。 + +```rust +pub struct ClientKvApiInner { + // owner 本地可选 SSD cache;external 不直接持有它。 + ssd_storage: Option>, + rpc_caller_put_start: RPCCaller, + rpc_caller_put_done: RPCCaller, + rpc_caller_ssd_replica_commit: RPCCaller, + ... 
+} + +pub struct SsdReplicaPersistReq { + pub key: String, + pub put_id: PutIDForAKey, + // 已经 PutDone 的内存 target 绝对地址,owner 从这里复制 payload 到 SSD。 + pub target_addr: u64, + pub len: u64, +} + +pub struct KvSsdStorage { + // 按 device 去重后的 SSD cache root 目录。 + root_dirs: Vec, + // 每个有效 device 对应一个读写 worker。 + devices: Vec, + // shard_id 到 device worker 的映射,读路径按它选择 reader queue。 + shard_to_device: Vec, + // 写入按有效 device 做 round-robin。 + next_write_device: AtomicUsize, + // 全部 shard ring 和 key-version 索引的共享状态。 + inner: Arc>, + // ring 空间被 active IO 占住时,用它通知 writer 重试。 + space_notify: Arc, +} + +struct SsdDeviceWorker { + // Linux metadata.dev() 得到的实际 device 标识。 + device_id: u64, + root_dir: PathBuf, + // 这个 device 负责的 shard 文件编号。 + shard_ids: Vec, + // 持有 shard 文件 fd,保证 uring IO 生命周期内 fd 有效。 + _files: Vec, + // 这个 device 独立的 io_uring engine。 + _io: Arc, + // per-device 写队列。 + write_tx: tokio_mpsc::Sender, + // per-device 读队列。 + read_tx: tokio_mpsc::Sender, +} + +struct KvSsdStorageInner { + // 管理各 shard 文件内的环形 offset 空间和 key-version 索引。 + ring: SsdRingBuffer, +} +``` + +当 master 把这次 put 的最终 target allocation 放在某个 owner 上时,这个 owner 就是该 key-version 的内存副本 owner。`PutDoneReq` 只把这个 target allocation 提交到 `nodes_replicas`;提交完成后,这个 key-version 已经可以被普通 `get` 读到。SSD 落盘不在 `PutDoneReq` 的同步路径里;master 会在后台 task 中向同一个 target owner 发送 `SsdReplicaPersistReq { key, put_id, target_addr, len }`。这个后台 task 会继续持有 target allocation 的 `Arc`,保证 owner 从内存复制 payload 到 SSD 之前,这块内存不会被释放或复用。 + +target owner 收到 `SsdReplicaPersistReq` 后,从 `target_addr` 指向的内存 target 复制完整 payload,并构造 512-byte 对齐的 `AlignedBuffer`。随后 `persist_buffer` 按 value 级别通过 `next_write_device` round-robin 选择一个有效 device 的 `write_tx`;当前实现不会把同一个 payload 拆到多个 device。该 device 的 `ssd_writer_loop` 只在自己的 `shard_ids` 中选择一个 shard,由 `SsdRingBuffer::prepare_write_on_shards(...)` 为整个 aligned payload 分配一段连续 `file_offset`,并先记录 `Writing(SsdIndexEntry)`。对应 device 的 `UringIoEngine` 对这个 shard 文件执行 `O_DIRECT + writev`;写入成功后,entry 才从 `Writing` 提交为 
`Committed`。最后 owner 向 master 发送 `SsdReplicaCommitReq`;master 校验请求里的 `put_id` 与当前内存 route 的 `put_id` 相同后,才会把这个 key-version 的 SSD 副本补充进 `ssd_replicas`。写队列和底层 uring 队列都是有界队列;如果 SSD 变慢,背压只停在 owner 本地 SSD persist 路径,不会回头改变已经完成的内存 `PutDone` 语义。 + +#### external + +external 的状态边界只到 owner mmap 写入:它保存本次 put 所需的 `key`、`len`、`put_id` 和 mmap offset。SSD route 由 master 管理,SSD 文件位置由 target owner 本地 `SsdRingBuffer` 管理,external 不保存也不更新这些状态。 + +```rust +pub struct ExternalPutStartReq { + pub key: String, + pub len: u64, + // 透传给 master PutStart,用于拒绝同 key 并发 put。 + pub reject_if_inflight_same_key: bool, + // 透传给 master 放置策略,影响 target owner 选择。 + pub preferred_sub_cluster: Option, + // owner 代际校验,防止旧 external 请求提交到新 owner。 + pub started_time: i64, + pub test_observe_put_phases: bool, +} + +pub struct ExternalPutTransferEndReq { + pub key: String, + pub len: u64, + // external 实际写入的 owner mmap offset;远端 target 时它是本地 staging。 + pub src_offset: u64, + // 本地 target 时等于最终 target;远端 target 时由 owner 内部上下文修正。 + pub target_offset: u64, + // 远端 target owner;本地 target 时为空。 + pub peer_id: Option, + // 远端 target owner 的 base addr;本地 target 时为空。 + pub target_base_addr: Option, + // ExternalPutStart 返回的版本号,TransferEnd 用它完成 PutDone。 + pub put_id: Option, + pub lease_id: Option, + pub started_time: i64, + pub test_observe_put_phases: bool, +} +``` + +external put 仍然是 `ExternalPutStart -> 写 owner mmap -> ExternalPutTransferEnd`。`ExternalPutTransferEndResp` 只代表内存提交完成;SSD 是否启用、何时 persist 成功、何时写入 `ssd_replicas` 都由 owner 和 master 的内部 commit 协议决定。external 只通过 `started_time` 做 owner 代际校验,避免把旧代际请求提交给新 owner。 + +### get + +```mermaid +sequenceDiagram + participant E as external + participant RO as requester owner + participant M as master + participant SO as SSD owner + participant TE as transfer engine + participant SSD as owner SSD store + + E->>RO: ExternalGetReq(key) + RO->>M: GetStartReq(key) + alt memory replica exists + M-->>RO: GetStartResp(source_kind=Memory) + else SSD fallback + Note right of M: 在 
SSD owner 分配 source_allocation\n在 requester owner 分配 target allocation\n写 inflight_gets + M-->>RO: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) + RO->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) + SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + Note over SSD: pin committed entry\nproducer 按 chunk readv direct 或 scratch fallback + loop each ready chunk + SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) + SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) + end + SO->>M: GetDoneReq(get_id) + Note right of M: requester target allocation -> get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id for requester target) + SO-->>RO: SsdStageReadResp(done_holder_id, done_allocation_mode) + end + opt source_kind=Memory + RO->>RO: transfer_data_no_copy(read, src_addr -> target_addr, len) + RO->>M: GetDoneReq(get_id) + Note right of M: target allocation -> get_holding + M-->>RO: GetDoneResp(holder_id) + end + RO-->>E: ExternalGetResp(ExternalMemHolderInfo) +``` + +#### master + +master 是 `get` 的控制面 authority:`kv_routes` 决定当前 key-version 可以从哪些内存或 SSD 副本读取,`inflight_gets` 记录本次 get 的 source/target allocation,`get_holding` 记录 `GetDone` 后仍被 holder 持有的 requester target allocation。 + +```rust +pub struct MasterKvRouterInner { + // GetStart 到 GetDone / GetRevoke 期间保留的 get 在途状态。 + pub inflight_gets: moka::future::Cache, + // GetDone 后的 holder authority,键由 requester 节点和 holder_id 组成。 + pub get_holding: MasterOwnerMemMgr, + // get_start 查询的当前稳定 key-version 路由。 + pub kv_routes: DashMap>, + ... 
+} + +pub struct OneKvNodesRoutes { + // 当前稳定版本号,内存副本和 SSD 副本共享它。 + pub put_id: PutIDForAKey, + // 内存副本优先作为 get source。 + pub nodes_replicas: RwLock>, + // 内存副本不可用时才作为 SSD fallback source。 + pub ssd_replicas: RwLock>, + pub get_durable_slots_used: AtomicU32, +} + +pub struct KvSsdRouteInfo { + // 持有本地 SSD 副本的 owner。 + pub node_id: NodeID, + // 真实 payload 长度;SSD stage 和 transfer 对外只暴露这个长度。 + pub len: u64, + // 和内存 route 对齐的节点代际,用于失效判断。 + pub tomb_tag: NodeTombTag, +} + +pub struct InflightGetInfo { + // 本次 get 命中的 key-version,用于拒绝过期完成。 + pub put_id: PutIDForAKey, + // master 选中的 source 节点;SSD fallback 时是 SSD owner。 + pub src_node_id: NodeID, + // 发起 get 的 requester owner,最终 holder 归属使用它。 + pub req_node_id: NodeID, + pub len: u64, + // requester target allocation,GetDone 后进入 get_holding。 + pub allocation: Arc, + // SSD source staging allocation;memory source 路径为空。 + pub source_allocation: Option>, + pub route: Arc, + pub allocation_mode: GetAllocationMode, + // 区分 memory source 和 SSD fallback source。 + pub source_kind: GetSourceKind, +} +``` + +master 处理 `GetStartReq` 时先查 `kv_routes`,并优先选择 live 内存副本。命中内存副本时,`GetStartResp` 返回 `GetSourceKind::Memory`,requester owner 按原有 transfer 路径把数据搬到 requester target。只有没有可用内存副本时,master 才从 `ssd_replicas` 里选择 SSD owner,并同时分配两块 allocation:`source_allocation` 位于 SSD owner,用作本地读盘 staging;`allocation` 位于 requester owner,是最终进入 holder 的 target。`GetStartResp.src_addr` 是 SSD owner 本地对齐后的 staging 地址,`target_addr` 是 requester target 地址,`ssd_stage_len` 是对齐后的 source staging 容量,`len` 始终是真实 payload 长度。 + +`GetDoneReq` 到达后,master 从 `inflight_gets` 取出本次 get,把 requester target allocation 转入 `get_holding`,并返回 `holder_id`。memory source 路径由 requester owner 调用 `GetDoneReq`;SSD source 路径由 SSD owner 在全部 chunk transfer 完成后调用。无论谁发起 `GetDoneReq`,holder 都归属 `InflightGetInfo.req_node_id` 对应的 requester owner,SSD owner 的 `source_allocation` 只作为读盘 staging,不进入 `get_holding`。 + +#### owner + +owner 在 `get` 里有两个可能角色:requester owner 负责调用 master,并根据 `GetSourceKind` 选择 
memory transfer 或 SSD stage RPC;SSD owner 负责响应 `SsdStageReadReq`,读取本地 SSD,把读出的 bytes 按 chunk push 到 requester target,并在全部 chunk transfer 完成后向 master 发送 `GetDoneReq`。 + +```rust +pub struct ClientKvApiInner { + // requester owner 和 SSD owner 都通过它访问本地 SSD cache。 + ssd_storage: Option>, + // external get 返回的 holder 在 owner 侧的借用表。 + pub external_get_holding: OwnerExternalMemMgr, + rpc_caller_get_start: RPCCaller, + rpc_caller_get_done: RPCCaller, + rpc_caller_ssd_stage_read: RPCCaller, + ... +} + +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + // SSD owner 用它在全部 chunk transfer 完成后调用 master GetDoneReq。 + pub get_id: u64, + // master 在 SSD owner 上分配的 source staging 对齐地址。 + pub stage_addr: u64, + // source staging 容量,包含 O_DIRECT 对齐需要的空间。 + pub stage_len: u64, + // 最终接收数据的 requester owner。 + pub target_node_id: NodeIDString, + // requester target allocation 的绝对地址。 + pub target_addr: u64, + // 真实 payload 长度。 + pub len: u64, +} + +pub struct SsdStageReadResp { + // master GetDoneResp 的 holder_id 投影。 + pub done_holder_id: u64, + // master GetDoneResp 的 allocation_mode 投影。 + pub done_allocation_mode: GetAllocationMode, + // master GetDoneResp 的状态字段投影。 + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + // SsdStageRead RPC 自身的状态字段。 + pub error_code: ErrorCode, + pub error_json: String, +} +``` + +requester owner 收到 `GetSourceKind::Memory` 时,继续走原有内存 transfer:从 `src_addr` 读,把数据写到 `target_addr`,传输完成后由 requester owner 自己调用 master `GetDoneReq`。收到 `GetSourceKind::Ssd` 时,requester owner 不自己读 SSD,也不自己调用 `get_done`;它向 SSD owner 发起 `SsdStageReadReq`,等待 `SsdStageReadResp` 带回 master `GetDoneResp` 的 holder 字段。 + +SSD owner 收到 `SsdStageReadReq` 后,在本地执行 `load_and_push_kv_from_ssd(...)`。read producer 先 pin 当前 committed entry,再按 chunk 从 SSD shard 文件读到 `stage_addr + offset`;transfer consumer 每收到一个 `SsdLoadedChunk`,就把 `stage_addr + offset` 推到 requester 的 `target_addr + offset`。全部 chunk transfer 成功后,SSD owner 用 
`get_id` 向 master 调 `GetDoneReq`,再把返回的 `holder_id` 和 `allocation_mode` 填入 `SsdStageReadResp.done_*` 返回 requester。读路径进入 per-device reader queue,底层 `UringIoEngine` 把 read/write 分成独立发送队列,并按 inflight 比例补读,避免回填读长期排在持续写入之后。 + +```rust +struct SsdRingBuffer { + // key-version 到 Writing/Committed SSD 位置的全局索引。 + entries: HashMap, + // active read pin,防止 writer 推进 tail 覆盖正在读取的位置。 + read_pins: HashMap, + ... +} + +enum SsdEntryState { + // 已分配 offset 但 writev 尚未完成。 + Writing(SsdIndexEntry), + // writev 成功后才允许 get_start 作为 SSD source 命中。 + Committed(SsdIndexEntry), +} +``` + +`read_pins` 是 owner 本地 SSD ring 的生命周期保护,防止 writer 推进 tail 时覆盖 active read。chunk pipeline 在整个 producer 生命周期内持有同一个 read pin;每个 chunk 单独提交 read task。direct read 条件满足时,`readv` 直接写到 `SsdStageReadReq.stage_addr + offset`;否则先读 scratch aligned buffer,再复制当前 chunk 的真实 payload 长度到 staging。direct read 省掉的是 scratch buffer 到 source staging 的本机 memcpy,不省掉 `source staging -> requester target` 的 transfer。请求方 target 是否远端不影响 SSD direct read 的对齐判断。 + +#### external + +external 的状态边界只到 owner 返回的 mmap holder:它发 `ExternalGetReq` 给 requester owner,并接收 `ExternalMemHolderInfo { offset, len, holder_id }`。SSD route 由 master 管理,SSD 文件位置和 source staging 由 SSD owner 管理,external 不保存也不更新这些状态。 + +```rust +pub struct ExternalGetReq { + pub key: String, + // external 通过 owner 发起 get,req_node_id 仍指向请求方身份。 + pub req_node_id: String, + // owner 代际校验,防止过期 external 请求继续使用旧 owner。 + pub started_time: i64, +} + +pub struct ExternalGetResp { + pub error_code: ErrorCode, + pub error_json: String, + // 成功时返回 external 可见的 holder 元数据。 + pub external_memholder_info: Option, +} + +pub struct ExternalMemHolderInfo { + // external attach 到 owner mmap 后可见的 offset。 + pub offset: u64, + // 真实 payload 长度。 + pub len: u32, + // 后续 release ack 使用的 holder id。 + pub holder_id: u64, +} + +pub struct ExternalMemHolder { + pub offset: u64, + // 当前 external 进程内 mmap 后的绝对地址。 + pub addr: u64, + pub len: u32, + pub holder_id: u64, + pub key: String, + pub 
external_client_id: String, + // drop/release 时校验 owner 代际。 + pub owner_start_time: i64, + ... +} +``` + +owner 内部完成普通 `get` 后,会把 `MemoryInfo` 写入 `external_get_holding`,用这条 owner 侧引用代表 external 当前仍在借用该 holder;随后 owner 只把 `ExternalMemHolderInfo { offset, len, holder_id }` 返回给 external。external 构造 `ExternalMemHolder` 后,通过 owner mmap 的 `offset/addr` 读取结果。external holder drop 时,会向 owner 发送 `ExternalDeleteAckReq`;owner 删除 `external_get_holding` 中对应记录,释放 external 这一份引用。只有当 owner 侧不再有其它 `Arc` 引用时,`MemoryInfo` drop 才会沿用原有 owner -> master holder ack 链路释放 master `get_holding`。 + +### stage 失败和释放 + +```mermaid +sequenceDiagram + participant RO as requester owner + participant M as master + participant SO as SSD owner + + RO->>SO: SsdStageReadReq + SO-->>RO: stage error + RO->>M: GetRevokeReq(drop_ssd_source=true) + Note right of M: 查 inflight_gets\n确认 source_kind=Ssd\n删除 route.ssd_replicas[src_node_id] + alt no live replica remains + M->>M: remove kv_routes and prefix index + end +``` + +```rust +pub struct GetRevokeReq { + // 要撤销的在途 get。 + pub get_id: u64, + // 只有 SSD stage 失败时才置 true,用来删除失败的 SSD source route。 + pub drop_ssd_source: bool, +} +``` + +SSD stage 失败时,请求方调用 `get_revoke_ssd_source(...)`,也就是 `GetRevokeReq { drop_ssd_source: true }`。master 从 `inflight_gets` 找到本次 get,只有 `source_kind == GetSourceKind::Ssd` 时才会删除 `route.ssd_replicas[src_node_id]`,避免后续 get 继续选择同一个失败 SSD source。如果同一个 `OneKvNodesRoutes` 下已经没有 live 内存副本和 SSD 副本,master 再删除 `kv_routes` 并异步清理 prefix index。 + +这里的释放边界是:SSD owner 上的 `source_allocation` 只服务本次 stage,失败后随 `inflight_gets` 清理释放;requester target allocation 没有进入 `get_holding`,因此不会生成用户可见 holder。 + +## 关键代码片段 + +### put_done 只提交内存副本 + +当前实现中,`put_done` 只把内存 target allocation 写入 `nodes_replicas`。SSD 是否落盘不影响这次 `PutDone` 的可见性。 + +```rust +pub struct PutDoneReq { + pub key: String, + // 只提交这个 key-version 的内存副本。 + pub put_id: PutIDForAKey, + pub lease_id: Option, +} + +// 这里只把内存 target 写入 nodes_replicas;SSD 副本稍后独立 commit。 +one_kv_routes + .nodes_replicas + 
.write() + .insert(node_id.clone(), completed_info); +``` + +这段边界是:`nodes_replicas` 代表内存副本 ready,`get_start` 可以立即从这里返回 memory source;`ssd_replicas` 不能在这一步写入,否则 `PutDone` 会被 SSD persist 延迟拖住。SSD 副本后续用同一个 `put_id` 独立提交。 + +### SSD replica 独立 commit + +SSD owner 后台 persist 成功后,单独向 master 提交同一个 key-version 的 SSD 副本。master 必须校验当前 route 的 `put_id` 仍然匹配,避免旧版本 SSD late commit 污染新版本路由。 + +```rust +pub struct SsdReplicaCommitReq { + pub key: String, + // 必须匹配当前 route 版本,避免 SSD late commit 污染新版本。 + pub put_id: PutIDForAKey, + // 完成落盘的 SSD owner。 + pub node_id: NodeIDString, + // 真实 payload 长度。 + pub len: u64, +} + +if let Some(route) = kv_routes.get(&req.key) { + // 过期 put_id 直接丢弃,不 resurrect 旧版本。 + if route.put_id == req.put_id { + // master 只保存 SSD owner 和长度;文件 offset 留在 owner 本地 ring index。 + route.ssd_replicas.write().insert( + node_id.clone(), + KvSsdRouteInfo { + node_id: node_id.clone(), + len: req.len, + tomb_tag, + }, + ); + } +} +``` + +master 只在 `req.put_id == route.put_id` 时写 `ssd_replicas`;过期 `put_id` 的 late commit 会被丢弃,不能 resurrect 旧版本。`SsdReplicaCommitReq.len` 是真实 payload 长度;SSD shard 文件 offset 不进入 master route,只留在 target owner 本地 `SsdRingBuffer`。 + +### get_start 分配分布式 SSD staging + +SSD fallback 发生在 master 已经没有可用 `nodes_replicas` 之后。source staging 一定分配在 SSD owner 的 CPU segment 上,target allocation 一定分配在 requester 的 CPU segment 上。 + +```rust +// SSD read 使用 O_DIRECT,读长度先按 512 bytes 对齐。 +let ssd_stage_len = align_ssd_io_len(ssd_replica.len)?; +// 额外预留 511 bytes,确保 allocation 内能找到 512-byte 对齐地址。 +let source_alloc_len = ssd_stage_len + SSD_ALIGNMENT as u64 - 1; + +// source staging 放在 SSD owner 上,只服务本次读盘和 push。 +let source_allocation = allocate_get_buffer_on_node( + &view, + &ssd_replica.node_id, + source_alloc_len, + get_id, + "ssd source staging", +)?; +// target allocation 放在 requester 上,GetDone 后转成最终 holder。 +let target_allocation = allocate_get_buffer_on_node( + &view, + &req_node_id, + ssd_replica.len, + get_id, + "requesting target", +)?; + +// 返回给 SSD 
owner 的是对齐后的 staging 地址,不一定等于 allocation 起点。 +let source_addr = align_ssd_stage_addr(source_base + source_allocation.addr())?; +``` + +这里的关键边界是:`source_allocation` 在 SSD owner 上,只用于读盘 staging;`target_allocation` 在 requester owner 上,成功 `GetDone` 后进入 `get_holding`。`source_alloc_len = align_up(len, 512) + 511`,保证 allocation 内总能找到 512-byte 对齐的 `src_addr`;`src_addr` 是对齐后的 staging 地址,不一定等于 `source_allocation` 起点。 + +### requester 触发 SSD owner stage/push/done + +请求方收到 `GetSourceKind::Ssd` 后,让 SSD owner 把数据读入 `src_addr`、按 chunk push 到 `target_addr + offset`,并由 SSD owner 直接完成 master `get_done`。这里没有新增用户 API;`SsdStageReadReq` 是 owner 内部 RPC。stage RPC 成功返回时,requester target 已经可读,并且 requester 已经拿到 master done 结果;请求方跳过自己的 transfer 分支,也跳过自己的 `get_done`。 + +```rust +let mut ssd_done_resp = None; +if resp.source_kind == GetSourceKind::Ssd { + // SSD owner 负责读盘、push chunk,并在完成后调用 master GetDoneReq。 + let done_resp = self.stage_kv_from_ssd_source( + &resp.node_id, + key, + put_id, + get_id, + resp.src_addr, + resp.target_addr, + data_len as u64, + resp.ssd_stage_len, + ) + .await?; + ssd_done_resp = Some(done_resp); +} + +if resp.source_kind == GetSourceKind::Ssd { + // SSD owner 已经把全部 chunk push 到 target_addr,并完成 get_done。 +} else { + // memory source 路径仍由 requester 自己做 transfer。 + self.view.client_transfer_engine() + .transfer_data_no_copy(peer_id, true, resp.src_addr, resp.target_addr, len, None) + .await?; +} + +let done_resp = if let Some(done_resp) = ssd_done_resp { + // SSD source 路径复用 SsdStageReadResp 带回的 GetDoneResp 字段。 + done_resp +} else { + // memory source 路径的 GetDoneReq 仍由 requester 发送。 + self.get_done(get_id).await? 
+}; +``` + +SSD source 路径里,`stage_kv_from_ssd_source(...)` 成功返回时,SSD owner 已经完成读盘、chunk transfer 和 master `GetDoneReq`。requester 因此跳过自己的 transfer 和 `get_done`,直接复用 `SsdStageReadResp.done_*` 构造 holder。memory source 路径仍由 requester 自己 transfer 并调用 `get_done`。 + +### SSD chunk read 与 direct/scratch fallback + +当前实现只有 SSD 回填读路径会把 payload 切成 chunk;SSD 写入按 value 级别一次写入一个 device 的一个 shard 连续 offset。`SsdLoadedChunk` 是 read producer 交给 transfer consumer 的最小就绪单元;`ReadCommand` 记录本次 chunk 要读的 committed entry、shard 文件 offset 和读入目标。 + +```rust +pub(crate) struct SsdLoadedChunk { + // 当前 chunk 在完整 payload 中的偏移。 + pub offset: u64, + // 当前 chunk 在 SSD owner source staging 中的起始地址。 + pub stage_addr: u64, + // 当前 chunk 的真实 payload 长度,不包含 O_DIRECT padding。 + pub len: u64, +} + +struct ReadCommand { + key: KvSsdKey, + // 已 pin 的 committed entry,里面包含 shard_id、file_offset 和长度。 + entry: SsdIndexEntry, + // 这次 chunk read 在 SSD shard 文件内的起始 offset。 + file_offset: u64, + // Direct 表示直接读入 staging,Scratch 表示先读入 aligned buffer。 + target: ReadTarget, + // 持有 read pin,防止 writer 在读完成前覆盖该位置。 + _read_pin: Option, + done_tx: oneshot::Sender>, +} +``` + +`load_and_push_kv_from_ssd(...)` 把 SSD read 和 transfer 做成流水线:producer 按 chunk 提交 SSD read,并把读好的 chunk 放入 ready queue;consumer 收到 ready chunk 后立即 push 到 requester target。读和传输可以重叠,多个 chunk 可以同时处于 read inflight 或 transfer inflight 状态。 + +```rust +// ready queue 让 read producer 和 transfer consumer 解耦。 +let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT.saturating_mul(2).max(1), +); + +// producer 按 chunk 从 SSD shard 文件读入 source staging。 +let producer = store.load_into_addr_chunks( + key, + put_id, + stage_addr, + len, + stage_len, + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + chunk_tx, +); +// consumer 收到 ready chunk 后立即 push 到 requester target。 +let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); +// 两个 future 并发执行,形成 read-transfer 
pipeline。 +let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); +``` + +`load_into_addr_chunks(...)` 先 pin 当前 committed entry,pin 生命周期覆盖整个 producer。每个 chunk 用 `entry.file_offset + offset` 定位 SSD shard 文件中的读取位置,并根据 staging 地址、文件 offset 和 staging 容量选择 direct 或 scratch;chunk read 完成后立即发送 `SsdLoadedChunk`。 + +```rust +// pin 生命周期覆盖整个 producer,writer 不能覆盖 active read 位置。 +let (entry, _read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { key: key.key.clone() })); + }; + (entry, SsdReadPin { ... }) +}; + +// 每个 chunk 在同一个 committed entry 内推进文件 offset。 +let file_offset = entry.file_offset + offset; +let target = match choose_chunk_read_path(stage_addr, read_len, target_len, file_offset) { + // staging 地址、文件 offset 和 IO 长度都满足对齐时走 direct read。 + SsdReadPath::Direct => ReadTarget::Direct { + target_addr: stage_addr, + len: read_len as usize, + }, + // 否则先读到 aligned scratch buffer。 + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len as usize)?), +}; + +// submit_read_command 根据 entry.shard_id 进入对应 device reader queue。 +let output = submit_read_command(key, entry, file_offset, target, None).await?; +if let ReadOutput::Scratch(buffer) = output { + // scratch 路径只把真实 payload 长度复制到 staging。 + copy_payload_to_stage(buffer, stage_addr, payload_len)?; +} +// 下游 transfer 只看到真实 payload 长度。 +ready_tx.send(SsdLoadedChunk { offset, stage_addr, len: payload_len }).await?; +``` + +direct 路径把 `readv` 的目标直接设为当前 chunk 的 source staging;scratch 路径先读入 aligned buffer,再只复制当前 chunk 的真实 payload 长度到 staging。两条路径最后都只把真实 payload 长度暴露给 transfer 和 `MemHolder`,不会把 `O_DIRECT` padding 暴露给用户。 + +## IO 模型 + +```mermaid +flowchart TD + A["large_file_paths"] --> B["derive SSD roots"] + B --> C["create root dirs + metadata.dev()"] + C --> D["deduplicate device roots"] + D --> E0["SsdDeviceWorker device 0"] + D --> E1["SsdDeviceWorker device 1"] + E0 --> F0["shard_ids: 
0,2,..."] + E1 --> F1["shard_ids: 1,3,..."] + F0 --> G0["device 0 writer/read queues"] + F1 --> G1["device 1 writer/read queues"] + G0 --> H0["device 0 UringIoEngine"] + G1 --> H1["device 1 UringIoEngine"] + I["persist_buffer"] --> J["next_write_device round-robin"] + J --> G0 + J --> G1 + K["submit_read_command(entry.shard_id)"] --> L["shard_to_device"] + L --> G0 + L --> G1 +``` + +| 组件 | 设计 | +| --- | --- | +| device root | owner 从 `large_file_paths` 派生 SSD cache root;创建目录后用 `metadata.dev()` 判断真实 device,同一 device 只保留一个有效 root。 | +| shard 文件 | `max_bytes` 是 owner 本地 SSD cache 的容量上限;容量被拆成多个 shard 文件,分布到有效 device root 的 `shards/` 下。`shard_to_device` 记录每个 shard 属于哪个 device。 | +| 写入选路 | `persist_buffer` 用 `next_write_device` round-robin 选择一个 device;一个 payload 只进入这个 device 的 writer queue,并在该 device 的某个 shard 中分配一段连续 `file_offset`。 | +| 读取选路 | committed entry 保存 `shard_id` 和 `file_offset`;读 chunk 时通过 `entry.shard_id -> shard_to_device` 找到 device reader queue,再从对应 shard 文件的 `file_offset + offset` 读取。 | +| per-device worker | 每个有效 device 有独立 writer queue、reader queue 和 `UringIoEngine`;这些 worker 只处理本 device 的 shard 文件 IO。 | +| 对齐与回收 | SSD shard 使用 `O_DIRECT`,要求地址、长度和文件 offset 512-byte 对齐;不满足 direct 条件的读 chunk 走 scratch buffer。ring head/tail 和 read pin 只在 owner 本地保护 shard 文件位置,不进入 master route。 | + +## Task / Actor / 独立线程 + +这一节只列运行时执行单元,不再重复 device/shard 选路。SSD 没有新增独立的 master route actor;控制面仍由原有 master/owner RPC handler 承载。新增的后台执行主要在 owner 本地:每个有效 device 有 writer task、reader task 和对应的 `UringIoEngine` 后台线程。 + +### owner 本地 SSD IO 执行单元 + +| 执行单元 | 创建位置 | 类型 | 输入 | 职责 | +| --- | --- | --- | --- | --- | +| `ssd_writer_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.write_tx` | 从 `persist_from_addr` 接收写任务,只在本 device 的 `shard_ids` 内调用 `SsdRingBuffer::prepare_write_on_shards`,提交 `writev`,完成后 `commit(Writing -> Committed)`。 | +| `ssd_reader_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | 
`SsdDeviceWorker.read_tx` | 从 `load_into_addr_chunks` 接收属于本 device shard 的 chunk 读任务,提交 direct/scratch `readv`,校验 offset 仍有效,完成后回传 chunk 读结果;整条 producer 完成后释放 `SsdReadPin`。 | +| `fluxon-kv-ssd-uring-{idx}` | 每个 device 的 `UringIoEngine::new_multi` | `std::thread::spawn` | `read_rx/write_rx: crossbeam::channel` | 每个线程持有一个 `IoUring`,只提交本 device shard 文件的 `Readv/Writev` SQE,并按 read/write inflight 比例调度后回传 CQE。 | + +`KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 shard fd 和 `UringIoEngine`,保证 fd 与 uring 线程生命周期覆盖 writer/reader task;drop 时关闭 channel 并 join uring 线程。 + +### 控制面 RPC / 清理任务 + +| 执行单元 | 创建位置 | 类型 | 输入 | 职责 | +| --- | --- | --- | --- | --- | +| `rpc_ssd_replica_commit` | `MasterKvRouter` RPC handler 注册 | `view.spawn(...)` | `SsdReplicaCommitReq` | owner SSD persist 成功后提交 SSD 副本,master 校验 `put_id` 后写 `ssd_replicas`。 | +| `rpc_ssd_stage_read` | `ClientKvApi` RPC handler 注册 | `view.spawn(...)` | `SsdStageReadReq` | 远端 SSD owner 收到 stage 请求后,在 owner 进程内调用 `load_and_push_kv_from_ssd(...)`;SSD read producer 和 transfer consumer 流水线完成后,再调用 master `get_done` 并回传 done fields。 | +| `ssd_failure_remove_prefix_index` | `get_revoke(drop_ssd_source=true)` | `view.spawn(...)` | 失败 SSD source 的 key | 当失败 SSD source 是最后一个 live replica 时,异步删除 prefix index。 | + +SSD route 的权威更新点仍是原有 master RPC handler: + +- `PutDone`:同步更新 `nodes_replicas`,让内存副本立即可读。 +- `SsdReplicaCommit`:SSD persist 完成后同步更新 `ssd_replicas`,并拒绝过期 `put_id`。 +- `GetStart`:同步选择内存副本或 SSD 副本,并写入 `inflight_gets`。 +- `GetRevoke`:同步删除失败 SSD source;必要时触发 prefix index 小任务。 +- `Delete` / 覆盖写失效:复用原有 `delete_broadcast` 管线。 + +## 不变量 + +- `nodes_replicas` 和 `ssd_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`,不能跨版本复用。 +- `PutDoneReq` 只表示内存副本 ready;SSD 副本只能由匹配当前 `put_id` 的 `SsdReplicaCommitReq` 补充进 `ssd_replicas`。 +- `SsdReplicaCommitReq` 是内部控制面 RPC,不改变用户侧 `put/get/delete` API。 +- `GetSourceKind::Ssd` 必须同时有 SSD owner source staging 和 requester target allocation;成功后只有 requester target allocation 进入 `get_holding`。 +- SSD 回填失败必须通过 
`get_revoke(drop_ssd_source=true)` 清理 in-flight get,并从 master 路由里移除失败的 SSD 副本。 +- master 路由被删除后,旧 SSD bytes 即使还在 shard 文件里,也不能被公共 `get` 命中。 + +## 关键结论 + +这套实现把 SSD 作为内存 KV 之外的可回填数据源副本,而不是新增一套用户 API。写入侧先完成内存 `PutDone`,再由 target owner 异步落 SSD,并通过 `SsdReplicaCommitReq` 补充 SSD route;读取侧优先使用内存副本,内存副本不可用时由 SSD owner 从本地 shard 文件读出数据,按 chunk push 到 requester target,再复用原有 `get_done` 和 holder 生命周期。 + +因此,SSD 相关的 shard ring、`O_DIRECT`、`io_uring`、read pin 和 read/transfer pipeline 都限制在 owner 本地实现内;master 只保存这个 key-version 有哪些 owner 持有 SSD 副本,以及 value 的真实 payload 长度,不保存 SSD 文件 offset、shard_id 或本地 ring 状态。 diff --git a/fluxon_rs/Cargo.lock b/fluxon_rs/Cargo.lock index a4b0ecd..3e8638a 100644 --- a/fluxon_rs/Cargo.lock +++ b/fluxon_rs/Cargo.lock @@ -1237,6 +1237,7 @@ dependencies = [ "hyper 0.14.32", "iceoryx2", "iceoryx2-cal", + "io-uring", "kanal", "lazy_static", "libc", @@ -2395,6 +2396,17 @@ dependencies = [ "str_stack", ] +[[package]] +name = "io-uring" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9080b15e63775b9a2ac7dca720f7050a8b955e092ea0f6020a4a80f69998cdc0" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" diff --git a/fluxon_rs/fluxon_kv/Cargo.toml b/fluxon_rs/fluxon_kv/Cargo.toml index 22ff136..8208216 100644 --- a/fluxon_rs/fluxon_kv/Cargo.toml +++ b/fluxon_rs/fluxon_kv/Cargo.toml @@ -75,6 +75,7 @@ bytes = "1" pprof = { version = "0.15", features = ["flamegraph"] } hex = "0.4" sha2 = "0.10" +io-uring = "0.7" tokio-tungstenite = { version = "0.21", default-features = false, features = ["connect", "handshake"], optional = true } sockudo-ws = { version = "^1.7.4", default-features = false, features = ["tokio-runtime", "fastrand"], optional = true } diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs index f309dd0..29da3f8 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs +++ 
b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs @@ -13,7 +13,7 @@ use crate::{ cluster_manager::NodeID, master_kv_router::msg_pack::{ GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq, - GetStartReq, GetStartResp, + GetSourceKind, GetStartReq, GetStartResp, }, p2p::msg_pack::MsgPack, rpcresp_kvresult_convert::msg_and_error::codes_api, @@ -26,19 +26,27 @@ use std::sync::Arc; pub struct RemoteGetInfo { get_id: u64, data_len: usize, + source_kind: GetSourceKind, src_addr: u64, target_addr: u64, node_id: NodeID, peer_is_src_or_target: bool, } +impl RemoteGetInfo { + pub fn source_kind(&self) -> GetSourceKind { + self.source_kind + } +} + impl std::fmt::Display for RemoteGetInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "GetInfo{{ get_id: {}, data_len: {} bytes, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}", + "GetInfo{{ get_id: {}, data_len: {} bytes, source_kind: {:?}, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}", self.get_id, self.data_len, + self.source_kind, self.src_addr, self.target_addr, self.node_id, @@ -177,8 +185,80 @@ impl ClientKvApiInner { ); } + let mut ssd_done_resp = None; + if resp.source_kind == GetSourceKind::Ssd { + let ssd_stage_len = resp.ssd_stage_len; + if ssd_stage_len < data_len as u64 { + #[cfg(test)] + { + self.test_record.remove_transfering_get(get_id); + } + + self.get_revoke(get_id).await?; + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "invalid ssd stage len for key={} get_id={} data_len={} ssd_stage_len={}", + key, get_id, data_len, ssd_stage_len + ), + })); + } + let done_resp = match self + .stage_kv_from_ssd_source( + &resp.node_id, + key, + put_id, + get_id, + abs_src, + abs_target, + data_len as u64, + ssd_stage_len, + ) + .await + { + Ok(done_resp) => done_resp, + Err(err) => { + tracing::warn!( + "kv get ssd stage failed: key={}, source_node={}, stage={:#x}, 
target={:#x}, len={}, ssd_stage_len={}, err={}", + key, + resp.node_id, + abs_src, + abs_target, + data_len, + ssd_stage_len, + err + ); + + #[cfg(test)] + { + self.test_record.remove_transfering_get(get_id); + } + + obe_get_transfer_error(&metrics, &client_id, &node_role, key, data_len as u64); + self.get_revoke_ssd_source(get_id).await?; + return Err(err); + } + }; + ssd_done_resp = Some(done_resp); + tracing::debug!( + "kv get ssd staged and pushed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}", + key, + resp.node_id, + abs_src, + abs_target, + data_len, + ssd_stage_len + ); + } + // transfer data (skip if local and src==target to avoid redundant copy) - if peer_id.is_none() && abs_src == abs_target { + if resp.source_kind == GetSourceKind::Ssd { + tracing::debug!( + "kv get ssd owner push complete: key={}, target={:#x}, len={} (skip requester transfer)", + key, + abs_target, + data_len + ); + } else if peer_id.is_none() && abs_src == abs_target { tracing::debug!( "kv get local no-op: src==target {:#x}, len={} (skip transfer)", abs_target, @@ -249,12 +329,17 @@ impl ClientKvApiInner { // Removed post-transfer zero-header verification per request. - // Complete the get operation and get holder_id - let done_resp = match self.get_done(get_id).await { - Ok(resp) => resp, - Err(err) => { - obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64); - return Err(err); + // Complete the get operation and get holder_id. SSD source already called + // get_done after pushing into the requester target. 
+ let done_resp = if let Some(done_resp) = ssd_done_resp { + done_resp + } else { + match self.get_done(get_id).await { + Ok(resp) => resp, + Err(err) => { + obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64); + return Err(err); + } } }; let end_handle_us = done_resp.server_process_us; @@ -326,6 +411,7 @@ impl ClientKvApiInner { let get_info = RemoteGetInfo { get_id, data_len, + source_kind: resp.source_kind, src_addr: abs_src, target_addr: abs_target, node_id: resp.node_id.into(), @@ -435,8 +521,19 @@ impl ClientKvApiInner { /// 撤销 Get 操作,释放已分配的资源 pub async fn get_revoke(&self, get_id: u64) -> KvResult<()> { + self.get_revoke_inner(get_id, false).await + } + + async fn get_revoke_ssd_source(&self, get_id: u64) -> KvResult<()> { + self.get_revoke_inner(get_id, true).await + } + + async fn get_revoke_inner(&self, get_id: u64, drop_ssd_source: bool) -> KvResult<()> { let req = MsgPack { - serialize_part: GetRevokeReq { get_id }, + serialize_part: GetRevokeReq { + get_id, + drop_ssd_source, + }, raw_bytes: Vec::new(), }; diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs index dec19f5..bd4655b 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs @@ -3,11 +3,17 @@ use crate::client_kv_api::msg_pack::{ ExternalDeleteAckReq, ExternalDeleteAckResp, ExternalDeleteReq, ExternalDeleteResp, ExternalGetReq, ExternalGetResp, ExternalIsExistReq, ExternalIsExistResp, ExternalPutCommitReq, ExternalPutCommitResp, ExternalPutRevokeReq, ExternalPutRevokeResp, ExternalPutStartReq, - ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, SyncKvToFileReq, - SyncKvToFileResp, TestPutPhaseTrace, + ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, + SsdReplicaPersistReq, SsdReplicaPersistResp, SsdStageReadReq, SsdStageReadResp, + SyncKvToFileReq, SyncKvToFileResp, TestPutPhaseTrace, }; use 
crate::cluster_manager::NodeIDString; +use crate::cluster_manager::app_logic_ext::ClusterManagerAppLogicExt; use crate::config::TestSpecConfig; +use crate::kv_ssd_storage::{ + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + KvSsdStorage, KvSsdStorageInit, SsdLoadedChunk, +}; use crate::master_kv_router::msg_pack::{ BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, DeleteClientKvMetaCacheItem, }; @@ -22,8 +28,8 @@ use crate::{ client_transfer_engine::{ClientTransferEngine, ClientTransferEngineAccessTrait}, cluster_manager::{ClusterEvent, ClusterManager, ClusterManagerAccessTrait}, master_kv_router::msg_pack::{ - DeleteReq, GetDoneReq, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, PutRevokeReq, - PutStartReq, + DeleteReq, GetDoneReq, GetDoneResp, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, + PutRevokeReq, PutStartReq, SsdReplicaCommitReq, }, metric_reporter::{MetricReporter, MetricReporterAccessTrait}, metrics::{MetricsHandle, OperationKind, RequestStage}, @@ -37,6 +43,7 @@ use async_trait::async_trait; use dashmap::DashMap; use fluxon_framework::{LogicalModule, define_module}; use fluxon_util::map_lock::AMapLock; +use futures::stream::{FuturesUnordered, StreamExt}; use limit_thirdparty::tokio; use parking_lot::Mutex; use std::sync::Weak; @@ -451,6 +458,89 @@ async fn handle_external_put_revoke( } } +async fn handle_ssd_stage_read( + view: &ClientKvApiView, + msg: &MsgPack, +) -> MsgPack { + let req = msg.serialize_part.clone(); + let inner = view.client_kv_api().inner(); + let done_resp = match inner + .load_and_push_kv_from_ssd( + &req.key, + req.put_id, + req.stage_addr, + req.stage_len, + &req.target_node_id, + req.target_addr, + req.len, + ) + .await + { + Ok(()) => inner.get_done(req.get_id).await, + Err(err) => Err(err), + }; + + match done_resp { + Ok(done_resp) => MsgPack { + serialize_part: SsdStageReadResp { + done_holder_id: done_resp.holder_id, + done_allocation_mode: done_resp.allocation_mode, + 
done_error_code: done_resp.error_code, + done_error_json: done_resp.error_json, + done_server_process_us: done_resp.server_process_us, + error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + }, + Err(err) => MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }, + } +} + +async fn handle_ssd_replica_persist( + view: &ClientKvApiView, + msg: &MsgPack, +) -> MsgPack { + let req = msg.serialize_part.clone(); + let inner = view.client_kv_api().inner(); + let persisted = match inner + .persist_local_kv_to_ssd(&req.key, req.put_id, req.target_addr, req.len) + .await + { + Ok(persisted) => persisted, + Err(err) => { + return MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }; + } + }; + + if persisted { + if let Err(err) = inner + .commit_ssd_replica_to_master(&req.key, req.put_id, req.len) + .await + { + return MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }; + } + } + + MsgPack { + serialize_part: SsdReplicaPersistResp { + persisted, + error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + } +} + async fn handle_external_delete_ack( view: &ClientKvApiView, msg: &MsgPack, @@ -729,6 +819,7 @@ define_module!( #[derive(Clone, Debug)] pub struct ClientKvApiNewArg { pub test_spec_config: TestSpecConfig, + pub ssd_storage: Option, } pub struct ClientKvApi(ClientKvApiInner); @@ -775,6 +866,7 @@ impl std::ops::Deref for ClientKvApiViewHolder { pub struct ClientKvApiInner { view: ClientKvApiViewHolder, test_spec_config: TestSpecConfig, + ssd_storage: Option>, metrics: OnceLock>, /// make sure each remote kv get run in order @@ -818,6 +910,8 @@ pub struct ClientKvApiInner { rpc_caller_external_put_commit: RPCCaller, 
rpc_caller_external_put_revoke: RPCCaller, rpc_caller_resolve_side_transfer_lane: RPCCaller, + rpc_caller_ssd_stage_read: RPCCaller, + rpc_caller_ssd_replica_commit: RPCCaller, /// Default lease id recorded for inspection/convenience, but NOT auto-applied. /// Callers must explicitly pass `Some(lease_id)` to attach a put to a lease. @@ -900,6 +994,222 @@ impl ClientKvApiInner { pub(crate) fn skip_put_end_commit_enabled(&self) -> bool { self.test_spec_config.skip_put_end_commit } + + pub(crate) async fn persist_local_kv_to_ssd( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + abs_addr: u64, + len: u64, + ) -> KvResult { + let Some(store) = self.ssd_storage.as_ref() else { + return Ok(false); + }; + store.persist_from_addr(key, put_id, abs_addr, len).await?; + Ok(true) + } + + pub(crate) async fn commit_ssd_replica_to_master( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + len: u64, + ) -> KvResult<()> { + let node_id = self.view.cluster_manager().get_self_info().id.clone(); + let req = MsgPack { + serialize_part: SsdReplicaCommitReq { + key: key.to_string(), + put_id, + node_id, + len, + }, + raw_bytes: Vec::new(), + }; + let master_node_id = self + .view + .cluster_manager() + .find_or_wait_master_node() + .await?; + let resp = self + .rpc_caller_ssd_replica_commit + .call( + self.view.p2p_module(), + master_node_id.into(), + req, + Some(Duration::from_secs(60)), + 2, + ) + .await + .map_err(KvError::from)?; + crate::rpcresp_kvresult_convert::try_from_code( + resp.serialize_part.error_code, + resp.serialize_part.error_json, + ) + } + + pub(crate) async fn load_and_push_kv_from_ssd( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + stage_addr: u64, + stage_len: u64, + target_node_id: &NodeIDString, + target_addr: u64, + len: u64, + ) -> KvResult<()> { + let Some(store) = self.ssd_storage.as_ref() else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd 
storage is not enabled on this owner".to_string(), + })); + }; + + let self_node_id = &self.view.cluster_manager().get_self_info().id; + let peer_id = if target_node_id == self_node_id { + None + } else { + Some(target_node_id.clone()) + }; + let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT + .saturating_mul(2) + .max(1), + ); + let producer = store.load_into_addr_chunks( + key, + put_id, + stage_addr, + len, + stage_len, + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + chunk_tx, + ); + let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); + let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); + match (producer_res, consumer_res) { + (Ok(()), Ok(())) => Ok(()), + (_, Err(err)) => Err(err), + (Err(err), _) => Err(err), + } + } + + async fn transfer_loaded_ssd_chunks( + &self, + peer_id: Option, + target_addr: u64, + mut chunk_rx: ::tokio::sync::mpsc::Receiver, + ) -> KvResult<()> { + let mut inflight = FuturesUnordered::new(); + let mut rx_open = true; + + loop { + tokio::select! 
{ + maybe_chunk = chunk_rx.recv(), if rx_open && inflight.len() < DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT => { + match maybe_chunk { + Some(chunk) => { + let chunk_target_addr = target_addr.checked_add(chunk.offset).ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd transfer target addr overflow: target_addr={:#x} offset={}", + target_addr, + chunk.offset + ), + }) + })?; + let transfer_engine = self.view.client_transfer_engine(); + let peer_id = peer_id.clone(); + inflight.push(async move { + transfer_engine + .transfer_data_no_copy( + peer_id, + false, + chunk.stage_addr, + chunk_target_addr, + chunk.len, + None, + ) + .await?; + Ok::<(), KvError>(()) + }); + } + None => { + rx_open = false; + } + } + } + Some(result) = inflight.next(), if !inflight.is_empty() => { + result?; + } + else => { + if !rx_open && inflight.is_empty() { + break; + } + } + } + } + Ok(()) + } + + pub(crate) async fn stage_kv_from_ssd_source( + &self, + source_node_id: &NodeIDString, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + get_id: u64, + stage_addr: u64, + target_addr: u64, + len: u64, + stage_len: u64, + ) -> KvResult { + let self_node_id = self.view.cluster_manager().get_self_info().id.clone(); + if source_node_id == &self_node_id { + self.load_and_push_kv_from_ssd( + key, + put_id, + stage_addr, + stage_len, + &self_node_id, + target_addr, + len, + ) + .await?; + return self.get_done(get_id).await; + } + + let req = MsgPack { + serialize_part: SsdStageReadReq { + key: key.to_string(), + put_id, + get_id, + stage_addr, + stage_len, + target_node_id: self_node_id, + target_addr, + len, + }, + raw_bytes: Vec::new(), + }; + let resp = self + .rpc_caller_ssd_stage_read + .call( + self.view.p2p_module(), + source_node_id.clone().into(), + req, + Some(Duration::from_secs(60)), + 0, + ) + .await + .map_err(KvError::from)?; + let resp = resp.serialize_part; + crate::rpcresp_kvresult_convert::try_from_code(resp.error_code, 
resp.error_json)?; + Ok(GetDoneResp { + holder_id: resp.done_holder_id, + allocation_mode: resp.done_allocation_mode, + error_code: resp.done_error_code, + error_json: resp.done_error_json, + server_process_us: resp.done_server_process_us, + }) + } } #[derive(Debug, Clone)] @@ -1518,10 +1828,16 @@ impl ClientKvApi { pub async fn construct(arg: ClientKvApiNewArg) -> Result { tracing::info!("Constructing ClientKvApi in Client mode (PreView)"); + let ssd_storage = arg + .ssd_storage + .map(KvSsdStorage::new) + .transpose()? + .map(Arc::new); let inner = ClientKvApiInner { view: ClientKvApiViewHolder::new(), test_spec_config: arg.test_spec_config, + ssd_storage, metrics: OnceLock::new(), all_memholder_refcount: OnceLock::new(), get_remote_kv_lock: AMapLock::new(Duration::from_secs(60)), @@ -1554,6 +1870,8 @@ impl ClientKvApi { rpc_caller_external_put_commit: RPCCaller::new(), rpc_caller_external_put_revoke: RPCCaller::new(), rpc_caller_resolve_side_transfer_lane: RPCCaller::new(), + rpc_caller_ssd_stage_read: RPCCaller::new(), + rpc_caller_ssd_replica_commit: RPCCaller::new(), default_lease_id: parking_lot::RwLock::new(None), }; Ok(Self(inner)) @@ -1587,6 +1905,12 @@ impl ClientKvApi { inner .rpc_caller_resolve_side_transfer_lane .regist(inner.view.p2p_module()); + inner + .rpc_caller_ssd_stage_read + .regist(inner.view.p2p_module()); + inner + .rpc_caller_ssd_replica_commit + .regist(inner.view.p2p_module()); crate::key_prefix::init_for_p2p_owner(inner.view.p2p_module()); crate::kvlease::init_for_p2p_owner(inner.view.p2p_module()); // Register master-only metric RPC callers @@ -1686,6 +2010,31 @@ impl ClientKvApi { }, ); + let view_ext = inner.view.clone_view(); + RPCHandler::::new().regist(inner.view.p2p_module(), move |resp, msg| { + let view = view_ext.clone(); + let view_task = view.clone(); + let _ = view.spawn("rpc_ssd_stage_read", async move { + let result = handle_ssd_stage_read(&view_task, &msg).await; + let _ = resp.send_resp(result).await; + }); + Ok(()) + 
}); + + let view_ext = inner.view.clone_view(); + RPCHandler::::new().regist( + inner.view.p2p_module(), + move |resp, msg| { + let view = view_ext.clone(); + let view_task = view.clone(); + let _ = view.spawn("rpc_ssd_replica_persist", async move { + let result = handle_ssd_replica_persist(&view_task, &msg).await; + let _ = resp.send_resp(result).await; + }); + Ok(()) + }, + ); + let view_ext = inner.view.clone_view(); RPCHandler::::new().regist(inner.view.p2p_module(), move |resp, msg| { let view = view_ext.clone(); diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs index 55f0970..bae5437 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs @@ -1,8 +1,10 @@ +use crate::master_kv_router::msg_pack::GetAllocationMode; use crate::master_kv_router::put::PutIDForAKey; use crate::p2p::msg_pack::{MsgPackSerializePart, RPCReq}; use crate::rpcresp_kvresult_convert::msg_and_error::ErrorCode; use bitcode::{Decode, Encode}; +use crate::cluster_manager::NodeIDString; use crate::memholder::ExternalMemHolderInfo; #[derive(Default, Debug, Clone, Encode, Decode)] @@ -89,6 +91,76 @@ impl MsgPackSerializePart for ExternalGetResp { } } +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +impl MsgPackSerializePart for SsdStageReadReq { + fn msg_id(&self) -> u32 { + 4020 + } +} + +impl RPCReq for SsdStageReadReq { + type Resp = SsdStageReadResp; +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub 
error_json: String, +} + +impl MsgPackSerializePart for SsdStageReadResp { + fn msg_id(&self) -> u32 { + 4021 + } +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaPersistReq { + pub key: String, + pub put_id: PutIDForAKey, + pub target_addr: u64, + pub len: u64, +} + +impl MsgPackSerializePart for SsdReplicaPersistReq { + fn msg_id(&self) -> u32 { + 4022 + } +} + +impl RPCReq for SsdReplicaPersistReq { + type Resp = SsdReplicaPersistResp; +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaPersistResp { + pub persisted: bool, + pub error_code: ErrorCode, + pub error_json: String, +} + +impl MsgPackSerializePart for SsdReplicaPersistResp { + fn msg_id(&self) -> u32 { + 4023 + } +} + // #[derive(Default, Debug, Clone, Encode, Decode)] // pub struct ExternalPutReq { // pub key: String, diff --git a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs index 1aa6954..8c7cc78 100644 --- a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs @@ -237,10 +237,7 @@ impl ClientSegPool { std::path::Path::new(share_mem_path).join(SIDE_TRANSFER_PEERS_DIRNAME) } - pub fn side_transfer_peer_file_path( - share_mem_path: &str, - side_id: &str, - ) -> std::path::PathBuf { + pub fn side_transfer_peer_file_path(share_mem_path: &str, side_id: &str) -> std::path::PathBuf { Self::side_transfer_peers_dir(share_mem_path).join(format!("{side_id}.json")) } @@ -399,17 +396,13 @@ impl ClientSegPool { crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { path: String::new(), len: map_len as u64, - detail: "share_mem_path is empty; explicit configuration required" - .to_string(), + detail: "share_mem_path is empty; explicit configuration required".to_string(), }, )); } let base_path = &share_mem_path; - tracing::info!( - "Using share_mem_path: {} for memory-mapped file", - base_path - ); + tracing::info!("Using share_mem_path: {} 
for memory-mapped file", base_path); std::fs::create_dir_all(base_path).map_err(|e| { KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { diff --git a/fluxon_rs/fluxon_kv/src/config.rs b/fluxon_rs/fluxon_kv/src/config.rs index f9c7691..02f6e3f 100644 --- a/fluxon_rs/fluxon_kv/src/config.rs +++ b/fluxon_rs/fluxon_kv/src/config.rs @@ -581,6 +581,8 @@ pub struct FluxonKvSpecYaml { #[serde(skip_serializing_if = "Option::is_none")] pub large_file_paths: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub ssd_storage: Option>, + #[serde(skip_serializing_if = "Option::is_none")] pub p2p_listen_port: Option, #[serde(skip_serializing_if = "Option::is_none")] pub redis_compat: Option>, @@ -592,6 +594,17 @@ pub struct FluxonKvSpecYaml { #[serde(transparent)] pub struct LargeFilePathsYaml(pub Vec); +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct KvSsdStorageConfigYaml { + pub max_bytes: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct KvSsdStorageConfig { + pub max_bytes: u64, +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RedisCompatConfigYaml { @@ -682,6 +695,34 @@ impl LargeFilePaths { .into_kverror()) } + fn resolve_all_usable_root_subdirs( + &self, + relative_dir: &Path, + target_name: &str, + ) -> KvResult> { + self.require_configured_paths()?; + let mut out = Vec::new(); + let mut errors = Vec::new(); + for root in &self.paths { + let candidate = Path::new(root).join(relative_dir); + match fs::create_dir_all(&candidate) { + Ok(()) => out.push(candidate), + Err(err) => errors.push(format!("{} ({})", candidate.display(), err)), + } + } + if out.is_empty() { + return Err(ConfigError::InvalidClientConfig { + detail: format!( + "large_file_paths contains no usable root for {}; tried: {}", + target_name, + errors.join(", ") + ), + } + .into_kverror()); + } + Ok(out) + } + pub fn kv_logs_dir(&self, 
cluster_name: &str) -> KvResult { let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_kv_logs")); self.resolve_preferred_root_subdir(&relative_dir, "kv logs") @@ -714,6 +755,18 @@ impl LargeFilePaths { "fluxon fs disk cache", ) } + + pub fn kv_ssd_storage_dirs( + &self, + cluster_name: &str, + instance_key: &str, + ) -> KvResult> { + let relative_dir = PathBuf::from(format!( + "{cluster_name}_cluster_kv_ssd_storage/{}", + crate::kv_ssd_storage::safe_path_component(instance_key) + )); + self.resolve_all_usable_root_subdirs(&relative_dir, "kv ssd storage") + } } /// KV client backend types supported by the system @@ -733,8 +786,9 @@ pub struct ClientConfig { pub pprof_duration_seconds: Option, pub redis_compat_listen_addr: Option, pub fluxonkv_spec: FluxonKvSpec, - pub share_mem_path: String, // Mandatory shared bundle path + pub share_mem_path: String, // Mandatory shared bundle path pub large_file_paths: LargeFilePaths, // Mandatory large-file roots for logs and caches + pub ssd_storage: Option, pub test_spec_config: TestSpecConfig, } @@ -1028,6 +1082,13 @@ impl ClientConfigYaml { } .into_kverror()); } + if self.fluxonkv_spec.ssd_storage.is_some() { + return Err(ConfigError::InvalidClientConfig { + detail: "fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode" + .to_string(), + } + .into_kverror()); + } } // Preserve historical behavior for configs that omit `protocol`, but allow @@ -1170,13 +1231,15 @@ impl ClientConfigYaml { } else { let Some(large_file_paths_yaml) = self.fluxonkv_spec.large_file_paths.as_ref() else { return Err(ConfigError::InvalidClientConfig { - detail: "fluxonkv_spec.large_file_paths is required for owner mode" - .to_string(), + detail: "fluxonkv_spec.large_file_paths is required for owner mode".to_string(), } .into_kverror()); }; LargeFilePaths { - paths: verify_non_empty_root_path_list(&large_file_paths_yaml.0, "large_file_paths")?, + paths: verify_non_empty_root_path_list( + &large_file_paths_yaml.0, + 
"large_file_paths", + )?, } }; @@ -1204,6 +1267,28 @@ impl ClientConfigYaml { } }; + let ssd_storage = if is_external { + None + } else { + match std::mem::take(&mut self.fluxonkv_spec.ssd_storage) { + None | Some(YamlNullable::Null) => None, + Some(YamlNullable::Value(raw)) => { + if raw.max_bytes < crate::kv_ssd_storage::SSD_ALIGNMENT as u64 { + return Err(ConfigError::InvalidClientConfig { + detail: format!( + "fluxonkv_spec.ssd_storage.max_bytes must be >= {}", + crate::kv_ssd_storage::SSD_ALIGNMENT + ), + } + .into_kverror()); + } + Some(KvSsdStorageConfig { + max_bytes: raw.max_bytes, + }) + } + } + }; + Ok(ClientConfig { cluster_name: fluxonkv_spec.cluster_name.clone(), etcd_addresses_raw, @@ -1215,6 +1300,7 @@ impl ClientConfigYaml { fluxonkv_spec, share_mem_path, large_file_paths, + ssd_storage, test_spec_config, }) } @@ -1647,7 +1733,80 @@ fluxonkv_spec: .unwrap(); let err = cfg.verify().unwrap_err(); let text = format!("{err}"); - assert!(text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode")); + assert!( + text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode") + ); + } + + #[test] + fn client_config_owner_accepts_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_owner +contribute_to_cluster_pool_size: + dram: 16777216 + vram: {} +fluxonkv_spec: + etcd_addresses: ["127.0.0.1:2379"] + cluster_name: test_cluster + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] + ssd_storage: + max_bytes: 1048576 + sub_cluster: rack-a +"#, + ) + .unwrap(); + let verified = cfg.verify().unwrap(); + assert_eq!( + verified.ssd_storage.as_ref().map(|cfg| cfg.max_bytes), + Some(1048576) + ); + } + + #[test] + fn client_config_owner_rejects_too_small_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_owner +contribute_to_cluster_pool_size: + dram: 16777216 + vram: {} +fluxonkv_spec: + etcd_addresses: ["127.0.0.1:2379"] + 
cluster_name: test_cluster + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] + ssd_storage: + max_bytes: 1 + sub_cluster: rack-a +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!( + text.contains("fluxonkv_spec.ssd_storage.max_bytes must be >= 512"), + "{text}" + ); + } + + #[test] + fn client_config_zero_contribution_rejects_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_external +fluxonkv_spec: + cluster_name: test_cluster + share_mem_path: /tmp/test_external + ssd_storage: + max_bytes: 1048576 +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!(text.contains("fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode")); } #[test] @@ -1667,7 +1826,9 @@ fluxonkv_spec: let logs_dir = large_file_paths.kv_logs_dir("test_cluster").unwrap(); assert_eq!( logs_dir, - first_root.join("child").join("test_cluster_cluster_kv_logs") + first_root + .join("child") + .join("test_cluster_cluster_kv_logs") ); assert!(logs_dir.exists()); @@ -1683,6 +1844,32 @@ fluxonkv_spec: assert!(third_party_logs_dir.exists()); } + #[test] + fn large_file_paths_uses_all_usable_roots_for_kv_ssd_storage() { + let tempdir = new_test_dir("fluxon_large_paths_uses_all_usable_roots_for_kv_ssd_storage"); + let first_root = tempdir.join("first_root"); + let second_root = tempdir.join("second_root"); + + let large_file_paths = LargeFilePaths { + paths: vec![ + first_root.to_string_lossy().into_owned(), + second_root.to_string_lossy().into_owned(), + ], + }; + + let dirs = large_file_paths + .kv_ssd_storage_dirs("test_cluster", "owner/a:b") + .unwrap(); + assert_eq!( + dirs, + vec![ + first_root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"), + second_root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"), + ] + ); + assert!(dirs.iter().all(|dir| dir.exists())); + } + #[test] fn 
client_test_spec_config_accepts_explicit_rdma_device_names() { let cfg = ClientConfigYaml::from_str( diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs index da701cd..630a8ea 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs @@ -89,6 +89,7 @@ fn new_client_config( large_file_paths: LargeFilePaths { paths: vec![format!("{}_large", shm_path)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } @@ -130,6 +131,7 @@ fn new_zero_contribution_client_config( }, share_mem_path: shm_path.to_string(), large_file_paths: LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs index 9cb291f..b7715dd 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs @@ -865,8 +865,7 @@ impl ExternalInner { return Ok(false); } - self.finish_owner_recover(&share_mem_path, payload) - .await?; + self.finish_owner_recover(&share_mem_path, payload).await?; Ok(true) } diff --git a/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs new file mode 100644 index 0000000..26d711e --- /dev/null +++ b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs @@ -0,0 +1,2159 @@ +use crate::master_kv_router::put::PutIDForAKey; +use crate::rpcresp_kvresult_convert::msg_and_error::{ApiError, KvError, KvResult}; +use ::tokio::{ + sync::{Notify, mpsc as tokio_mpsc, oneshot}, + task, +}; +use futures::stream::{FuturesUnordered, StreamExt}; +use io_uring::{IoUring, opcode, types::Fd}; +use parking_lot::Mutex; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fs::{self, OpenOptions}; +use std::io; +use std::os::fd::{AsRawFd, 
RawFd}; +use std::os::unix::fs::MetadataExt; +use std::os::unix::fs::OpenOptionsExt; +use std::path::{Path, PathBuf}; +use std::ptr::NonNull; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::thread::JoinHandle; + +pub(crate) const SSD_ALIGNMENT: usize = 512; +const DEFAULT_SHARDS_PER_OWNER: usize = 4; +const DEFAULT_URING_THREADS: usize = 16; +const DEFAULT_URING_IO_DEPTH: usize = 128; +const DEFAULT_URING_READ_WEIGHT: usize = 2; +const DEFAULT_WRITE_QUEUE_DEPTH: usize = 8; +const DEFAULT_READ_QUEUE_DEPTH: usize = 16; +const DEFAULT_WRITE_INFLIGHT: usize = 2; +const DEFAULT_READ_INFLIGHT: usize = 16; +pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES: u64 = 4 * 1024 * 1024; +pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT: usize = 4; + +#[derive(Clone, Debug)] +pub struct KvSsdStorageInit { + pub root_dirs: Vec, + pub max_bytes: u64, +} + +#[derive(Debug)] +pub struct KvSsdStorage { + root_dirs: Vec, + devices: Vec, + shard_to_device: Vec, + next_write_device: AtomicUsize, + inner: Arc>, + space_notify: Arc, +} + +#[derive(Debug)] +struct SsdDeviceWorker { + device_id: u64, + root_dir: PathBuf, + shard_ids: Vec, + _files: Vec, + _io: Arc, + write_tx: tokio_mpsc::Sender, + read_tx: tokio_mpsc::Sender, +} + +#[derive(Clone, Debug)] +struct SsdDeviceRoot { + device_id: u64, + root_dir: PathBuf, +} + +struct OpenedSsdShard { + shard_id: usize, + device_idx: usize, + file: std::fs::File, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct SsdLoadedChunk { + pub offset: u64, + pub stage_addr: u64, + pub len: u64, +} + +#[derive(Debug)] +struct KvSsdStorageInner { + ring: SsdRingBuffer, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct KvSsdKey { + key: String, + put_id: PutIDForAKey, +} + +#[derive(Clone, Debug)] +struct SsdIndexEntry { + shard_id: usize, + begin: u64, + len: u64, + aligned_len: u64, + file_offset: u64, +} + +#[derive(Clone, Debug)] +struct SsdReadPinInfo { + entry: SsdIndexEntry, + count: 
usize, +} + +#[derive(Clone, Debug)] +enum SsdEntryState { + Writing(SsdIndexEntry), + Committed(SsdIndexEntry), +} + +impl SsdEntryState { + fn entry(&self) -> &SsdIndexEntry { + match self { + Self::Writing(entry) | Self::Committed(entry) => entry, + } + } +} + +#[derive(Debug)] +struct SsdShardRing { + capacity: u64, + head: u64, + tail: u64, + order: VecDeque, +} + +#[derive(Debug)] +struct SsdRingBuffer { + shards: Vec, + next_shard: usize, + entries: HashMap, + read_pins: HashMap, +} + +#[derive(Debug)] +enum SsdPreparedWrite { + Ready(SsdIndexEntry), + Existing, + BlockedByBusyIo, +} + +#[derive(Debug)] +enum SsdAllocation { + Ready { begin: u64, file_offset: u64 }, + BlockedByBusyIo, + TooLarge, +} + +impl SsdRingBuffer { + fn new(shard_capacities: Vec) -> Self { + assert!(!shard_capacities.is_empty()); + Self { + shards: shard_capacities + .into_iter() + .map(|capacity| SsdShardRing { + capacity, + head: 0, + tail: 0, + order: VecDeque::new(), + }) + .collect(), + next_shard: 0, + entries: HashMap::new(), + read_pins: HashMap::new(), + } + } + + #[cfg(test)] + fn get(&self, key: &KvSsdKey) -> Option { + match self.entries.get(key) { + Some(SsdEntryState::Committed(entry)) if self.is_offset_valid(entry) => { + Some(entry.clone()) + } + _ => None, + } + } + + fn pin_read(&mut self, key: &KvSsdKey) -> Option { + let entry = match self.entries.get(key) { + Some(SsdEntryState::Committed(entry)) if self.is_offset_valid(entry) => entry.clone(), + _ => return None, + }; + let pin = self + .read_pins + .entry(key.clone()) + .or_insert_with(|| SsdReadPinInfo { + entry: entry.clone(), + count: 0, + }); + pin.count += 1; + Some(entry) + } + + fn unpin_read(&mut self, key: &KvSsdKey) { + match self.read_pins.get_mut(key) { + Some(pin) if pin.count > 1 => pin.count -= 1, + Some(_) => { + self.read_pins.remove(key); + } + None => debug_assert!(false, "missing kv ssd read pin for key={key:?}"), + } + } + + #[cfg(test)] + fn prepare_write(&mut self, key: KvSsdKey, len: 
u64) -> KvResult { + let allowed_shards = (0..self.shards.len()).collect::>(); + self.prepare_write_on_shards(key, len, &allowed_shards) + } + + fn prepare_write_on_shards( + &mut self, + key: KvSsdKey, + len: u64, + allowed_shards: &[usize], + ) -> KvResult { + if allowed_shards.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd device has no shards".to_string(), + })); + } + if self.entries.contains_key(&key) { + return Ok(SsdPreparedWrite::Existing); + } + let aligned_len = align_up_u64(len, SSD_ALIGNMENT as u64)?; + let max_capacity = self + .shards + .iter() + .enumerate() + .filter(|(idx, _)| allowed_shards.contains(idx)) + .map(|(_, shard)| shard.capacity) + .max() + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd device has invalid shard set: {allowed_shards:?}"), + }) + })?; + if aligned_len > max_capacity { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd value len={} aligned_len={} exceeds shard capacity={}", + len, aligned_len, max_capacity + ), + })); + } + + let shard_count = self.shards.len(); + for offset in 0..shard_count { + let shard_id = (self.next_shard + offset) % shard_count; + if !allowed_shards.contains(&shard_id) { + continue; + } + let (begin, file_offset) = match self.allocate_contiguous(shard_id, aligned_len) { + SsdAllocation::Ready { begin, file_offset } => (begin, file_offset), + SsdAllocation::BlockedByBusyIo => continue, + SsdAllocation::TooLarge => unreachable!("aligned_len was checked against capacity"), + }; + self.next_shard = (shard_id + 1) % shard_count; + + let entry = SsdIndexEntry { + shard_id, + begin, + len, + aligned_len, + file_offset, + }; + self.entries + .insert(key.clone(), SsdEntryState::Writing(entry.clone())); + self.shards[shard_id].order.push_back(key); + return Ok(SsdPreparedWrite::Ready(entry)); + } + + Ok(SsdPreparedWrite::BlockedByBusyIo) + } + + fn allocate_contiguous(&mut self, shard_id: usize, size: 
u64) -> SsdAllocation { + let shard = &self.shards[shard_id]; + if size > shard.capacity { + return SsdAllocation::TooLarge; + } + let capacity = shard.capacity; + let mut head = shard.head; + let phys = head % capacity; + let space_until_end = capacity - phys; + if size > space_until_end { + head += space_until_end; + } + let begin = head; + let new_head = head + size; + let new_tail = new_head.saturating_sub(capacity); + if self.has_busy_entries_before(shard_id, new_tail) { + return SsdAllocation::BlockedByBusyIo; + } + + self.shards[shard_id].head = new_head; + self.advance_tail(shard_id, new_tail); + SsdAllocation::Ready { + begin, + file_offset: begin % capacity, + } + } + + fn advance_tail(&mut self, shard_id: usize, new_tail: u64) { + if new_tail <= self.shards[shard_id].tail { + return; + } + debug_assert!(!self.has_busy_entries_before(shard_id, new_tail)); + self.shards[shard_id].tail = new_tail; + + while let Some(key) = self.shards[shard_id].order.front() { + match self.entries.get(key) { + Some(state) if state.entry().begin >= new_tail => break, + _ => { + let key = self.shards[shard_id] + .order + .pop_front() + .expect("front key exists"); + self.entries.remove(&key); + } + } + } + } + + fn commit(&mut self, key: &KvSsdKey, success: bool) -> bool { + let Some(state) = self.entries.get(key) else { + return false; + }; + let entry = match state { + SsdEntryState::Writing(entry) => entry.clone(), + SsdEntryState::Committed(_) => return true, + }; + if !self.is_offset_valid(&entry) || !success { + self.entries.remove(key); + return false; + } + self.entries + .insert(key.clone(), SsdEntryState::Committed(entry)); + true + } + + fn remove(&mut self, key: &KvSsdKey) { + self.entries.remove(key); + } + + fn is_offset_valid(&self, entry: &SsdIndexEntry) -> bool { + self.shards + .get(entry.shard_id) + .is_some_and(|shard| entry.begin >= shard.tail) + } + + fn has_busy_entries_before(&self, shard_id: usize, new_tail: u64) -> bool { + if new_tail <= 
self.shards[shard_id].tail { + return false; + } + let writing_busy = self.entries.values().any(|state| match state { + SsdEntryState::Writing(entry) => entry.shard_id == shard_id && entry.begin < new_tail, + SsdEntryState::Committed(_) => false, + }); + if writing_busy { + return true; + } + self.read_pins + .values() + .any(|pin| pin.entry.shard_id == shard_id && pin.entry.begin < new_tail) + } +} + +struct SsdReadPin { + inner: Arc>, + space_notify: Arc, + key: KvSsdKey, +} + +impl Drop for SsdReadPin { + fn drop(&mut self) { + self.inner.lock().ring.unpin_read(&self.key); + self.space_notify.notify_one(); + } +} + +struct WriteCommand { + key: KvSsdKey, + entry_len: u64, + data: AlignedBuffer, + done_tx: oneshot::Sender>, +} + +struct ReadCommand { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +struct WriteTask { + key: KvSsdKey, + entry: SsdIndexEntry, + data: AlignedBuffer, + done_tx: oneshot::Sender>, +} + +struct ReadTask { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +struct WriteCompletion { + key: KvSsdKey, + success: bool, + result: KvResult<()>, + done_tx: oneshot::Sender>, +} + +struct ReadCompletion { + key: KvSsdKey, + entry: SsdIndexEntry, + result: KvResult, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +enum ReadTarget { + Scratch(AlignedBuffer), + Direct { target_addr: u64, len: usize }, +} + +enum ReadOutput { + Scratch(AlignedBuffer), + Direct, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum SsdReadPath { + Scratch, + Direct, +} + +pub fn safe_path_component(raw: &str) -> String { + let mut out = String::with_capacity(raw.len().max(1)); + for ch in raw.chars() { + if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.') { + out.push(ch); + } else { + out.push('_'); + } + } + if out.is_empty() { + "unnamed".to_string() + } else { + out + } 
+} + +impl KvSsdStorage { + pub fn new(init: KvSsdStorageInit) -> KvResult { + if init.max_bytes < SSD_ALIGNMENT as u64 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd storage max_bytes must be >= {}", SSD_ALIGNMENT), + })); + } + if init.root_dirs.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + + let device_roots = deduplicate_device_roots(&init.root_dirs)?; + let effective_root_dirs = device_roots + .iter() + .map(|root| root.root_dir.clone()) + .collect::>(); + let shard_count = choose_shard_count(init.max_bytes, device_roots.len()); + let shard_capacity = aligned_shard_capacity(init.max_bytes, shard_count)?; + let opened_shards = open_cache_files(&device_roots, shard_count, shard_capacity)?; + let inner = Arc::new(Mutex::new(KvSsdStorageInner { + ring: SsdRingBuffer::new(vec![shard_capacity; shard_count]), + })); + let space_notify = Arc::new(Notify::new()); + let mut shard_to_device = vec![0usize; shard_count]; + let mut device_shards = device_roots + .iter() + .map(|root| (root.clone(), Vec::<(usize, std::fs::File)>::new())) + .collect::>(); + for opened in opened_shards { + shard_to_device[opened.shard_id] = opened.device_idx; + device_shards[opened.device_idx] + .1 + .push((opened.shard_id, opened.file)); + } + + let mut devices = Vec::with_capacity(device_shards.len()); + for (device_root, shard_files) in device_shards { + let shard_ids = shard_files + .iter() + .map(|(shard_id, _)| *shard_id) + .collect::>(); + let fds = shard_files + .iter() + .map(|(shard_id, file)| (*shard_id, file.as_raw_fd())) + .collect::>(); + let io = Arc::new(UringIoEngine::new_multi( + fds, + UringConfig { + threads: DEFAULT_URING_THREADS, + io_depth: DEFAULT_URING_IO_DEPTH, + }, + )?); + let (write_tx, write_rx) = tokio_mpsc::channel(DEFAULT_WRITE_QUEUE_DEPTH); + let (read_tx, read_rx) = tokio_mpsc::channel(DEFAULT_READ_QUEUE_DEPTH); 
+ + task::spawn(ssd_writer_loop( + Arc::clone(&inner), + write_rx, + Arc::clone(&io), + Arc::clone(&space_notify), + DEFAULT_WRITE_INFLIGHT, + shard_ids.clone(), + )); + task::spawn(ssd_reader_loop( + Arc::clone(&inner), + read_rx, + Arc::clone(&io), + DEFAULT_READ_INFLIGHT, + )); + + devices.push(SsdDeviceWorker { + device_id: device_root.device_id, + root_dir: device_root.root_dir, + shard_ids, + _files: shard_files + .into_iter() + .map(|(_, file)| file) + .collect::>(), + _io: io, + write_tx, + read_tx, + }); + } + + Ok(Self { + root_dirs: effective_root_dirs, + devices, + shard_to_device, + next_write_device: AtomicUsize::new(0), + inner, + space_notify, + }) + } + + pub fn root_dirs(&self) -> &[PathBuf] { + &self.root_dirs + } + + fn next_write_tx(&self) -> KvResult> { + if self.devices.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage has no active device".to_string(), + })); + } + let idx = self.next_write_device.fetch_add(1, Ordering::Relaxed) % self.devices.len(); + Ok(self.devices[idx].write_tx.clone()) + } + + fn read_tx_for_shard(&self, shard_id: usize) -> KvResult> { + let Some(device_idx) = self.shard_to_device.get(shard_id).copied() else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd invalid shard id for read: {}", shard_id), + })); + }; + let Some(device) = self.devices.get(device_idx) else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd invalid device index for read: shard_id={} device_idx={}", + shard_id, device_idx + ), + })); + }; + if !device.shard_ids.contains(&shard_id) { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd shard/device route mismatch: shard_id={} device_idx={} device_id={} root_dir={}", + shard_id, + device_idx, + device.device_id, + device.root_dir.display() + ), + })); + } + Ok(device.read_tx.clone()) + } + + pub async fn persist_from_addr( + &self, + key: &str, + put_id: 
PutIDForAKey, + addr: u64, + len: u64, + ) -> KvResult<()> { + validate_key(key)?; + let len_usize = usize::try_from(len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd persist len does not fit usize: {}", len), + }) + })?; + let aligned_len = align_up_usize(len_usize, SSD_ALIGNMENT)?; + let data = unsafe { AlignedBuffer::copy_from_addr(addr, len_usize, aligned_len)? }; + self.persist_buffer(key, put_id, len, data).await + } + + pub async fn persist(&self, key: &str, put_id: PutIDForAKey, data: &[u8]) -> KvResult<()> { + validate_key(key)?; + let aligned_len = align_up_usize(data.len(), SSD_ALIGNMENT)?; + let mut buffer = AlignedBuffer::zeroed(aligned_len)?; + unsafe { + std::ptr::copy_nonoverlapping(data.as_ptr(), buffer.as_mut_ptr(), data.len()); + } + self.persist_buffer(key, put_id, data.len() as u64, buffer) + .await + } + + async fn persist_buffer( + &self, + key: &str, + put_id: PutIDForAKey, + entry_len: u64, + data: AlignedBuffer, + ) -> KvResult<()> { + let (done_tx, done_rx) = oneshot::channel(); + let write_tx = self.next_write_tx()?; + write_tx + .send(WriteCommand { + key: KvSsdKey { + key: key.to_string(), + put_id, + }, + entry_len, + data, + done_tx, + }) + .await + .map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd write queue closed: {}", err), + }) + })?; + done_rx.await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd write completion closed: {}", err), + }) + })? 
+ } + + pub async fn load_into_addr( + &self, + key: &str, + put_id: PutIDForAKey, + target_addr: u64, + len: u64, + target_len: u64, + ) -> KvResult<()> { + validate_key(key)?; + if target_len < len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd target capacity too small for key={} put_id=({},{}) len={} target_len={}", + key, put_id.0, put_id.1, len, target_len + ), + })); + } + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + let (entry, read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { + key: key.key.clone(), + })); + }; + ( + entry, + SsdReadPin { + inner: Arc::clone(&self.inner), + space_notify: Arc::clone(&self.space_notify), + key: key.clone(), + }, + ) + }; + if entry.len != len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd length mismatch for key={} put_id=({},{}) expected={} actual={}", + key.key, put_id.0, put_id.1, len, entry.len + ), + })); + } + + let len_usize = usize::try_from(len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd load len does not fit usize: {}", len), + }) + })?; + let aligned_len_usize = usize::try_from(entry.aligned_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd aligned load len does not fit usize: {}", + entry.aligned_len + ), + }) + })?; + let target = match choose_read_path(&entry, target_addr, len, target_len) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr, + len: aligned_len_usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(aligned_len_usize)?), + }; + let output = self + .submit_read_command( + key, + entry.clone(), + entry.file_offset, + target, + Some(read_pin), + ) + .await?; + if let ReadOutput::Scratch(buffer) = output { + unsafe { + std::ptr::copy_nonoverlapping(buffer.as_ptr(), target_addr as *mut u8, 
len_usize); + } + } + Ok(()) + } + + pub(crate) async fn load_into_addr_chunks( + &self, + key: &str, + put_id: PutIDForAKey, + target_addr: u64, + len: u64, + target_len: u64, + chunk_bytes: u64, + max_read_inflight: usize, + ready_tx: tokio_mpsc::Sender, + ) -> KvResult<()> { + validate_key(key)?; + if target_len < len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd target capacity too small for chunked load: key={} put_id=({},{}) len={} target_len={}", + key, put_id.0, put_id.1, len, target_len + ), + })); + } + let chunk_bytes = align_up_u64(chunk_bytes.max(1), SSD_ALIGNMENT as u64)?; + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + let (entry, _read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { + key: key.key.clone(), + })); + }; + ( + entry, + SsdReadPin { + inner: Arc::clone(&self.inner), + space_notify: Arc::clone(&self.space_notify), + key: key.clone(), + }, + ) + }; + if entry.len != len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd length mismatch for chunked load: key={} put_id=({},{}) expected={} actual={}", + key.key, put_id.0, put_id.1, len, entry.len + ), + })); + } + + let mut next_offset = 0u64; + let mut inflight = FuturesUnordered::new(); + let max_read_inflight = max_read_inflight.max(1); + + loop { + while next_offset < len && inflight.len() < max_read_inflight { + let payload_len = chunk_bytes.min(len - next_offset); + let stage_addr = checked_add_u64(target_addr, next_offset, "chunk stage addr")?; + let remaining_target_len = target_len - next_offset; + inflight.push(self.load_entry_range_into_addr( + key.clone(), + entry.clone(), + next_offset, + payload_len, + stage_addr, + remaining_target_len, + )); + next_offset += payload_len; + } + + let Some(chunk) = inflight.next().await else { + break; + }; + let chunk = chunk?; + 
ready_tx.send(chunk).await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd chunk ready queue closed: {}", err), + }) + })?; + } + Ok(()) + } + + async fn load_entry_range_into_addr( + &self, + key: KvSsdKey, + entry: SsdIndexEntry, + offset: u64, + payload_len: u64, + target_addr: u64, + target_len: u64, + ) -> KvResult { + if payload_len == 0 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd chunk payload len must be positive".to_string(), + })); + } + let payload_end = checked_add_u64(offset, payload_len, "chunk payload end")?; + if payload_end > entry.len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk exceeds entry len: offset={} len={} entry_len={}", + offset, payload_len, entry.len + ), + })); + } + let read_len = align_up_u64(payload_len, SSD_ALIGNMENT as u64)?; + let read_end = checked_add_u64(offset, read_len, "chunk read end")?; + if read_end > entry.aligned_len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd aligned chunk exceeds entry aligned len: offset={} read_len={} aligned_len={}", + offset, read_len, entry.aligned_len + ), + })); + } + if target_len < read_len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk target capacity too small: offset={} read_len={} target_len={}", + offset, read_len, target_len + ), + })); + } + let file_offset = checked_add_u64(entry.file_offset, offset, "chunk file offset")?; + let read_len_usize = usize::try_from(read_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd chunk read len does not fit usize: {}", read_len), + }) + })?; + let payload_len_usize = usize::try_from(payload_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk payload len does not fit usize: {}", + payload_len + ), + }) + })?; + let target = match choose_chunk_read_path(target_addr, 
read_len, target_len, file_offset) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr, + len: read_len_usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len_usize)?), + }; + let output = self + .submit_read_command(key, entry, file_offset, target, None) + .await?; + if let ReadOutput::Scratch(buffer) = output { + unsafe { + std::ptr::copy_nonoverlapping( + buffer.as_ptr(), + target_addr as *mut u8, + payload_len_usize, + ); + } + } + Ok(SsdLoadedChunk { + offset, + stage_addr: target_addr, + len: payload_len, + }) + } + + async fn submit_read_command( + &self, + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + read_pin: Option, + ) -> KvResult { + let (done_tx, done_rx) = oneshot::channel(); + let read_tx = self.read_tx_for_shard(entry.shard_id)?; + read_tx + .send(ReadCommand { + key, + entry, + file_offset, + target, + _read_pin: read_pin, + done_tx, + }) + .await + .map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd read queue closed: {}", err), + }) + })?; + done_rx.await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd read completion closed: {}", err), + }) + })? 
+ } + + #[cfg(test)] + async fn has_entry(&self, key: &str, put_id: PutIDForAKey) -> bool { + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + self.inner.lock().ring.get(&key).is_some() + } +} + +async fn ssd_writer_loop( + inner: Arc>, + mut rx: tokio_mpsc::Receiver, + io: Arc, + space_notify: Arc, + write_inflight: usize, + shard_ids: Vec, +) { + let mut pending: VecDeque = VecDeque::new(); + let mut inflight = FuturesUnordered::new(); + let max_inflight = write_inflight.max(1); + + loop { + while inflight.len() < max_inflight { + let Some(cmd) = pending.pop_front() else { + break; + }; + let prepared = { + let mut inner = inner.lock(); + inner + .ring + .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids) + }; + match prepared { + Ok(SsdPreparedWrite::Ready(entry)) => { + inflight.push(execute_write( + WriteTask { + key: cmd.key, + entry, + data: cmd.data, + done_tx: cmd.done_tx, + }, + Arc::clone(&io), + )); + } + Ok(SsdPreparedWrite::Existing) => { + let _ = cmd.done_tx.send(Ok(())); + } + Ok(SsdPreparedWrite::BlockedByBusyIo) => { + pending.push_front(cmd); + break; + } + Err(err) => { + let _ = cmd.done_tx.send(Err(err)); + } + } + } + + tokio::select! { + Some(completion) = inflight.next(), if !inflight.is_empty() => { + finish_write_completion(&inner, &space_notify, completion); + } + Some(cmd) = rx.recv() => { + pending.push_back(cmd); + } + _ = space_notify.notified(), if !pending.is_empty() => { + // Retry pending commands after an active read/write releases a ring position. 
+ } + else => { + if pending.is_empty() && inflight.is_empty() { + break; + } + }, + } + } + + while !pending.is_empty() || !inflight.is_empty() { + while inflight.len() < max_inflight { + let Some(cmd) = pending.pop_front() else { + break; + }; + let prepared = { + let mut inner = inner.lock(); + inner + .ring + .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids) + }; + match prepared { + Ok(SsdPreparedWrite::Ready(entry)) => { + inflight.push(execute_write( + WriteTask { + key: cmd.key, + entry, + data: cmd.data, + done_tx: cmd.done_tx, + }, + Arc::clone(&io), + )); + } + Ok(SsdPreparedWrite::Existing) => { + let _ = cmd.done_tx.send(Ok(())); + } + Ok(SsdPreparedWrite::BlockedByBusyIo) => { + pending.push_front(cmd); + break; + } + Err(err) => { + let _ = cmd.done_tx.send(Err(err)); + } + } + } + + if let Some(completion) = inflight.next().await { + finish_write_completion(&inner, &space_notify, completion); + } else if !pending.is_empty() { + space_notify.notified().await; + } + } +} + +fn finish_write_completion( + inner: &Arc>, + space_notify: &Notify, + completion: WriteCompletion, +) { + let committed = inner + .lock() + .ring + .commit(&completion.key, completion.success); + space_notify.notify_one(); + let result = if completion.success && !committed { + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + } else { + completion.result + }; + let _ = completion.done_tx.send(result); +} + +async fn execute_write(task: WriteTask, io: Arc) -> WriteCompletion { + let WriteTask { + key, + entry, + data, + done_tx, + } = task; + let data_len = data.len(); + let shard_id = entry.shard_id; + let file_offset = entry.file_offset; + let result = async move { + let rx = { + let data_ptr = data.as_ptr(); + io.writev_at_async(shard_id, vec![(data_ptr, data_len)], file_offset)? 
+ }; + let written = rx + .await + .map_err(|_| io::Error::other("kv ssd write completion dropped"))??; + if written != data_len { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + format!("short kv ssd write: {} != {}", written, data_len), + ) + .into()); + } + Ok(()) + } + .await; + let result = result.map_err(|err| file_error_for_entry(&key, file_offset, err)); + WriteCompletion { + key, + success: result.is_ok(), + result, + done_tx, + } +} + +async fn ssd_reader_loop( + inner: Arc>, + mut rx: tokio_mpsc::Receiver, + io: Arc, + read_inflight: usize, +) { + let mut pending = VecDeque::new(); + let mut inflight = FuturesUnordered::new(); + let max_inflight = read_inflight.max(1); + + loop { + while inflight.len() < max_inflight { + let Some(task) = pending.pop_front() else { + break; + }; + inflight.push(execute_read(task, Arc::clone(&io))); + } + + tokio::select! { + Some(completion) = inflight.next(), if !inflight.is_empty() => { + let valid = inner.lock().ring.is_offset_valid(&completion.entry); + let result = if valid { + completion.result + } else { + inner.lock().ring.remove(&completion.key); + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + }; + let _ = completion.done_tx.send(result); + } + Some(cmd) = rx.recv() => { + pending.push_back(ReadTask { + key: cmd.key, + entry: cmd.entry, + file_offset: cmd.file_offset, + target: cmd.target, + _read_pin: cmd._read_pin, + done_tx: cmd.done_tx, + }); + } + else => break, + } + } + + while let Some(completion) = inflight.next().await { + let valid = inner.lock().ring.is_offset_valid(&completion.entry); + let result = if valid { + completion.result + } else { + inner.lock().ring.remove(&completion.key); + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + }; + let _ = completion.done_tx.send(result); + } +} + +async fn execute_read(task: ReadTask, io: Arc) -> ReadCompletion { + let ReadTask { + key, + entry, + file_offset, + target, + 
_read_pin, + done_tx, + } = task; + let shard_id = entry.shard_id; + let result = async move { + match target { + ReadTarget::Scratch(mut buffer) => { + let buffer_len = buffer.len(); + let rx = { + let buffer_ptr = buffer.as_mut_ptr(); + io.readv_at_async(shard_id, vec![(buffer_ptr, buffer_len)], file_offset)? + }; + let read = rx + .await + .map_err(|_| io::Error::other("kv ssd read completion dropped"))??; + if read != buffer_len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("short kv ssd read: {} != {}", read, buffer_len), + )); + } + Ok(ReadOutput::Scratch(buffer)) + } + ReadTarget::Direct { target_addr, len } => { + let rx = + io.readv_at_async(shard_id, vec![(target_addr as *mut u8, len)], file_offset)?; + let read = rx + .await + .map_err(|_| io::Error::other("kv ssd read completion dropped"))??; + if read != len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("short kv ssd direct read: {} != {}", read, len), + )); + } + Ok(ReadOutput::Direct) + } + } + } + .await + .map_err(|err| file_error_for_entry(&key, file_offset, err)); + ReadCompletion { + key, + entry, + result, + _read_pin, + done_tx, + } +} + +#[derive(Clone, Copy)] +struct UringConfig { + threads: usize, + io_depth: usize, +} + +#[derive(Clone, Copy)] +enum IoType { + Readv, + Writev, +} + +struct IoCtx { + io_type: IoType, + fd: RawFd, + len: usize, + offset: u64, + complete: oneshot::Sender>, + iovecs: Box<[libc::iovec]>, +} + +unsafe impl Send for IoCtx {} + +struct UringShard { + read_rx: crossbeam::channel::Receiver, + write_rx: crossbeam::channel::Receiver, + uring: IoUring, + io_depth: usize, + read_weight: usize, +} + +impl UringShard { + fn run(mut self) { + let mut read_inflight = 0usize; + let mut write_inflight = 0usize; + let mut read_closed = false; + let mut write_closed = false; + + loop { + let mut inflight = read_inflight + write_inflight; + while inflight < self.io_depth && !(read_closed && write_closed) { + let next = 
self.try_recv_weighted( + &mut read_closed, + &mut write_closed, + read_inflight, + write_inflight, + ); + let Some(ctx) = next else { + break; + }; + self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight); + inflight = read_inflight + write_inflight; + } + + if read_closed && write_closed && inflight == 0 { + return; + } + if inflight == 0 { + let Some(ctx) = self.recv_blocking(&mut read_closed, &mut write_closed) else { + continue; + }; + self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight); + continue; + } + if let Err(err) = self.uring.submit_and_wait(1) { + while let Some(cqe) = self.uring.completion().next() { + let data = cqe.user_data(); + if data != 0 { + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + let _ = ctx.complete.send(Err(io::Error::other(format!( + "io_uring submit failed: {err}" + )))); + } + } + return; + } + + for cqe in self.uring.completion() { + let data = cqe.user_data(); + if data == 0 { + continue; + } + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + match ctx.io_type { + IoType::Readv => read_inflight = read_inflight.saturating_sub(1), + IoType::Writev => write_inflight = write_inflight.saturating_sub(1), + } + let res = cqe.result(); + let send_res = if res < 0 { + Err(io::Error::from_raw_os_error(-res)) + } else { + Ok(res as usize) + }; + let _ = ctx.complete.send(send_res); + } + } + } + + fn try_recv_weighted( + &self, + read_closed: &mut bool, + write_closed: &mut bool, + read_inflight: usize, + write_inflight: usize, + ) -> Option { + let prefer_read = read_inflight <= write_inflight.saturating_mul(self.read_weight); + if prefer_read { + self.try_recv_read(read_closed) + .or_else(|| self.try_recv_write(write_closed)) + } else { + self.try_recv_write(write_closed) + .or_else(|| self.try_recv_read(read_closed)) + } + } + + fn try_recv_read(&self, read_closed: &mut bool) -> Option { + if *read_closed { + return None; + } + match self.read_rx.try_recv() { + Ok(ctx) => Some(ctx), + 
Err(crossbeam::channel::TryRecvError::Empty) => None, + Err(crossbeam::channel::TryRecvError::Disconnected) => { + *read_closed = true; + None + } + } + } + + fn try_recv_write(&self, write_closed: &mut bool) -> Option { + if *write_closed { + return None; + } + match self.write_rx.try_recv() { + Ok(ctx) => Some(ctx), + Err(crossbeam::channel::TryRecvError::Empty) => None, + Err(crossbeam::channel::TryRecvError::Disconnected) => { + *write_closed = true; + None + } + } + } + + fn recv_blocking(&self, read_closed: &mut bool, write_closed: &mut bool) -> Option { + loop { + match (!*read_closed, !*write_closed) { + (true, true) => { + crossbeam::channel::select! { + recv(self.read_rx) -> msg => match msg { + Ok(ctx) => return Some(ctx), + Err(_) => *read_closed = true, + }, + recv(self.write_rx) -> msg => match msg { + Ok(ctx) => return Some(ctx), + Err(_) => *write_closed = true, + }, + } + } + (true, false) => match self.read_rx.recv() { + Ok(ctx) => return Some(ctx), + Err(_) => *read_closed = true, + }, + (false, true) => match self.write_rx.recv() { + Ok(ctx) => return Some(ctx), + Err(_) => *write_closed = true, + }, + (false, false) => return None, + } + } + } + + fn submit_ctx(&mut self, ctx: IoCtx, read_inflight: &mut usize, write_inflight: &mut usize) { + let fd = Fd(ctx.fd); + let iovecs_ptr = ctx.iovecs.as_ptr(); + let sqe = match ctx.io_type { + IoType::Readv => opcode::Readv::new(fd, iovecs_ptr, ctx.len as _) + .offset(ctx.offset) + .build(), + IoType::Writev => opcode::Writev::new(fd, iovecs_ptr, ctx.len as _) + .offset(ctx.offset) + .build(), + }; + let io_type = ctx.io_type; + let data = Box::into_raw(Box::new(ctx)) as u64; + let sqe = sqe.user_data(data); + let push_result = unsafe { self.uring.submission().push(&sqe) }; + if push_result.is_err() { + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + let _ = ctx + .complete + .send(Err(io::Error::other("submission queue full"))); + return; + } + match io_type { + IoType::Readv => 
*read_inflight += 1, + IoType::Writev => *write_inflight += 1, + } + } +} + +#[derive(Debug)] +struct UringIoEngine { + fds: HashMap, + read_txs: Vec>, + write_txs: Vec>, + handles: Vec>, +} + +impl UringIoEngine { + fn new_multi(shard_fds: Vec<(usize, RawFd)>, cfg: UringConfig) -> io::Result { + if cfg.threads == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "threads must be > 0", + )); + } + if shard_fds.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "at least one fd is required", + )); + } + let fds = shard_fds.into_iter().collect::>(); + let mut read_txs = Vec::with_capacity(cfg.threads); + let mut write_txs = Vec::with_capacity(cfg.threads); + let mut handles = Vec::with_capacity(cfg.threads); + for idx in 0..cfg.threads { + let (read_tx, read_rx) = crossbeam::channel::bounded(cfg.io_depth * 2); + let (write_tx, write_rx) = crossbeam::channel::bounded(cfg.io_depth * 2); + let uring = IoUring::builder().build(cfg.io_depth as u32)?; + let handle = std::thread::Builder::new() + .name(format!("fluxon-kv-ssd-uring-{idx}")) + .spawn(move || { + UringShard { + read_rx, + write_rx, + uring, + io_depth: cfg.io_depth, + read_weight: DEFAULT_URING_READ_WEIGHT, + } + .run() + })?; + read_txs.push(read_tx); + write_txs.push(write_tx); + handles.push(handle); + } + Ok(Self { + fds, + read_txs, + write_txs, + handles, + }) + } + + fn readv_at_async( + &self, + shard_id: usize, + iovecs: Vec<(*mut u8, usize)>, + offset: u64, + ) -> io::Result>> { + self.submit_iovecs(IoType::Readv, shard_id, iovecs, offset) + } + + fn writev_at_async( + &self, + shard_id: usize, + iovecs: Vec<(*const u8, usize)>, + offset: u64, + ) -> io::Result>> { + let iovecs = iovecs + .into_iter() + .map(|(ptr, len)| (ptr as *mut u8, len)) + .collect(); + self.submit_iovecs(IoType::Writev, shard_id, iovecs, offset) + } + + fn submit_iovecs( + &self, + io_type: IoType, + shard_id: usize, + iovecs: Vec<(*mut u8, usize)>, + offset: u64, + ) -> io::Result>> { + 
if iovecs.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "readv/writev requires at least one iovec", + )); + } + validate_direct_io( + iovecs.iter().map(|(ptr, len)| (*ptr as usize, *len)), + offset, + )?; + let iovecs_libc = iovecs + .iter() + .map(|(ptr, len)| libc::iovec { + iov_base: *ptr as *mut libc::c_void, + iov_len: *len, + }) + .collect::>() + .into_boxed_slice(); + let (tx, rx) = oneshot::channel(); + let ctx = IoCtx { + io_type, + fd: self.fd(shard_id)?, + len: iovecs_libc.len(), + offset, + complete: tx, + iovecs: iovecs_libc, + }; + self.pick_tx(io_type, shard_id).send(ctx).map_err(|err| { + io::Error::new( + io::ErrorKind::BrokenPipe, + format!("io_uring send failed: {}", err), + ) + })?; + Ok(rx) + } + + fn fd(&self, shard_id: usize) -> io::Result { + self.fds.get(&shard_id).copied().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid SSD shard id {shard_id}"), + ) + }) + } + + fn pick_tx(&self, io_type: IoType, shard_id: usize) -> &crossbeam::channel::Sender { + match io_type { + IoType::Readv => &self.read_txs[shard_id % self.read_txs.len()], + IoType::Writev => &self.write_txs[shard_id % self.write_txs.len()], + } + } +} + +impl Drop for UringIoEngine { + fn drop(&mut self) { + self.read_txs.clear(); + self.write_txs.clear(); + for handle in self.handles.drain(..) 
{ + let _ = handle.join(); + } + } +} + +struct AlignedBuffer { + ptr: NonNull, + len: usize, +} + +unsafe impl Send for AlignedBuffer {} + +impl AlignedBuffer { + fn zeroed(len: usize) -> KvResult { + if len == 0 || !len.is_multiple_of(SSD_ALIGNMENT) { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "aligned buffer len must be positive and {}-byte aligned: {}", + SSD_ALIGNMENT, len + ), + })); + } + let mut raw = std::ptr::null_mut(); + let rc = unsafe { libc::posix_memalign(&mut raw, SSD_ALIGNMENT, len) }; + if rc != 0 || raw.is_null() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("posix_memalign failed with rc={}", rc), + })); + } + unsafe { + std::ptr::write_bytes(raw as *mut u8, 0, len); + } + Ok(Self { + ptr: NonNull::new(raw as *mut u8).expect("posix_memalign returned non-null"), + len, + }) + } + + unsafe fn copy_from_addr(addr: u64, actual_len: usize, aligned_len: usize) -> KvResult { + let mut buffer = Self::zeroed(aligned_len)?; + unsafe { + std::ptr::copy_nonoverlapping(addr as *const u8, buffer.as_mut_ptr(), actual_len); + } + Ok(buffer) + } + + fn as_ptr(&self) -> *const u8 { + self.ptr.as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.as_ptr() + } + + fn len(&self) -> usize { + self.len + } +} + +impl Drop for AlignedBuffer { + fn drop(&mut self) { + unsafe { + libc::free(self.ptr.as_ptr() as *mut libc::c_void); + } + } +} + +fn validate_key(key: &str) -> KvResult<()> { + if key.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage key must be non-empty".to_string(), + })); + } + Ok(()) +} + +fn choose_read_path( + entry: &SsdIndexEntry, + target_addr: u64, + len: u64, + target_len: u64, +) -> SsdReadPath { + if len == 0 || entry.len != len { + return SsdReadPath::Scratch; + } + if target_addr.is_multiple_of(SSD_ALIGNMENT as u64) + && target_len >= entry.aligned_len + && entry.file_offset.is_multiple_of(SSD_ALIGNMENT as u64) + { + 
SsdReadPath::Direct + } else { + SsdReadPath::Scratch + } +} + +fn choose_chunk_read_path( + target_addr: u64, + read_len: u64, + target_len: u64, + file_offset: u64, +) -> SsdReadPath { + if read_len != 0 + && target_addr.is_multiple_of(SSD_ALIGNMENT as u64) + && read_len.is_multiple_of(SSD_ALIGNMENT as u64) + && target_len >= read_len + && file_offset.is_multiple_of(SSD_ALIGNMENT as u64) + { + SsdReadPath::Direct + } else { + SsdReadPath::Scratch + } +} + +fn choose_shard_count(max_bytes: u64, root_count: usize) -> usize { + let max_aligned_shards = (max_bytes / SSD_ALIGNMENT as u64).max(1) as usize; + DEFAULT_SHARDS_PER_OWNER + .max(root_count) + .min(max_aligned_shards) + .max(1) +} + +fn aligned_shard_capacity(capacity_bytes: u64, shard_count: usize) -> KvResult { + let raw = capacity_bytes / shard_count as u64; + let capacity = raw / SSD_ALIGNMENT as u64 * SSD_ALIGNMENT as u64; + if capacity == 0 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage capacity is too small for shard count".to_string(), + })); + } + Ok(capacity) +} + +fn deduplicate_device_roots(root_dirs: &[PathBuf]) -> KvResult> { + if root_dirs.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + let mut seen_devices = HashSet::new(); + let mut device_roots = Vec::new(); + for root_dir in root_dirs { + fs::create_dir_all(root_dir).map_err(|err| file_error(root_dir, 0, err))?; + let metadata = fs::metadata(root_dir).map_err(|err| file_error(root_dir, 0, err))?; + let device_id = metadata.dev(); + if seen_devices.insert(device_id) { + device_roots.push(SsdDeviceRoot { + device_id, + root_dir: root_dir.clone(), + }); + } + } + if device_roots.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs contains no usable device".to_string(), + })); + } + Ok(device_roots) +} + +fn open_cache_files( + device_roots: 
&[SsdDeviceRoot], + shard_count: usize, + shard_capacity: u64, +) -> KvResult> { + if device_roots.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + let mut files = Vec::with_capacity(shard_count); + for shard_id in 0..shard_count { + let device_idx = shard_id % device_roots.len(); + let root_dir = &device_roots[device_idx].root_dir; + let shards_dir = root_dir.join("shards"); + fs::create_dir_all(&shards_dir).map_err(|err| file_error(&shards_dir, 0, err))?; + let path = shards_dir.join(format!("shard-{shard_id:06}.dat")); + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(&path) + .map_err(|err| file_error(&path, 0, err))?; + file.set_len(shard_capacity) + .map_err(|err| file_error(&path, 0, err))?; + files.push(OpenedSsdShard { + shard_id, + device_idx, + file, + }); + } + Ok(files) +} + +fn align_up_usize(value: usize, alignment: usize) -> KvResult { + value + .checked_add(alignment - 1) + .map(|v| v / alignment * alignment) + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("alignment overflow for value={}", value), + }) + }) +} + +fn align_up_u64(value: u64, alignment: u64) -> KvResult { + value + .checked_add(alignment - 1) + .map(|v| v / alignment * alignment) + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("alignment overflow for value={}", value), + }) + }) +} + +pub(crate) fn align_ssd_io_len(len: u64) -> KvResult { + align_up_u64(len, SSD_ALIGNMENT as u64) +} + +fn checked_add_u64(lhs: u64, rhs: u64, label: &str) -> KvResult { + lhs.checked_add(rhs).ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd {label} overflow: {lhs} + {rhs}"), + }) + }) +} + +fn validate_direct_io( + iovecs: impl IntoIterator, + offset: u64, +) -> io::Result<()> { + ensure_aligned("offset", 
offset as usize)?; + for (addr, len) in iovecs { + ensure_aligned("buffer address", addr)?; + ensure_aligned("iovec length", len)?; + } + Ok(()) +} + +fn ensure_aligned(name: &str, value: usize) -> io::Result<()> { + if value.is_multiple_of(SSD_ALIGNMENT) { + Ok(()) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("O_DIRECT {name} {value:#x} is not {SSD_ALIGNMENT}-byte aligned"), + )) + } +} + +fn file_error_for_entry(key: &KvSsdKey, offset: u64, err: io::Error) -> KvError { + KvError::Api(ApiError::FileWriteError { + path: format!("kv-ssd://{}@({},{})", key.key, key.put_id.0, key.put_id.1), + offset, + detail: err.to_string(), + }) +} + +fn file_error(path: &Path, offset: u64, err: io::Error) -> KvError { + KvError::Api(ApiError::FileWriteError { + path: path.to_string_lossy().to_string(), + offset, + detail: err.to_string(), + }) +} + +impl From for KvError { + fn from(err: io::Error) -> Self { + KvError::Api(ApiError::FileWriteError { + path: "kv-ssd://io".to_string(), + offset: 0, + detail: err.to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use uuid::Uuid; + + fn new_root() -> PathBuf { + std::env::current_dir() + .unwrap() + .join("target") + .join("fluxon_kv_ssd_tests") + .join(Uuid::new_v4().to_string()) + } + + async fn new_store(max_bytes: u64) -> KvSsdStorage { + KvSsdStorage::new(KvSsdStorageInit { + root_dirs: vec![new_root()], + max_bytes, + }) + .unwrap() + } + + fn test_key(key: &str, version: u64) -> KvSsdKey { + KvSsdKey { + key: key.to_string(), + put_id: (version, 0), + } + } + + fn prepare_ready(ring: &mut SsdRingBuffer, key: &KvSsdKey) -> SsdIndexEntry { + match ring.prepare_write(key.clone(), 500).unwrap() { + SsdPreparedWrite::Ready(entry) => entry, + other => panic!("expected ready SSD write, got {other:?}"), + } + } + + #[::tokio::test] + async fn persist_and_load_roundtrip() { + let store = new_store(1024 * 1024).await; + let data = b"hello from ssd"; + let put_id = (10, 1); + 
store.persist("k", put_id, data).await.unwrap(); + + let mut out = vec![0u8; data.len()]; + store + .load_into_addr( + "k", + put_id, + out.as_mut_ptr() as u64, + out.len() as u64, + out.len() as u64, + ) + .await + .unwrap(); + assert_eq!(out, data); + } + + #[::tokio::test] + async fn aligned_load_roundtrip_uses_direct_target() { + let store = new_store(1024 * 1024).await; + let data = (0..4096).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (11, 1); + store.persist("aligned", put_id, &data).await.unwrap(); + + let mut out = AlignedBuffer::zeroed(data.len()).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let entry = { + let key = KvSsdKey { + key: "aligned".to_string(), + put_id, + }; + store.inner.lock().ring.get(&key).unwrap() + }; + assert_eq!( + choose_read_path(&entry, target_addr, data.len() as u64, data.len() as u64), + SsdReadPath::Direct + ); + + store + .load_into_addr( + "aligned", + put_id, + target_addr, + data.len() as u64, + data.len() as u64, + ) + .await + .unwrap(); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[::tokio::test] + async fn chunked_load_roundtrip_streams_ready_chunks() { + let store = new_store(1024 * 1024).await; + let data = (0..2500).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (13, 1); + store.persist("chunked", put_id, &data).await.unwrap(); + + let mut out = + AlignedBuffer::zeroed(align_ssd_io_len(data.len() as u64).unwrap() as usize).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let (tx, mut rx) = ::tokio::sync::mpsc::channel(2); + let producer = store.load_into_addr_chunks( + "chunked", + put_id, + target_addr, + data.len() as u64, + out.len() as u64, + 1024, + 2, + tx, + ); + let consumer = async { + let mut chunks = Vec::new(); + while let Some(chunk) = rx.recv().await { + chunks.push((chunk.offset, chunk.len)); + } + chunks + }; + let (producer_res, mut chunks) = ::tokio::join!(producer, 
/// Pins the conditions under which `choose_read_path` picks the direct path: the target
/// address must be aligned and the target capacity must cover the entry's aligned length.
#[test]
fn read_path_uses_direct_for_aligned_target_with_enough_capacity() {
    // Entry whose payload length is already a multiple of the alignment.
    let aligned = SsdIndexEntry {
        shard_id: 0,
        begin: 0,
        len: 4096,
        aligned_len: 4096,
        file_offset: 0,
    };
    // Aligned address with enough capacity: direct.
    assert_eq!(
        choose_read_path(&aligned, 4096, 4096, 4096),
        SsdReadPath::Direct
    );
    // Same entry, but the target address is off by one byte: falls back to scratch.
    assert_eq!(
        choose_read_path(&aligned, 4097, 4096, 4096),
        SsdReadPath::Scratch
    );

    // Entry whose payload (500 bytes) is shorter than its aligned on-disk length (512 bytes).
    let unaligned_len = SsdIndexEntry {
        len: 500,
        aligned_len: 512,
        ..aligned
    };
    // The target has room for the full aligned length: direct.
    assert_eq!(
        choose_read_path(&unaligned_len, 4096, 500, 512),
        SsdReadPath::Direct
    );
    // The target only has room for the payload itself: scratch.
    assert_eq!(
        choose_read_path(&unaligned_len, 4096, 500, 500),
        SsdReadPath::Scratch
    );
}
/// Checks that `prepare_write_on_shards` only allocates from the shard ids it is given.
#[test]
fn ring_prepare_write_on_shards_uses_only_allowed_shards() {
    // Four shards exist, but every write below is restricted to shards 1 and 3.
    let mut ring = SsdRingBuffer::new(vec![1024, 1024, 1024, 1024]);
    let mut allocated_shards = Vec::new();

    for version in 0..4 {
        let key = test_key("per-device", version);
        let entry = match ring
            .prepare_write_on_shards(key.clone(), 500, &[1, 3])
            .unwrap()
        {
            SsdPreparedWrite::Ready(entry) => entry,
            other => panic!("expected ready SSD write, got {other:?}"),
        };
        allocated_shards.push(entry.shard_id);
        // Commit each write before preparing the next one.
        assert!(ring.commit(&key, true));
    }

    // Allocation alternates between the two allowed shards and never touches shards 0 or 2.
    assert_eq!(allocated_shards, vec![1, 3, 1, 3]);
}
/// Checks that an entry whose write has not been committed cannot be overwritten by a new
/// allocation, and that committing it makes its space reusable.
#[test]
fn ring_writing_entry_blocks_overwrite_until_write_finishes() {
    // Single 1024-byte shard; `prepare_ready` requests 500 bytes per entry.
    // NOTE(review): presumably each entry occupies one 512-byte slot, so two entries fill
    // the shard — confirm against `SsdRingBuffer`.
    let mut ring = SsdRingBuffer::new(vec![1024]);
    let old = test_key("old", 1);
    let filler = test_key("filler", 2);
    let new = test_key("new", 3);

    // Both entries are prepared but neither is committed yet.
    let old_entry = prepare_ready(&mut ring, &old);
    assert_eq!(old_entry.begin, 0);
    prepare_ready(&mut ring, &filler);

    // A third write is refused while the earlier writes are still outstanding.
    assert!(matches!(
        ring.prepare_write(new.clone(), 500).unwrap(),
        SsdPreparedWrite::BlockedByBusyIo
    ));

    // Once `old` is committed, the new write is placed at file offset 0.
    assert!(ring.commit(&old, true));
    let new_entry = prepare_ready(&mut ring, &new);
    assert_eq!(new_entry.file_offset, 0);
}
LargeFilePaths, + MasterConfig, MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig, + TestSpecTransportMode, TransferEngineType, }; +use crate::master_kv_router::msg_pack::GetSourceKind; use crate::run_master_with_test_overrides; use crate::{ClientRunTestOverrides, MasterRunTestOverrides, run_client_with_test_overrides}; // external client runs via run_client when contribution is zero @@ -38,6 +40,8 @@ const CLIENT_COMMUNICATION_VALUE: &[u8] = b"message_from_client1_to_client2"; const TRANSFER_DATA_PROBE_VALUE_LEN: usize = 256 * 1024; const KV_TEST_TRANSFER_PROBE_IO_TIMEOUT_SECS: u64 = 10; const KV_TEST_SHUTDOWN_TIMEOUT_SECS: u64 = 60; +const KV_TEST_SSD_STORAGE_BYTES: u64 = 64 * 1024 * 1024; +const KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS: u64 = 30; fn kv_test_run_scope() -> &'static str { static RUN_SCOPE: OnceLock = OnceLock::new(); @@ -610,6 +614,7 @@ struct KvTestClientOptions { enable_transfer_rpc_fast_path: Option, contribute_to_cluster_pool_size: Option, share_mem_path: Option, + ssd_storage: Option, etcd_mode: Option, } @@ -642,6 +647,10 @@ impl KvTestClientOptions { .share_mem_path .clone() .or_else(|| self.share_mem_path.clone()), + ssd_storage: overrides + .ssd_storage + .clone() + .or_else(|| self.ssd_storage.clone()), etcd_mode: overrides .etcd_mode .clone() @@ -650,6 +659,40 @@ impl KvTestClientOptions { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum KvTestStorageProfile { + Memory, + Ssd, + MemorySsd, +} + +impl KvTestStorageProfile { + fn round_suffix(self) -> &'static str { + match self { + Self::Memory => "", + Self::Ssd => "_ssd", + Self::MemorySsd => "_memory_ssd", + } + } + + fn ssd_storage(self) -> Option { + match self { + Self::Memory => None, + Self::Ssd | Self::MemorySsd => Some(KvSsdStorageConfig { + max_bytes: KV_TEST_SSD_STORAGE_BYTES, + }), + } + } + + fn requires_memory_source(self) -> bool { + matches!(self, Self::Memory | Self::MemorySsd) + } + + fn requires_ssd_source(self) -> bool { + matches!(self, 
Self::Ssd | Self::MemorySsd) + } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum KvTestRoundProfile { P2pOnly, @@ -760,6 +803,7 @@ fn kv_test_round_test_spec_config(round_profile: KvTestRoundProfile) -> TestSpec #[derive(Clone, Debug)] struct KvTestRoundOptions { round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, round_name: String, cluster_name: String, master_port: Option, @@ -803,6 +847,9 @@ impl KvTestRoundOptions { ) } + fn owner_sub_cluster(&self) -> String { + format!("{}_owners", self.round_name) + } } #[derive(Clone, Debug)] @@ -842,8 +889,7 @@ fn default_client_large_file_paths( instance_key: &str, contribute_to_cluster_pool_size: &ContributeToClusterPoolSize, ) -> LargeFilePaths { - if contribute_to_cluster_pool_size.dram == 0 - && contribute_to_cluster_pool_size.vram.is_empty() + if contribute_to_cluster_pool_size.dram == 0 && contribute_to_cluster_pool_size.vram.is_empty() { return LargeFilePaths { paths: Vec::new() }; } @@ -852,7 +898,10 @@ fn default_client_large_file_paths( } } -fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTestClientOptions { +fn default_owner_test_client_options( + round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, +) -> KvTestClientOptions { KvTestClientOptions { protocol_config: Some(round_profile.protocol_config()), transfer_engine: Some(round_profile.owner_transfer_engine()), @@ -861,6 +910,7 @@ fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTes enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), contribute_to_cluster_pool_size: Some(default_owner_contribute_to_cluster_pool_size()), share_mem_path: None, + ssd_storage: storage_profile.ssd_storage(), etcd_mode: Some(KvTestEtcdMode::Enabled), } } @@ -874,6 +924,7 @@ fn default_master_test_client_options(round_profile: KvTestRoundProfile) -> KvTe enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), 
contribute_to_cluster_pool_size: None, share_mem_path: None, + ssd_storage: None, etcd_mode: None, } } @@ -887,22 +938,31 @@ fn default_external_test_client_options() -> KvTestClientOptions { enable_transfer_rpc_fast_path: Some(false), contribute_to_cluster_pool_size: Some(default_external_contribute_to_cluster_pool_size()), share_mem_path: None, + ssd_storage: None, etcd_mode: Some(KvTestEtcdMode::Disabled), } } -fn new_kv_test_round(round_profile: KvTestRoundProfile) -> KvTestRoundOptions { - let round_name = round_profile.round_name(); +fn new_kv_test_round( + round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, +) -> KvTestRoundOptions { + let round_name = format!( + "{}{}", + round_profile.round_name(), + storage_profile.round_suffix() + ); KvTestRoundOptions { round_profile, - round_name: round_name.to_string(), + storage_profile, + round_name: round_name.clone(), // Keep each process run on its own cluster namespace so a crashed/aborted previous run // cannot poison the next rerun with stale members. 
cluster_name: format!("test_cluster_{}_{}", round_name, kv_test_run_scope()), master_port: None, step8_master_port: None, master_options: default_master_test_client_options(round_profile), - owner_client_options: default_owner_test_client_options(round_profile), + owner_client_options: default_owner_test_client_options(round_profile, storage_profile), external_client_options: default_external_test_client_options(), } } @@ -919,15 +979,35 @@ fn default_kv_test_run_options() -> KvTestRunOptions { .filter(|item| !item.is_empty()) { let profile = match round_name { - "p2p_only" => KvTestRoundProfile::P2pOnly, + "p2p_only" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::Memory, + )); + continue; + } + "p2p_only_ssd" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::Ssd, + )); + continue; + } + "p2p_only_memory_ssd" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::MemorySsd, + )); + continue; + } "rdma_transfer_only" => KvTestRoundProfile::RdmaTransferOnly, "rdma_transfer_with_rpc" => KvTestRoundProfile::RdmaTransferWithRpc, other => panic!( - "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, rdma_transfer_only, rdma_transfer_with_rpc", + "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, p2p_only_ssd, p2p_only_memory_ssd, rdma_transfer_only, rdma_transfer_with_rpc", other ), }; - rounds.push(new_kv_test_round(profile)); + rounds.push(new_kv_test_round(profile, KvTestStorageProfile::Memory)); } if rounds.is_empty() { panic!("FLUXON_KV_TEST_ROUNDS was set but produced no valid rounds"); @@ -937,9 +1017,17 @@ fn default_kv_test_run_options() -> KvTestRunOptions { KvTestRunOptions { rounds: vec![ - new_kv_test_round(KvTestRoundProfile::P2pOnly), - new_kv_test_round(KvTestRoundProfile::RdmaTransferOnly), - new_kv_test_round(KvTestRoundProfile::RdmaTransferWithRpc), + 
new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Memory), + new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Ssd), + new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::MemorySsd), + new_kv_test_round( + KvTestRoundProfile::RdmaTransferOnly, + KvTestStorageProfile::Memory, + ), + new_kv_test_round( + KvTestRoundProfile::RdmaTransferWithRpc, + KvTestStorageProfile::Memory, + ), ], } } @@ -1022,6 +1110,8 @@ fn build_client_launch( let contribute_to_cluster_pool_size = options .contribute_to_cluster_pool_size .unwrap_or(default_owner_contribute_to_cluster_pool_size()); + let is_external = contribute_to_cluster_pool_size.dram == 0 + && contribute_to_cluster_pool_size.vram.is_empty(); let share_mem_path = options .share_mem_path .unwrap_or_else(|| format!("/tmp/kvcache_shared_memory/{}", instance_key)); @@ -1043,7 +1133,11 @@ fn build_client_launch( enable_transfer_rpc_fast_path: options .enable_transfer_rpc_fast_path .expect("kv_test requires enable_transfer_rpc_fast_path to be set explicitly"), - sub_cluster: None, + sub_cluster: if is_external { + None + } else { + Some(round.owner_sub_cluster()) + }, }, // English note: // kv_test uses a per-instance shared memory path by default so each owner/external share @@ -1054,6 +1148,7 @@ fn build_client_launch( &instance_key, &contribute_to_cluster_pool_size, ), + ssd_storage: options.ssd_storage, // Mirror round intent into the generated config so logs and runtime behavior // agree on whether this launch is transfer_only vs transfer_with_rpc. 
/// Builds a `len`-byte probe payload by repeating `kv_test_storage_profile:{tag}:`.
///
/// The last repetition is cut so the result is exactly `len` bytes; `len == 0` yields an
/// empty vector.
fn build_storage_profile_probe_value_with_len(tag: &str, len: usize) -> Vec<u8> {
    // The pattern always contains the fixed prefix, so cycling over it never yields an empty
    // sequence. `take(len)` produces exactly the requested size without overshooting it.
    let pattern = format!("kv_test_storage_profile:{tag}:");
    pattern.bytes().cycle().take(len).collect()
}
if Instant::now() >= deadline { + panic!( + "storage profile probe expected at least one SSD replica before memory eviction: key={} timeout={}s", + key, KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS + ); + } + sleep(Duration::from_millis(50)).await; + }; + + for node_id in memory_replica_nodes { + crate::master_kv_router::delete::evict_one_kv_replica_for_node( + &master_view, + key.to_string(), + node_id.clone(), + put_id, + ) + .unwrap_or_else(|code| { + panic!( + "storage profile probe failed to evict memory replica: key={} node={} put_id=({},{}) code={}", + key, node_id, put_id.0, put_id.1, code + ) + }); + } + + let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) else { + panic!("storage profile probe route disappeared after memory replicas eviction: key={key}"); + }; + assert!( + route.nodes_replicas.read().is_empty(), + "storage profile probe memory replicas still exist after eviction: key={}", + key + ); + assert!( + !route.ssd_replicas.read().is_empty(), + "storage profile probe SSD replica disappeared after memory replicas eviction: key={}", + key + ); +} + +async fn assert_owner_get_source_kind( + reader_framework: &crate::Framework, + key: &str, + expected_value: &[u8], + expected_source_kind: GetSourceKind, +) { + let reader_view = reader_framework.client_kv_api_view().clone(); + let reader_api = reader_view.client_kv_api(); + let (mem_holder, get_info) = reader_api + .inner() + .get(key) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile probe get failed: key={} expected_source={:?} err={}", + key, expected_source_kind, err + ) + }) + .unwrap_or_else(|| { + panic!( + "storage profile probe get returned None: key={} expected_source={:?}", + key, expected_source_kind + ) + }); + assert_eq!( + mem_holder.bytes(), + expected_value, + "storage profile probe value mismatch for key={key}" + ); + let Some(get_info) = get_info else { + panic!( + "storage profile probe expected remote get info for key={} source={:?}", + key, 
expected_source_kind + ); + }; + assert_eq!( + get_info.source_kind(), + expected_source_kind, + "storage profile probe source kind mismatch for key={key}" + ); +} + +async fn run_non_rdma_storage_profile_coverage( + round: &KvTestRoundOptions, + master_framework: &crate::Framework, + writer_framework: &crate::Framework, +) -> Option> { + if round.round_profile != KvTestRoundProfile::P2pOnly { + return None; + } + + info!( + "📋 Storage profile coverage: round={} storage={:?}", + round.round_name, round.storage_profile + ); + + let writer_view = writer_framework.client_kv_api_view().clone(); + let writer_api = writer_view.client_kv_api(); + let storage_probe_put_opts = || { + crate::client_kv_api::PutOptionalArgs(vec![ + crate::client_kv_api::PutOptionalArg::PreferredSubCluster(round.owner_sub_cluster()), + ]) + }; + + let memory_key = format!("storage_profile_memory_key_{}", round.round_name); + let memory_value = build_storage_profile_probe_value(&format!("{}:memory", round.round_name)); + if round.storage_profile.requires_memory_source() { + writer_api + .inner() + .put(&memory_key, &memory_value, storage_probe_put_opts()) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile memory probe put failed: key={} err={}", + memory_key, err + ) + }); + } + + let ssd_key = format!("storage_profile_ssd_key_{}", round.round_name); + let ssd_value = build_storage_profile_probe_value_with_len( + &format!("{}:ssd", round.round_name), + 64 * 1024 + 123, + ); + if round.storage_profile.requires_ssd_source() { + writer_api + .inner() + .put(&ssd_key, &ssd_value, storage_probe_put_opts()) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile SSD probe put failed: key={} err={}", + ssd_key, err + ) + }); + force_evict_memory_replicas_for_storage_probe(master_framework, &ssd_key).await; + } + + let reader_launch = new_client_launch(round, "test_storage_profile_reader", None); + let (reader_framework, _) = run_kv_test_client(reader_launch) + .await + 
.expect("Failed to start storage profile reader"); + + sleep(Duration::from_secs(10)).await; + + if round.storage_profile.requires_memory_source() { + assert_owner_get_source_kind( + &reader_framework, + &memory_key, + &memory_value, + GetSourceKind::Memory, + ) + .await; + } + if round.storage_profile.requires_ssd_source() { + assert_owner_get_source_kind(&reader_framework, &ssd_key, &ssd_value, GetSourceKind::Ssd) + .await; + } + + info!( + "✅ Storage profile coverage passed: round={} storage={:?}", + round.round_name, round.storage_profile + ); + Some(reader_framework) +} + async fn run_kv_step8(round: &KvTestRoundOptions) { info!("📋 Step 8: Verifying external client blocking and recovery behavior"); @@ -2720,6 +3022,9 @@ async fn run_kv_round(round: &KvTestRoundOptions) { info!("✅ Key meta cache testing completed"); } + let storage_profile_reader_framework = + run_non_rdma_storage_profile_coverage(round, &master_framework, &client1_framework).await; + // 清理旧资源 { info!("🧹 Cleaning up resources"); @@ -2743,6 +3048,14 @@ async fn run_kv_round(round: &KvTestRoundOptions) { .unwrap_or_else(|e| panic!("Client 1 framework shutdown failed: {}", e)); info!("✅ Client 1 framework shutdown successfully"); + if let Some(storage_profile_reader_framework) = storage_profile_reader_framework { + shutdown_framework_with_timeout( + "storage profile reader", + &storage_profile_reader_framework, + ) + .await; + } + master_framework .shutdown() .await diff --git a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs index c74b64a..43d3c09 100644 --- a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs +++ b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs @@ -148,6 +148,7 @@ fn new_client_config_with_cluster_and_dram( large_file_paths: crate::config::LargeFilePaths { paths: vec![format!("{}/large/{}", base, instance_key)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), }; println!("fluxonkv core created client config for test: {:?}", conf); 
diff --git a/fluxon_rs/fluxon_kv/src/lib.rs b/fluxon_rs/fluxon_kv/src/lib.rs index edaa386..3b1116d 100644 --- a/fluxon_rs/fluxon_kv/src/lib.rs +++ b/fluxon_rs/fluxon_kv/src/lib.rs @@ -7,6 +7,7 @@ pub mod external_client_api; pub mod panel_proxy; // #[cfg(test)] pub mod key_prefix; +pub mod kv_ssd_storage; #[cfg(feature = "test_bins")] pub mod kv_test; pub mod kvlease; @@ -797,6 +798,7 @@ fn build_side_transfer_worker_config( }, share_mem_path: owner_config.share_mem_path.clone(), large_file_paths: owner_config.large_file_paths.clone(), + ssd_storage: None, test_spec_config, }) } @@ -841,6 +843,7 @@ fn build_side_transfer_worker_config_yaml( cluster_name: side_config.cluster_name, share_mem_path: side_config.share_mem_path, large_file_paths: None, + ssd_storage: None, p2p_listen_port: side_config.fluxonkv_spec.p2p_listen_port, redis_compat: None, sub_cluster: None, @@ -1915,6 +1918,9 @@ async fn run_client_impl( if is_side_transfer_worker { metadata.insert("side_transfer_worker".to_string(), "true".to_string()); } + if !is_external && !is_side_transfer_worker && config.ssd_storage.is_some() { + metadata.insert("kv_ssd_storage".to_string(), "true".to_string()); + } // Local IPC routing requires both share-group owner id and the local IPC root. 
// The owner id is also published via a dedicated share-group key; we denormalize it into @@ -2004,6 +2010,20 @@ async fn run_client_impl( .await .map_err(|e| anyhow::anyhow!("Failed to initialize framework: {:#}", e))?; } else { + let ssd_storage = if is_side_transfer_worker { + None + } else if let Some(ssd_cfg) = config.ssd_storage.as_ref() { + let root_dirs = config + .large_file_paths + .kv_ssd_storage_dirs(&config.cluster_name, &config.instance_key) + .map_err(|err| anyhow::anyhow!("invalid kv ssd storage dirs: {}", err))?; + Some(crate::kv_ssd_storage::KvSsdStorageInit { + root_dirs, + max_bytes: ssd_cfg.max_bytes, + }) + } else { + None + }; let init_args = InitArgsOwner { cluster_manager_arg: ClusterManagerNewArg { etcd_endpoints: config.fluxonkv_spec.etcd_addresses.clone(), @@ -2036,6 +2056,7 @@ async fn run_client_impl( }, client_kv_api_arg: ClientKvApiNewArg { test_spec_config: config.test_spec_config.clone(), + ssd_storage, }, client_seg_pool_arg: ClientSegPoolNewArg { contribute_size: config.contribute_to_cluster_pool_size.clone(), @@ -2468,6 +2489,7 @@ mod tests { large_file_paths: crate::config::LargeFilePaths { paths: vec!["/tmp/fluxon_side_transfer_test_large".to_string()], }, + ssd_storage: None, test_spec_config: TestSpecConfig { enable_side_transfer: true, side_transfer_worker_count: 4, @@ -2736,8 +2758,8 @@ mod tests { large_file_paths: crate::config::LargeFilePaths { paths: vec![owner_large_root.to_string_lossy().into_owned()], }, - protocol_version: - fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(), + protocol_version: fluxon_util::git_version_build_record::get_current_git_commitid() + .unwrap(), write_ts: Some(chrono::Utc::now().timestamp_micros()), }; let shared_meta_json = serde_json::to_string(&shared_meta).unwrap(); @@ -2773,6 +2795,7 @@ mod tests { }, share_mem_path: share_mem_root.to_string_lossy().into_owned(), large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, 
test_spec_config: TestSpecConfig::default(), }; diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs index 12a55ee..52ac76e 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs @@ -130,7 +130,7 @@ pub fn evict_one_kv_replica_for_node( return Ok(()); } - let last_replica_gone = route.nodes_replicas.read().is_empty(); + let last_replica_gone = !route.has_live_replica(); if last_replica_gone { let removed = view .master_kv_router() diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs index 8c17155..346df40 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs @@ -2,9 +2,10 @@ use super::{ InflightGetInfo, KvRouteInfo, MasterKvRouterView, NodeValueReplicaDesc, OwnerHoldingGetInfo, msg_pack::{ GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq, - GetRevokeResp, GetStartReq, GetStartResp, + GetRevokeResp, GetSourceKind, GetStartReq, GetStartResp, }, }; +use crate::kv_ssd_storage::{SSD_ALIGNMENT, align_ssd_io_len}; use crate::master_kv_router::OneKvNodesRoutes; use crate::master_kv_router::put::PutIDForAKey; use crate::memholder::MemholderManagerTrait; @@ -82,7 +83,7 @@ pub async fn handle_get_start( let mut remove_in_kv_routes = false; if let Some(one_kv_nodes_routes) = view.master_kv_router().inner().kv_routes.get(key) { one_kv_nodes_routes.clean_up_tomb_nodes_replicas(put_id, tombs, view); - if one_kv_nodes_routes.nodes_replicas.read().is_empty() { + if !one_kv_nodes_routes.has_live_replica() { remove_in_kv_routes = true; } } @@ -113,6 +114,67 @@ pub async fn handle_get_start( }, ) } + fn allocate_get_buffer_on_node( + view: &MasterKvRouterView, + node_id: &NodeID, + len: u64, + get_id: u64, + purpose: &str, + ) -> Result, msg_and_error::KvError> { + let node_allocators = 
view.master_seg_manager().get_node_allocators(node_id); + if node_allocators.is_empty() { + tracing::info!( + "No allocators found for {} during get: {}, node is not ready", + purpose, + node_id + ); + return Err(msg_and_error::KvError::Unreachable( + msg_and_error::UnreachableError::OwnerNoSeg { detail: "config=0 initializes as external; non-zero initializes as owner; the owner must have memory space (segment)".to_string() } + )); + } + + let allocator = node_allocators.choose(&mut rand::thread_rng()).unwrap(); + let mut allocated_addr: Option = None; + for attempt in 1..=3 { + if let Ok(allocation) = allocator.allocate(len) { + allocated_addr = Some(allocation); + break; + } else { + tracing::info!( + "{} allocation attempt {}/3 failed for get_id {} on node {}", + purpose, + attempt, + get_id, + node_id + ); + } + } + if let Some(allocation) = allocated_addr { + return Ok(Arc::new(allocation)); + } + + let total = allocator.total_size_bytes(); + let used = allocator.used_size_bytes(); + let free = total.saturating_sub(used); + Err(msg_and_error::KvError::Api( + msg_and_error::ApiError::NoSpace { + node: node_id.as_ref().to_string(), + segment: allocator.seg_device_id.clone(), + total_capacity: total, + free_capacity: free, + }, + )) + } + fn align_ssd_stage_addr(raw_addr: u64) -> Result { + raw_addr + .checked_add(SSD_ALIGNMENT as u64 - 1) + .map(|addr| addr / SSD_ALIGNMENT as u64 * SSD_ALIGNMENT as u64) + .ok_or_else(|| { + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!("ssd source staging address alignment overflow: {raw_addr}"), + }) + }) + } tracing::debug!("Handling GetStartReq: {:?}", req.serialize_part); @@ -253,11 +315,13 @@ pub async fn handle_get_start( put_id: one_kv_nodes_routes.put_id, get_id, node_id: resp_node_id.clone().into(), + source_kind: GetSourceKind::Memory, src_addr: resp_src_addr, target_addr: resp_target_addr, src_base_addr: resp_src_base, target_base_addr: resp_target_base, len: 
src_allocation.size(), + ssd_stage_len: 0, error_code: msg_and_error::OK, error_json: String::new(), server_process_us: 0, @@ -270,8 +334,10 @@ pub async fn handle_get_start( req_node_id, len: src_allocation.size(), allocation: target_allocation, // 存储target allocation + source_allocation: None, route: one_kv_nodes_routes.clone(), allocation_mode, + source_kind: GetSourceKind::Memory, }; view.master_kv_router() @@ -308,6 +374,167 @@ pub async fn handle_get_start( }, ); } + + let ssd_replicas = one_kv_nodes_routes.ssd_replicas.read().clone(); + let mut ssd_replica_keys = ssd_replicas.keys().collect::>(); + while !ssd_replica_keys.is_empty() { + let to_remove_idx = rand::thread_rng().gen_range(0..ssd_replica_keys.len()); + let selected_ssd_key = ssd_replica_keys.remove(to_remove_idx); + let ssd_replica = ssd_replicas + .get(&*selected_ssd_key) + .expect("selected SSD replica key must exist"); + if ssd_replica.tomb_tag.is_tomb() { + tombs.insert(selected_ssd_key.to_owned()); + } else { + let ssd_stage_len = match align_ssd_io_len(ssd_replica.len) { + Ok(len) => len, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_alloc_len = match ssd_stage_len.checked_add(SSD_ALIGNMENT as u64 - 1) { + Some(len) => len, + None => { + let err = + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!( + "ssd source staging allocation length overflow: {ssd_stage_len}" + ), + }); + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_allocation = match allocate_get_buffer_on_node( + &view, + &ssd_replica.node_id, + source_alloc_len, + get_id, + "ssd source staging", + ) { + Ok(allocation) => allocation, + Err(err) => { + tracing::info!( + "Skipping SSD source for get_id {} on node {}: {}", + get_id, + ssd_replica.node_id, + err + ); + continue; + } + }; + let 
target_allocation = match allocate_get_buffer_on_node( + &view, + &req_node_id, + ssd_replica.len, + get_id, + "requesting target", + ) { + Ok(allocation) => allocation, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let allocation_mode = if one_kv_nodes_routes.try_reserve_get_durable_slot() { + GetAllocationMode::DurableReplica + } else { + GetAllocationMode::Temporary + }; + let source_base = source_allocation.base_addr(); + let source_raw_addr = match source_base.checked_add(source_allocation.addr()) { + Some(addr) => addr, + None => { + let err = + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!( + "ssd source staging raw address overflow: base={} offset={}", + source_base, + source_allocation.addr() + ), + }); + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_addr = match align_ssd_stage_addr(source_raw_addr) { + Ok(addr) => addr, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let target_base = target_allocation.base_addr(); + let target_addr = target_base + target_allocation.addr(); + let resp = GetStartResp { + put_id: one_kv_nodes_routes.put_id, + get_id, + node_id: ssd_replica.node_id.clone().into(), + source_kind: GetSourceKind::Ssd, + src_addr: source_addr, + target_addr, + src_base_addr: source_base, + target_base_addr: target_base, + len: ssd_replica.len, + ssd_stage_len, + error_code: msg_and_error::OK, + error_json: String::new(), + server_process_us: 0, + }; + let info = InflightGetInfo { + put_id: one_kv_nodes_routes.put_id, + src_node_id: ssd_replica.node_id.clone(), + key: req.serialize_part.key.clone(), + req_node_id, + len: ssd_replica.len, + allocation: target_allocation, + source_allocation: Some(source_allocation), + 
route: one_kv_nodes_routes.clone(), + allocation_mode, + source_kind: GetSourceKind::Ssd, + }; + + view.master_kv_router() + .inner() + .inflight_gets + .insert(get_id, info) + .await; + + clean_up_tombs( + &view, + Some((tombs, one_kv_nodes_routes.put_id)), + &req.serialize_part.key, + ); + return ( + get_id, + MsgPack { + serialize_part: resp, + raw_bytes: Vec::new(), + }, + ); + } + } tracing::info!("Key not found: {}", req.serialize_part.key); { let err = msg_and_error::KvError::Api(msg_and_error::ApiError::KeyNotFound { @@ -322,6 +549,64 @@ pub async fn handle_get_start( } } +fn drop_failed_ssd_source(view: &MasterKvRouterView, inflight_info: &InflightGetInfo) { + if inflight_info.source_kind != GetSourceKind::Ssd { + tracing::warn!( + "Ignoring drop_ssd_source for non-SSD get: get_key={} put_id=({},{}) source_kind={:?}", + inflight_info.key, + inflight_info.put_id.0, + inflight_info.put_id.1, + inflight_info.source_kind + ); + return; + } + + let route = inflight_info.route.clone(); + if route.put_id != inflight_info.put_id { + return; + } + + let removed = route + .ssd_replicas + .write() + .remove(&inflight_info.src_node_id) + .is_some(); + if !removed { + return; + } + + tracing::warn!( + "Removed failed SSD replica: key={} node={} put_id=({},{})", + inflight_info.key, + inflight_info.src_node_id, + inflight_info.put_id.0, + inflight_info.put_id.1 + ); + + if route.has_live_replica() { + return; + } + + let route_for_compare = route.clone(); + let removed_route = view + .master_kv_router() + .inner() + .kv_routes + .remove_if(&inflight_info.key, |_, current| { + Arc::ptr_eq(current, &route_for_compare) && current.put_id == inflight_info.put_id + }) + .is_some(); + if removed_route && view.master_kv_router().prefix_index_enabled() { + let view_task = view.clone(); + let key_for_prefix = inflight_info.key.clone(); + let _ = view.spawn("ssd_failure_remove_prefix_index", async move { + let inner = view_task.master_kv_router().inner(); + let mut tree = 
inner.prefix_index.write().await; + tree.remove(&key_for_prefix); + }); + } +} + pub async fn handle_get_revoke( view: MasterKvRouterView, req: MsgPack, @@ -338,6 +623,9 @@ pub async fn handle_get_revoke( .remove(&get_id) .await { + if req.serialize_part.drop_ssd_source { + drop_failed_ssd_source(&view, &inflight_info); + } inflight_info.release_durable_slot_if_needed(); tracing::info!("Revoked get operation with get_id: {}", get_id); } else { @@ -381,7 +669,6 @@ pub async fn handle_get_done( .next_holder_id .fetch_add(1, Ordering::Relaxed); - let src_node_id = inflight_info.src_node_id; let key = inflight_info.key; // Create holding info @@ -404,7 +691,7 @@ pub async fn handle_get_done( if one_kv_nodes_routes.put_id == inflight_info.put_id { let mut nodes_replicas = one_kv_nodes_routes.nodes_replicas.write(); if let Some(tomb_tag) = - view.master_seg_manager().get_node_tomb_tag(&src_node_id) + view.master_seg_manager().get_node_tomb_tag(&req_node_id) { if !tomb_tag.is_tomb() { nodes_replicas.insert( @@ -632,6 +919,21 @@ pub async fn handle_get_meta( raw_bytes: Vec::new(), }; } + let ssd_replicas = (*one_kv_nodes_routes.ssd_replicas.read()).clone(); + for (_, kv_info) in ssd_replicas.iter() { + if kv_info.tomb_tag.is_tomb() { + continue; + } + return MsgPack { + serialize_part: GetMetaResp { + exists: true, + len: kv_info.len, + error_code: msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + }; + } // if let Some((_, kv_info)) = replicas.iter().next() { // let len = kv_info.allocation.size(); diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs index ee4ca2b..afbfc41 100644 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs @@ -15,13 +15,14 @@ use self::{ msg_pack::{ BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, CountPrefixReq, CountPrefixResp, DeleteAckReq, DeleteReq, GetAllocationMode, GetDoneReq, GetMetaReq, 
GetRevokeReq, - GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, + GetSourceKind, GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, SsdReplicaCommitReq, }, placement::{PlacementDefault, PlacementPolicy}, - put::{handle_put_done, handle_put_revoke, handle_put_start}, + put::{handle_put_done, handle_put_revoke, handle_put_start, handle_ssd_replica_commit}, }; use crate::ClientKvApiAccessTrait; use crate::client_kv_api::ClientKvApi; +use crate::client_kv_api::msg_pack::SsdReplicaPersistReq; use crate::cluster_manager::{ ClusterEvent, ClusterManager, ClusterManagerAccessTrait, NodeID, NodeIDString, }; @@ -116,8 +117,10 @@ pub struct InflightGetInfo { pub req_node_id: NodeID, pub len: u64, pub allocation: Arc, + pub source_allocation: Option>, pub route: Arc, pub allocation_mode: GetAllocationMode, + pub source_kind: GetSourceKind, } impl InflightGetInfo { @@ -201,6 +204,13 @@ pub struct KvRouteInfo { pub tomb_tag: NodeTombTag, } +#[derive(Clone, Debug)] +pub struct KvSsdRouteInfo { + pub node_id: NodeID, + pub len: u64, + pub tomb_tag: NodeTombTag, +} + #[derive(Debug)] pub struct OneKvNodesRoutes { /// the version id for a kv put operation @@ -230,6 +240,8 @@ pub struct OneKvNodesRoutes { /// node_id -> KvRouteInfo pub nodes_replicas: RwLock>, + /// node_id -> SSD replica metadata for the same key-version. 
+ pub ssd_replicas: RwLock>, pub get_durable_slots_used: AtomicU32, } @@ -247,9 +259,16 @@ impl OneKvNodesRoutes { let mut nodes_replicas = self.nodes_replicas.write(); nodes_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id)); + let mut ssd_replicas = self.ssd_replicas.write(); + ssd_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id)); + return true; } + fn has_live_replica(&self) -> bool { + !self.nodes_replicas.read().is_empty() || !self.ssd_replicas.read().is_empty() + } + fn try_reserve_get_durable_slot(&self) -> bool { self.get_durable_slots_used .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { @@ -283,6 +302,7 @@ mod tests { put_id: (1, 0), lease_id: None, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }; @@ -607,6 +627,7 @@ impl MasterKvRouter { fn register_rpc_callers(&self) { RPCCaller::::new().regist(self.0.view().p2p_module()); + RPCCaller::::new().regist(self.0.view().p2p_module()); } fn register_rpc_handlers(&self) { @@ -766,6 +787,22 @@ impl MasterKvRouter { Ok(()) }); + let view = self.0.view().clone(); + RPCHandler::::new().regist(p2p, move |resp, msg| { + let view = view.clone(); + let view2 = view.clone(); + let view_task = view2.clone(); + let _ = view.spawn("rpc_ssd_replica_commit", async move { + let t0 = Utc::now().timestamp_micros(); + let mut ack = handle_ssd_replica_commit(view_task, msg).await; + ack.serialize_part.server_process_us = Utc::now().timestamp_micros() - t0; + if let Err(e) = resp.send_resp(ack).await { + error!("Failed to send SsdReplicaCommitResp: {:?}", e); + } + }); + Ok(()) + }); + // --- MemHolder Handlers --- // let view = inner.view.clone(); // RPCHandler::::new().regist(p2p, move |resp, msg| { diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs index 9d5eb1d..bdd85b6 100755 --- 
a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs @@ -18,6 +18,13 @@ pub enum GetAllocationMode { DurableReplica = 2, } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Encode, Decode)] +pub enum GetSourceKind { + #[default] + Memory = 0, + Ssd = 1, +} + #[derive(Default, Debug, Clone, Encode, Decode)] pub struct GetStartReq { pub key: String, @@ -32,6 +39,7 @@ pub struct GetStartResp { pub get_id: u64, pub node_id: NodeIDString, pub put_id: PutIDForAKey, + pub source_kind: GetSourceKind, // absolute addresses because Mooncake transfer engine requires absolute addresses (not offsets) pub target_addr: u64, pub src_addr: u64, @@ -39,6 +47,8 @@ pub struct GetStartResp { pub target_base_addr: u64, pub src_base_addr: u64, pub len: u64, + /// SSD source staging bytes available at src_addr. Zero for memory sources. + pub ssd_stage_len: u64, pub error_code: ErrorCode, pub error_json: String, /// Server-side processing time in microseconds for this RPC handler @@ -56,6 +66,8 @@ impl RPCReq for GetStartReq { #[derive(Default, Debug, Clone, Encode, Decode)] pub struct GetRevokeReq { pub get_id: u64, + /// True only when an SSD stage failed and the source must be removed from routing. 
+ pub drop_ssd_source: bool, } impl MsgPackSerializePart for GetRevokeReq { fn msg_id(&self) -> u32 { @@ -250,6 +262,34 @@ impl RPCReq for PutDoneReq { type Resp = PutDoneResp; } +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} +impl MsgPackSerializePart for SsdReplicaCommitReq { + fn msg_id(&self) -> u32 { + MsgId::SsdReplicaCommitReq as u32 + } +} +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaCommitResp { + pub error_code: ErrorCode, + pub error_json: String, + /// Server-side processing time in microseconds for this RPC handler + pub server_process_us: i64, +} +impl MsgPackSerializePart for SsdReplicaCommitResp { + fn msg_id(&self) -> u32 { + MsgId::SsdReplicaCommitResp as u32 + } +} +impl RPCReq for SsdReplicaCommitReq { + type Resp = SsdReplicaCommitResp; +} + // --- RPC for MemHolder KeepAlive --- #[derive(Default, Debug, Clone, Encode, Decode)] diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs index 70d8858..06e41cc 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs @@ -1,15 +1,19 @@ -use super::NodeValueReplicaDesc; use super::{ InflightPutAllocation, InflightPutInfo, KvRouteInfo, MasterKvRouterView, PutPlacementMode, - msg_pack::{PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp}, + msg_pack::{ + PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp, + SsdReplicaCommitReq, SsdReplicaCommitResp, + }, placement::PutPlacementTarget, }; +use super::{KvSsdRouteInfo, NodeValueReplicaDesc}; +use crate::client_kv_api::msg_pack::SsdReplicaPersistReq; use crate::master_kv_router::OneKvNodesRoutes; use crate::master_kv_router::delete::DeleteKeyInfo; use crate::{ cluster_manager::{META_KEY_LOCAL_IPC_ROOT, NodeID}, 
master_seg_manager::one_seg_allocator::Allocation, - p2p::msg_pack::MsgPack, + p2p::msg_pack::{MsgPack, RPCCaller}, rpcresp_kvresult_convert::msg_and_error, }; use fluxon_commu::{META_KEY_SHARED_STORAGE_NODE_ID, META_KEY_SHARED_STORAGE_NODE_START_TIME}; @@ -19,6 +23,7 @@ use rand::seq::SliceRandom; use std::{ collections::HashMap, sync::{Arc, atomic::AtomicU32}, + time::Duration, }; pub type PutIDForAKey = (u64, u32); @@ -474,6 +479,171 @@ pub async fn handle_put_revoke( } } +fn spawn_ssd_replica_persist_request( + view: &MasterKvRouterView, + key: String, + put_id: PutIDForAKey, + node_id: NodeID, + len: u64, + allocation: Arc, +) { + let target_addr = allocation.base_addr() + allocation.addr(); + let view = view.clone(); + let view_task = view.clone(); + let _ = view.spawn("post_put_ssd_replica_persist", async move { + let _allocation_guard = allocation; + let req = MsgPack { + serialize_part: SsdReplicaPersistReq { + key: key.clone(), + put_id, + target_addr, + len, + }, + raw_bytes: Vec::new(), + }; + let resp = RPCCaller::::new() + .call( + view_task.p2p_module(), + node_id.clone(), + req, + Some(Duration::from_secs(60)), + 2, + ) + .await; + match resp { + Ok(resp) => { + if let Err(err) = crate::rpcresp_kvresult_convert::try_from_code( + resp.serialize_part.error_code, + resp.serialize_part.error_json, + ) { + tracing::warn!( + "SSD replica persist failed: key={} put_id=({},{}) node={} err={}", + key, + put_id.0, + put_id.1, + node_id, + err + ); + } else if resp.serialize_part.persisted { + tracing::debug!( + "SSD replica persist completed: key={} put_id=({},{}) node={}", + key, + put_id.0, + put_id.1, + node_id + ); + } else { + tracing::debug!( + "SSD replica persist skipped because owner has no SSD store: key={} put_id=({},{}) node={}", + key, + put_id.0, + put_id.1, + node_id + ); + } + } + Err(err) => { + tracing::warn!( + "SSD replica persist RPC failed: key={} put_id=({},{}) node={} err={:?}", + key, + put_id.0, + put_id.1, + node_id, + err + ); + } 
+ } + }); +} + +fn ok_ssd_replica_commit_resp() -> MsgPack { + MsgPack { + serialize_part: SsdReplicaCommitResp { + error_code: msg_and_error::OK, + error_json: String::new(), + server_process_us: 0, + }, + raw_bytes: Vec::new(), + } +} + +pub async fn handle_ssd_replica_commit( + view: MasterKvRouterView, + req: MsgPack, +) -> MsgPack { + let req = req.serialize_part; + let node_id: NodeID = req.node_id.clone().into(); + let Some(route_ref) = view.master_kv_router().inner().kv_routes.get(&req.key) else { + tracing::debug!( + "Ignoring SSD replica commit for missing key: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + }; + let route = route_ref.value().clone(); + drop(route_ref); + + if route.put_id != req.put_id { + tracing::debug!( + "Ignoring stale SSD replica commit: key={} req_put_id=({},{}) current_put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + route.put_id.0, + route.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + } + + let tomb_tag = { + let replicas = route.nodes_replicas.read(); + let Some(memory_replica) = replicas.get(&node_id) else { + tracing::debug!( + "Ignoring SSD replica commit without matching memory replica: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + }; + memory_replica.tomb_tag.clone() + }; + + if tomb_tag.is_tomb() { + tracing::debug!( + "Ignoring SSD replica commit for tombed node: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + } + + route.ssd_replicas.write().insert( + node_id.clone(), + KvSsdRouteInfo { + node_id, + len: req.len, + tomb_tag, + }, + ); + tracing::debug!( + "Committed SSD replica route: key={} put_id=({},{}) node={} len={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id, + req.len + ); + 
ok_ssd_replica_commit_resp() +} + pub async fn handle_put_done( view: MasterKvRouterView, req: MsgPack, @@ -488,6 +658,7 @@ pub async fn handle_put_done( if let Some(InflightPutInfo { node_id, key, + len, src_target_allocation, .. }) = view @@ -631,8 +802,9 @@ pub async fn handle_put_done( let completed_info = KvRouteInfo { node_id: node_id.clone(), allocation: Arc::new(target_allocation), - tomb_tag, + tomb_tag: tomb_tag.clone(), }; + let target_allocation_for_ssd = Arc::clone(&completed_info.allocation); // Insert into kv_routes with replica support let mut old_one_kv_routes: Option> = None; @@ -649,6 +821,7 @@ pub async fn handle_put_done( put_id, lease_id: lease_id_opt, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }) }); @@ -659,6 +832,7 @@ pub async fn handle_put_done( put_id, lease_id: lease_id_opt, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }); } @@ -668,6 +842,15 @@ pub async fn handle_put_done( .insert(node_id.clone(), completed_info); } + spawn_ssd_replica_persist_request( + &view, + key.clone(), + put_id, + node_id.clone(), + len, + target_allocation_for_ssd, + ); + if let Some(old) = old_one_kv_routes { if let Err(err) = view .master_kv_router() diff --git a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs index 5c20cc1..5d344c9 100755 --- a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs +++ b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs @@ -22,7 +22,8 @@ async fn test1_lease_expire_removes_keys() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t1", "lease_client_t1").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t1", "lease_client_t1").await; 
let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -82,7 +83,8 @@ async fn test2_rebind_to_new_lease_preserves_until_new_expire() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t2", "lease_client_t2").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t2", "lease_client_t2").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -161,7 +163,8 @@ async fn test3_keepalive() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t3", "lease_client_t3").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t3", "lease_client_t3").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -236,7 +239,8 @@ async fn test4_delete_under_lease_then_get_fails() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t4", "lease_client_t4").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t4", "lease_client_t4").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; diff --git a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs index 692a9a0..cfd6d55 100644 --- a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs +++ b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs @@ -101,6 +101,7 @@ fn new_client_config_with_size( large_file_paths: crate::config::LargeFilePaths { paths: vec![format!("/tmp/kvcache_large/{}", instance_key)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } @@ -134,6 +135,7 @@ fn new_zero_contribution_client_config( }, share_mem_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key), large_file_paths: 
crate::config::LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs index 42a9cbc..def8b1c 100644 --- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs +++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs @@ -35,6 +35,8 @@ pub enum MsgId { DeleteAckResp = 3024, BatchDeleteAckReq = 3029, BatchDeleteAckResp = 3030, + SsdReplicaCommitReq = 3031, + SsdReplicaCommitResp = 3032, GetMetaReq = 3019, GetMetaResp = 3020, BatchDeleteClientKvMetaCacheReq = 3021, diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs index b6eb7d6..a5a18b4 100755 --- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs +++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs @@ -3,11 +3,12 @@ use super::msg_and_error::{ErrorCode, KvError, KvResult}; use crate::client_kv_api::msg_pack::{ ExternalDeleteAckResp, ExternalDeleteResp, ExternalGetResp, ExternalIsExistResp, ExternalPutCommitResp, ExternalPutRevokeResp, ExternalPutStartResp, ExternalPutTransferEndResp, + SsdReplicaPersistResp, SsdStageReadResp, }; use crate::master_kv_router::msg_pack::{ BatchDeleteAckResp, BatchDeleteClientKvMetaCacheResp, DeleteAckResp, DeleteResp, GetDoneResp, GetMasterOnlyMetricPartResp, GetMetaResp, GetRevokeResp, GetStartResp, MemHolderKeepAliveResp, - MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, + MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, SsdReplicaCommitResp, }; use crate::master_seg_manager::msg_pack::RequestSegmentRegistrationResp; use crate::memholder::ExternalMemHolderInfo; @@ -232,6 +233,26 @@ impl FromError for ExternalDeleteAckResp { } } } +impl FromError for 
SsdStageReadResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} +impl FromError for SsdReplicaPersistResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} // ---- FromError for Master KV Router Resps ---- impl FromError for GetStartResp { @@ -294,6 +315,16 @@ impl FromError for PutDoneResp { } } } +impl FromError for SsdReplicaCommitResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} impl FromError for MemHolderKeepAliveResp { fn from_error(e: &KvError) -> Self { let code = e.code();