diff --git a/README.md b/README.md
index 2df13d00..bdf76c2a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
   <img src="docs/imgs/assets/logo.png" alt="DLSlime logo" width="44%">
 </div>
 <p align="center">
-  <a href="https://deeplink-org.github.io/DLSlime">Docs</a> |
+  <a href="https://deeplink-org.github.io/DLSlime"><img src="docs/imgs/assets/docs.png" width="16" height="16" style="vertical-align: middle;"> Docs </a> |
   <a href="docs/roadmap.md"><img src="docs/imgs/assets/roadmap.svg" width="16" height="16" style="vertical-align: middle;"> Roadmap </a> |
   <a href="https://join.slack.com/t/dlslime/shared_invite/zt-3e9zvercw-a89KI_Ig8N1UTaol_q6MXg"><img src="docs/imgs/assets/slack.svg" width="16" height="16" style="vertical-align: middle;"> Slack </a> |
   <a href="docs/imgs/assets/wechat_qrcode.jpg"><img src="docs/imgs/assets/wechat.svg" width="16" height="16" style="vertical-align: middle;"> WeChat Group </a> |
diff --git a/README_zh.md b/README_zh.md
index 7d8bd36c..4794a028 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -2,7 +2,7 @@
   <img src="docs/imgs/assets/logo.png" alt="DLSlime logo" width="44%">
 </div>
 <p align="center">
-  <a href="https://deeplink-org.github.io/DLSlime">文档</a> |
+  <a href="https://deeplink-org.github.io/DLSlime"><img src="docs/imgs/assets/docs.png" width="16" height="16" style="vertical-align: middle;"> 文档 </a> |
   <a href="docs/roadmap.md"><img src="docs/imgs/assets/roadmap.svg" width="16" height="16" style="vertical-align: middle;"> 路线图 </a> |
   <a href="https://join.slack.com/t/dlslime/shared_invite/zt-3e9zvercw-a89KI_Ig8N1UTaol_q6MXg"><img src="docs/imgs/assets/slack.svg" width="16" height="16" style="vertical-align: middle;"> Slack </a> |
   <a href="docs/imgs/assets/wechat_qrcode.jpg"><img src="docs/imgs/assets/wechat.svg" width="16" height="16" style="vertical-align: middle;"> 微信群 </a> |
diff --git a/bench/README.md b/bench/README.md
index d4d459c2..8cdd2f6f 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -12,8 +12,8 @@ so benchmark commands, hardware notes, and result files can evolve together.
 | `python/endpoint_io_bench.py`       | Endpoint-level I/O benchmark                                  |
 | `python/endpoint_sendrecv_bench.py` | Endpoint send/recv benchmark                                  |
 | `python/cache_bench.py`             | DLSlimeCache benchmark                                        |
-| `python/run_rpc_bench.sh`           | SlimeRPC vs Ray benchmark wrapper                             |
-| `python/rpc_bench_*.py`             | SlimeRPC and Ray benchmark implementations                    |
+| `python/run_rpc_bench.sh`           | SlimeRPC vs Ray (optionally + Pulsing) benchmark wrapper      |
+| `python/rpc_bench_*.py`             | SlimeRPC, Ray, and Pulsing benchmark implementations          |
 | `results/`                          | CSV outputs and captured worker logs                          |
 
 ## Prerequisites
@@ -112,12 +112,21 @@ dlslime-cache stop
 ## SlimeRPC vs Ray Benchmark
 
 The RPC benchmark compares SlimeRPC round-trip latency and bandwidth with a Ray
-actor baseline.
+actor baseline. A Pulsing (`@pul.remote`) actor baseline is available as an
+opt-in third comparator.
 
 ```bash
 bash bench/python/run_rpc_bench.sh
 ```
 
+Include the Pulsing baseline (requires `pip install pulsing`):
+
+```bash
+bash bench/python/run_rpc_bench.sh --with-pulsing
+# or
+WITH_PULSING=1 bash bench/python/run_rpc_bench.sh
+```
+
 With explicit parameters:
 
 ```bash
@@ -132,6 +141,7 @@ The script writes:
 ```text
 bench/results/slime_rpc.csv
 bench/results/ray_rpc.csv
+bench/results/pulsing_rpc.csv   # only when --with-pulsing is passed
 ```
 
 See [../docs/benchmark-rpc.md](../docs/benchmark-rpc.md) for the full RPC
diff --git a/bench/python/rpc_bench_compare.py b/bench/python/rpc_bench_compare.py
index f9ffdc59..2d404caf 100644
--- a/bench/python/rpc_bench_compare.py
+++ b/bench/python/rpc_bench_compare.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
-"""Print a side-by-side comparison of SlimeRPC vs Ray benchmark CSVs.
+"""Print a side-by-side comparison of SlimeRPC, Ray (and optionally Pulsing) benchmark CSVs.
 
 Usage:
-    python rpc_bench_compare.py [--slime results/slime_rpc.csv] [--ray results/ray_rpc.csv]
+    python rpc_bench_compare.py \\
+        [--slime results/slime_rpc.csv] \\
+        [--ray results/ray_rpc.csv] \\
+        [--pulsing results/pulsing_rpc.csv]   # optional
 """
 
 import argparse
@@ -22,10 +25,15 @@ def label(size: int) -> str:
 def main():
     default_dir = os.path.join(os.path.dirname(__file__), "..", "results")
     parser = argparse.ArgumentParser(
-        description="Compare SlimeRPC vs Ray benchmark results"
+        description="Compare SlimeRPC vs Ray (and optionally Pulsing) benchmark results"
     )
     parser.add_argument("--slime", default=os.path.join(default_dir, "slime_rpc.csv"))
     parser.add_argument("--ray", default=os.path.join(default_dir, "ray_rpc.csv"))
+    parser.add_argument(
+        "--pulsing",
+        default=None,
+        help="Optional Pulsing CSV; if omitted, Pulsing columns are skipped.",
+    )
     args = parser.parse_args()
 
     for path in (args.slime, args.ray):
@@ -37,25 +45,43 @@ def main():
     slime = load(args.slime)
     ray_r = load(args.ray)
 
-    common = sorted(set(slime) & set(ray_r))
+    pulsing = None
+    if args.pulsing:
+        if not os.path.exists(args.pulsing):
+            print(f"Missing Pulsing results file: {args.pulsing}")
+            print("Run rpc_bench_pulsing.py first, or omit --pulsing.")
+            return
+        pulsing = load(args.pulsing)
+
+    common = set(slime) & set(ray_r)
+    if pulsing is not None:
+        common &= set(pulsing)
+    common = sorted(common)
     if not common:
-        print("No overlapping sizes between the two result files.")
+        print("No overlapping sizes between the result files.")
         return
 
     col = 12
-    sep = "-" * (
-        10 + 1 + col + 1 + col + 1 + col + 1 + col + 1 + col + 1 + col + 1 + 12
-    )
-
     print(
         "\n┌─ Avg Latency (µs) ─────────────────────────────────────────────────────────┐"
     )
-    header = (
-        f"{'Size':<10} | "
-        f"{'Slime avg':>{col}} | {'Slime p99':>{col}} | {'Slime BW':>{col}} | "
-        f"{'Ray avg':>{col}} | {'Ray p99':>{col}} | {'Ray BW':>{col}} | "
-        f"{'Speedup':>10}"
-    )
+
+    if pulsing is not None:
+        header = (
+            f"{'Size':<10} | "
+            f"{'Slime avg':>{col}} | {'Slime p99':>{col}} | {'Slime BW':>{col}} | "
+            f"{'Puls avg':>{col}} | {'Puls p99':>{col}} | {'Puls BW':>{col}} | "
+            f"{'Ray avg':>{col}} | {'Ray p99':>{col}} | {'Ray BW':>{col}} | "
+            f"{'S/Pul':>10} | {'S/Ray':>10}"
+        )
+    else:
+        header = (
+            f"{'Size':<10} | "
+            f"{'Slime avg':>{col}} | {'Slime p99':>{col}} | {'Slime BW':>{col}} | "
+            f"{'Ray avg':>{col}} | {'Ray p99':>{col}} | {'Ray BW':>{col}} | "
+            f"{'S/Ray':>10}"
+        )
+    sep = "-" * len(header)
     print(header)
     print(sep)
 
@@ -66,17 +92,36 @@ def main():
         ray_avg = float(ray_r[size]["avg_us"])
         ray_p99 = float(ray_r[size]["p99_us"])
         ray_bw = float(ray_r[size]["bw_gbps"])
-        speedup = ray_avg / sl_avg  # >1 means SlimeRPC is faster
+        sp_ray = ray_avg / sl_avg  # >1 means SlimeRPC is faster than Ray
+
+        if pulsing is not None:
+            pu_avg = float(pulsing[size]["avg_us"])
+            pu_p99 = float(pulsing[size]["p99_us"])
+            pu_bw = float(pulsing[size]["bw_gbps"])
+            sp_pul = pu_avg / sl_avg  # >1 means SlimeRPC is faster than Pulsing
+            print(
+                f"{label(size):<10} | "
+                f"{sl_avg:>{col}.1f} | {sl_p99:>{col}.1f} | {sl_bw:>{col}.3f} | "
+                f"{pu_avg:>{col}.1f} | {pu_p99:>{col}.1f} | {pu_bw:>{col}.3f} | "
+                f"{ray_avg:>{col}.1f} | {ray_p99:>{col}.1f} | {ray_bw:>{col}.3f} | "
+                f"{'%.2fx' % sp_pul:>10} | {'%.2fx' % sp_ray:>10}"
+            )
+        else:
+            print(
+                f"{label(size):<10} | "
+                f"{sl_avg:>{col}.1f} | {sl_p99:>{col}.1f} | {sl_bw:>{col}.3f} | "
+                f"{ray_avg:>{col}.1f} | {ray_p99:>{col}.1f} | {ray_bw:>{col}.3f} | "
+                f"{'%.2fx' % sp_ray:>10}"
+            )
 
+    print(sep)
+    if pulsing is not None:
         print(
-            f"{label(size):<10} | "
-            f"{sl_avg:>{col}.1f} | {sl_p99:>{col}.1f} | {sl_bw:>{col}.3f} | "
-            f"{ray_avg:>{col}.1f} | {ray_p99:>{col}.1f} | {ray_bw:>{col}.3f} | "
-            f"{'%.2fx' % speedup:>10}"
+            "S/Pul = Pulsing avg latency / SlimeRPC avg latency  (>1 means SlimeRPC wins)"
         )
-
-    print(sep)
-    print("Speedup = Ray avg latency / SlimeRPC avg latency  (>1 means SlimeRPC wins)")
+    print(
+        "S/Ray = Ray avg latency / SlimeRPC avg latency      (>1 means SlimeRPC wins)"
+    )
     print()
 
 
diff --git a/bench/python/rpc_bench_pulsing.py b/bench/python/rpc_bench_pulsing.py
new file mode 100644
index 00000000..69a66b0d
--- /dev/null
+++ b/bench/python/rpc_bench_pulsing.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Pulsing RPC benchmark — same-machine CPU bytes echo.
+
+Measures round-trip latency for bytes echo via a Pulsing actor across a range
+of payload sizes, using identical metrics as rpc_bench_slime_driver.py and
+rpc_bench_ray.py so the three CSVs can be compared directly.
+
+Usage:
+    python rpc_bench_pulsing.py [--out results/pulsing_rpc.csv]
+"""
+
+import argparse
+import asyncio
+import csv
+import os
+import time
+
+import pulsing as pul
+
+
+@pul.remote
+class EchoActor:
+    def echo(self, data: bytes) -> bytes:
+        return data
+
+
+SIZES = [
+    1 * 1024,
+    4 * 1024,
+    16 * 1024,
+    64 * 1024,
+    256 * 1024,
+    1 * 1024 * 1024,
+    4 * 1024 * 1024,
+    16 * 1024 * 1024,
+    64 * 1024 * 1024,
+]
+
+
+async def _benchmark(actor, data: bytes, warmup: int, iterations: int) -> dict:
+    for _ in range(warmup):
+        await actor.echo(data)
+
+    latencies_us = []
+    for _ in range(iterations):
+        t0 = time.perf_counter()
+        await actor.echo(data)
+        latencies_us.append((time.perf_counter() - t0) * 1e6)
+
+    latencies_us.sort()
+    n = len(latencies_us)
+    avg = sum(latencies_us) / n
+    return {
+        "avg_us": avg,
+        "p50_us": latencies_us[n // 2],
+        "p99_us": latencies_us[int(n * 0.99)],
+        "bw_gbps": (2 * len(data)) / 1e9 / (avg / 1e6),
+    }
+
+
+def _label(size: int) -> str:
+    return f"{size // 1024}KB" if size < 1024 * 1024 else f"{size // (1024 * 1024)}MB"
+
+
+async def _run(out_path: str):
+    await pul.init()
+    try:
+        actor = await EchoActor.spawn()
+
+        header = (
+            f"{'Size':<10} | {'Avg (µs)':<12} | {'P50 (µs)':<12} "
+            f"| {'P99 (µs)':<12} | {'BW (GB/s)':<10}"
+        )
+        print(header)
+        print("-" * len(header))
+
+        records = []
+        for size in SIZES:
+            data = bytes(size)
+            iters = max(50, min(500, 50_000_000 // size))
+            r = await _benchmark(actor, data, warmup=20, iterations=iters)
+            records.append({"size": size, **r})
+            print(
+                f"{_label(size):<10} | {r['avg_us']:<12.1f} | {r['p50_us']:<12.1f} "
+                f"| {r['p99_us']:<12.1f} | {r['bw_gbps']:<10.3f}"
+            )
+
+        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
+        with open(out_path, "w", newline="") as f:
+            writer = csv.DictWriter(
+                f, fieldnames=["size", "avg_us", "p50_us", "p99_us", "bw_gbps"]
+            )
+            writer.writeheader()
+            writer.writerows(records)
+        print(f"\nResults saved → {out_path}")
+    finally:
+        await pul.shutdown()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Pulsing RPC benchmark")
+    parser.add_argument(
+        "--out",
+        default=os.path.join(
+            os.path.dirname(__file__), "..", "results", "pulsing_rpc.csv"
+        ),
+    )
+    args = parser.parse_args()
+    asyncio.run(_run(args.out))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/python/rpc_bench_slime_driver.py b/bench/python/rpc_bench_slime_driver.py
index 572b917f..240357f1 100644
--- a/bench/python/rpc_bench_slime_driver.py
+++ b/bench/python/rpc_bench_slime_driver.py
@@ -118,9 +118,8 @@ def main():
     driver._rpc_buffer_size = buf_bytes
     driver._rpc_max_inflight = max(1, int(getattr(args, "max_inflight", 4)))
 
-    driver.set_desired_topology(["bench-worker"])
     print("Waiting for worker…")
-    driver.wait_for_peers(["bench-worker"], timeout_sec=120)
+    driver.connect_to("bench-worker", ib_port=1, qp_num=1).wait(timeout=120)
     print("Connected. Starting benchmark.\n")
 
     w = make_proxy(driver, "bench-worker", EchoService)
diff --git a/bench/python/rpc_bench_slime_worker.py b/bench/python/rpc_bench_slime_worker.py
index 7b2efb4c..1221cb05 100644
--- a/bench/python/rpc_bench_slime_worker.py
+++ b/bench/python/rpc_bench_slime_worker.py
@@ -67,9 +67,8 @@ def main():
     # buf_mb * SLIME_RPC_MAX_INFLIGHT bytes of CPU memory needlessly.
     worker._rpc_max_inflight = max(1, int(getattr(args, "max_inflight", 4)))
 
-    worker.set_desired_topology(["bench-driver"])
     print("Worker ready, waiting for driver to connect…")
-    worker.wait_for_peers(["bench-driver"], timeout_sec=120)
+    worker.connect_to("bench-driver", ib_port=1, qp_num=1).wait(timeout=120)
     print("Connected. Serving (Ctrl-C to stop).")
 
     try:
diff --git a/bench/python/run_rpc_bench.sh b/bench/python/run_rpc_bench.sh
index 38edb753..c4bef7d3 100644
--- a/bench/python/run_rpc_bench.sh
+++ b/bench/python/run_rpc_bench.sh
@@ -1,11 +1,12 @@
 #!/usr/bin/env bash
-# run_rpc_bench.sh — run SlimeRPC + Ray benchmarks and print comparison.
+# run_rpc_bench.sh — run SlimeRPC + Ray benchmarks (+ optional Pulsing) and print comparison.
 #
 # Usage:
-#   bash run_rpc_bench.sh [--ctrl http://127.0.0.1:3000] [--buf-mb 256] [--max-size-mb 16] [--scope rpc-bench-...]
+#   bash run_rpc_bench.sh [--ctrl http://127.0.0.1:3000] [--buf-mb 256] [--max-size-mb 16] \
+#                         [--scope rpc-bench-...] [--with-pulsing]
 #
 # Environment overrides:
-#   CTRL=http://host:3000 MAX_SIZE_MB=16 bash run_rpc_bench.sh
+#   CTRL=http://host:3000 MAX_SIZE_MB=16 WITH_PULSING=1 bash run_rpc_bench.sh
 
 set -euo pipefail
 
@@ -15,6 +16,7 @@ CTRL="${CTRL:-http://127.0.0.1:3000}"
 BUF_MB="${BUF_MB:-256}"
 MAX_SIZE_MB="${MAX_SIZE_MB:-16}"
 SCOPE="${SCOPE:-rpc-bench-$(date +%s)-$$}"
+WITH_PULSING="${WITH_PULSING:-0}"
 
 # Parse args
 while [[ $# -gt 0 ]]; do
@@ -23,26 +25,38 @@ while [[ $# -gt 0 ]]; do
         --buf-mb) BUF_MB="$2"; shift 2 ;;
         --max-size-mb) MAX_SIZE_MB="$2"; shift 2 ;;
         --scope) SCOPE="$2"; shift 2 ;;
+        --with-pulsing) WITH_PULSING=1; shift ;;
         *) echo "Unknown argument: $1"; exit 1 ;;
     esac
 done
 
+if [[ "$WITH_PULSING" == "1" ]]; then
+    TOTAL=4
+else
+    TOTAL=3
+fi
+
 mkdir -p "$RESULTS_DIR"
 WORKER_LOG="$RESULTS_DIR/slime_worker_${SCOPE}.log"
 
 echo "╔══════════════════════════════════════════════════╗"
-echo "║        SlimeRPC vs Ray — RPC Benchmark           ║"
+if [[ "$WITH_PULSING" == "1" ]]; then
+    echo "║   SlimeRPC vs Pulsing vs Ray — RPC Benchmark     ║"
+else
+    echo "║        SlimeRPC vs Ray — RPC Benchmark           ║"
+fi
 echo "╚══════════════════════════════════════════════════╝"
 echo "  NanoCtrl : $CTRL"
 echo "  Scope    : $SCOPE"
 echo "  Buffer   : ${BUF_MB} MB"
 echo "  Max Size : ${MAX_SIZE_MB} MB"
+echo "  Pulsing  : $([[ "$WITH_PULSING" == "1" ]] && echo on || echo off)"
 echo "  Results  : $RESULTS_DIR"
 echo "  Worker Log: $WORKER_LOG"
 echo ""
 
-# ── [1/3] SlimeRPC ──────────────────────────────────────────────────────────
-echo "▶ [1/3] SlimeRPC — starting worker..."
+# ── [1/N] SlimeRPC ──────────────────────────────────────────────────────────
+echo "▶ [1/$TOTAL] SlimeRPC — starting worker..."
 PYTHONUNBUFFERED=1 python "$SCRIPT_DIR/rpc_bench_slime_worker.py" \
     --ctrl "$CTRL" --scope "$SCOPE" --buf-mb "$BUF_MB" \
     >"$WORKER_LOG" 2>&1 &
@@ -51,7 +65,7 @@ WORKER_PID=$!
 # Give the worker time to register with NanoCtrl
 sleep 2
 
-echo "▶ [1/3] SlimeRPC — running driver..."
+echo "▶ [1/$TOTAL] SlimeRPC — running driver..."
 if ! python "$SCRIPT_DIR/rpc_bench_slime_driver.py" \
     --ctrl "$CTRL" --scope "$SCOPE" --buf-mb "$BUF_MB" \
     --max-size-mb "$MAX_SIZE_MB" \
@@ -70,13 +84,26 @@ kill "$WORKER_PID" 2>/dev/null || true
 wait "$WORKER_PID" 2>/dev/null || true
 echo ""
 
-# ── [2/3] Ray ───────────────────────────────────────────────────────────────
-echo "▶ [2/3] Ray — running benchmark..."
+STAGE=2
+
+# ── [STAGE/N] Pulsing (optional) ────────────────────────────────────────────
+if [[ "$WITH_PULSING" == "1" ]]; then
+    echo "▶ [$STAGE/$TOTAL] Pulsing — running benchmark..."
+    python "$SCRIPT_DIR/rpc_bench_pulsing.py" --out "$RESULTS_DIR/pulsing_rpc.csv"
+    echo ""
+    STAGE=$((STAGE + 1))
+fi
+
+# ── [STAGE/N] Ray ───────────────────────────────────────────────────────────
+echo "▶ [$STAGE/$TOTAL] Ray — running benchmark..."
 python "$SCRIPT_DIR/rpc_bench_ray.py" --out "$RESULTS_DIR/ray_rpc.csv"
 echo ""
+STAGE=$((STAGE + 1))
 
-# ── [3/3] Compare ───────────────────────────────────────────────────────────
-echo "▶ [3/3] Comparison"
-python "$SCRIPT_DIR/rpc_bench_compare.py" \
-    --slime "$RESULTS_DIR/slime_rpc.csv" \
-    --ray   "$RESULTS_DIR/ray_rpc.csv"
+# ── [STAGE/N] Compare ───────────────────────────────────────────────────────
+echo "▶ [$STAGE/$TOTAL] Comparison"
+COMPARE_ARGS=(--slime "$RESULTS_DIR/slime_rpc.csv" --ray "$RESULTS_DIR/ray_rpc.csv")
+if [[ "$WITH_PULSING" == "1" ]]; then
+    COMPARE_ARGS+=(--pulsing "$RESULTS_DIR/pulsing_rpc.csv")
+fi
+python "$SCRIPT_DIR/rpc_bench_compare.py" "${COMPARE_ARGS[@]}"
diff --git a/bench/results/AA.csv b/bench/results/AA.csv
deleted file mode 100755
index cd0ed335..00000000
--- a/bench/results/AA.csv
+++ /dev/null
@@ -1,18 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"2,048","32,768",0,4,0.02,196.02
-"4,096","65,536",0,4,0.02,371.41
-"8,192","131,072",0,4,0.02,742.45
-"16,384","262,144",0,4,0.02,1481.07
-"32,768","524,288",0,4,0.02,2959.75
-"65,536","1,048,576",0,4,0.02,5888.44
-"131,072","2,097,152",0,4,0.02,11611.46
-"262,144","4,194,304",0,4,0.03,22872.46
-"524,288","8,388,608",0,4,0.08,39340.64
-"1,048,576","16,777,216",0,4,0.18,43292.38
-"2,097,152","33,554,432",0,4,0.40,45970.34
-"4,194,304","67,108,864",0,4,0.83,47447.48
-"8,388,608","134,217,728",0,4,1.69,48214.60
-"16,777,216","268,435,456",0,4,3.39,48580.47
-"33,554,432","536,870,912",0,4,6.82,48800.45
-"67,108,864","1,073,741,824",0,4,13.66,48911.09
-"134,217,728","2,147,483,648",0,4,27.36,48967.45
diff --git a/bench/results/AB_affi.csv b/bench/results/AB_affi.csv
deleted file mode 100755
index c8194cb1..00000000
--- a/bench/results/AB_affi.csv
+++ /dev/null
@@ -1,34 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,0,62.82,21321.65
-"134,217,728","2,147,483,648",0,1,58.65,22827.92
-"134,217,728","2,147,483,648",0,2,58.65,22822.99
-"134,217,728","2,147,483,648",0,3,62.84,21314.43
-"134,217,728","2,147,483,648",0,4,46.71,28683.17
-"134,217,728","2,147,483,648",0,5,46.72,28684.64
-"134,217,728","2,147,483,648",0,6,46.71,28681.58
-"134,217,728","2,147,483,648",0,7,46.71,28682.90
-"134,217,728","2,147,483,648",0,8,62.86,21319.35
-"134,217,728","2,147,483,648",0,9,58.63,22824.66
-"134,217,728","2,147,483,648",0,10,58.69,22827.80
-"134,217,728","2,147,483,648",0,11,62.86,21319.49
-"134,217,728","2,147,483,648",0,12,58.72,22824.56
-"134,217,728","2,147,483,648",0,13,62.85,21319.14
-"134,217,728","2,147,483,648",0,14,62.86,21318.71
-"134,217,728","2,147,483,648",0,15,58.68,22825.61
-"2,048","32,768",0,6,0.02,216.70
-"4,096","65,536",0,6,0.02,431.07
-"8,192","131,072",0,6,0.02,851.31
-"16,384","262,144",0,6,0.02,1711.90
-"32,768","524,288",0,6,0.02,3445.05
-"65,536","1,048,576",0,6,0.02,6821.34
-"131,072","2,097,152",0,6,0.03,13543.92
-"262,144","4,194,304",0,6,0.05,22257.61
-"524,288","8,388,608",0,6,0.15,24878.99
-"1,048,576","16,777,216",0,6,0.33,26412.48
-"2,097,152","33,554,432",0,6,0.70,27746.73
-"4,194,304","67,108,864",0,6,1.43,28163.56
-"8,388,608","134,217,728",0,6,2.88,28398.95
-"16,777,216","268,435,456",0,6,5.79,28540.74
-"33,554,432","536,870,912",0,6,11.65,28630.73
-"67,108,864","1,073,741,824",0,6,23.33,28667.31
-"134,217,728","2,147,483,648",0,6,46.71,28686.29
diff --git a/bench/results/AC_affi.csv b/bench/results/AC_affi.csv
deleted file mode 100755
index 46111e5a..00000000
--- a/bench/results/AC_affi.csv
+++ /dev/null
@@ -1,26 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,0,54.69,24489.72
-"134,217,728","2,147,483,648",1,0,54.73,24484.69
-"134,217,728","2,147,483,648",2,0,54.68,24506.96
-"134,217,728","2,147,483,648",3,0,54.69,24535.05
-"134,217,728","2,147,483,648",4,0,54.73,24501.22
-"134,217,728","2,147,483,648",5,0,54.69,24516.08
-"134,217,728","2,147,483,648",6,0,54.69,24503.68
-"134,217,728","2,147,483,648",7,0,54.69,24486.84
-"2,048","32,768",0,0,0.03,129.08
-"4,096","65,536",0,0,0.03,252.46
-"8,192","131,072",0,0,0.03,773.15
-"16,384","262,144",0,0,0.03,965.87
-"32,768","524,288",0,0,0.03,1470.05
-"65,536","1,048,576",0,0,0.03,4720.31
-"131,072","2,097,152",0,0,0.04,5701.97
-"262,144","4,194,304",0,0,0.04,15874.61
-"524,288","8,388,608",0,0,0.08,19473.47
-"1,048,576","16,777,216",0,0,0.33,21516.93
-"2,097,152","33,554,432",0,0,0.75,26086.32
-"4,194,304","67,108,864",0,0,1.60,24111.70
-"8,388,608","134,217,728",0,0,3.35,24330.94
-"16,777,216","268,435,456",0,0,6.75,24500.82
-"33,554,432","536,870,912",0,0,13.61,24450.78
-"67,108,864","1,073,741,824",0,0,27.29,24451.09
-"134,217,728","2,147,483,648",0,0,54.69,24506.82
diff --git a/bench/results/A_D_affi.csv b/bench/results/A_D_affi.csv
deleted file mode 100755
index 12f20b2e..00000000
--- a/bench/results/A_D_affi.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,7,48.68,27521.27
-"134,217,728","2,147,483,648",1,7,48.68,27526.97
-"134,217,728","2,147,483,648",2,7,48.69,27524.65
-"134,217,728","2,147,483,648",3,7,48.69,27528.49
-"134,217,728","2,147,483,648",4,7,48.69,27526.21
-"134,217,728","2,147,483,648",5,7,48.69,27528.31
-"134,217,728","2,147,483,648",6,7,48.69,27528.41
-"134,217,728","2,147,483,648",7,7,48.69,27525.33
-"134,217,728","2,147,483,648",0,0,48.42,27670.15
-"134,217,728","2,147,483,648",0,1,48.46,27672.29
-"134,217,728","2,147,483,648",0,2,48.44,27672.69
-"134,217,728","2,147,483,648",0,3,48.43,27671.53
-"134,217,728","2,147,483,648",0,4,48.42,27674.43
-"134,217,728","2,147,483,648",0,5,48.42,27676.57
-"134,217,728","2,147,483,648",0,6,48.69,27526.57
-"134,217,728","2,147,483,648",0,7,48.69,27526.66
diff --git a/bench/results/BA_affi.csv b/bench/results/BA_affi.csv
deleted file mode 100755
index 3ebb4a82..00000000
--- a/bench/results/BA_affi.csv
+++ /dev/null
@@ -1,34 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,0,62.90,21323.17
-"134,217,728","2,147,483,648",1,0,57.21,23433.06
-"134,217,728","2,147,483,648",2,0,57.22,23431.92
-"134,217,728","2,147,483,648",3,0,62.89,21321.35
-"134,217,728","2,147,483,648",4,0,57.22,23430.36
-"134,217,728","2,147,483,648",5,0,62.90,21321.60
-"134,217,728","2,147,483,648",6,0,62.88,21320.40
-"134,217,728","2,147,483,648",7,0,57.22,23432.35
-"134,217,728","2,147,483,648",8,0,62.89,21321.22
-"134,217,728","2,147,483,648",9,0,57.22,23433.89
-"134,217,728","2,147,483,648",10,0,57.22,23428.90
-"134,217,728","2,147,483,648",11,0,62.89,21318.73
-"134,217,728","2,147,483,648",12,0,46.36,28914.65
-"134,217,728","2,147,483,648",13,0,46.36,28911.38
-"134,217,728","2,147,483,648",14,0,46.36,28912.41
-"134,217,728","2,147,483,648",15,0,46.37,28912.13
-"2,048","32,768",15,0,0.02,259.02
-"4,096","65,536",15,0,0.02,500.66
-"8,192","131,072",15,0,0.02,1022.53
-"16,384","262,144",15,0,0.02,2034.45
-"32,768","524,288",15,0,0.02,4021.68
-"65,536","1,048,576",15,0,0.03,7992.82
-"131,072","2,097,152",15,0,0.03,15932.78
-"262,144","4,194,304",15,0,0.06,22640.78
-"524,288","8,388,608",15,0,0.16,25413.76
-"1,048,576","16,777,216",15,0,0.35,26884.69
-"2,097,152","33,554,432",15,0,0.70,27900.60
-"4,194,304","67,108,864",15,0,1.43,28394.32
-"8,388,608","134,217,728",15,0,2.88,28661.44
-"16,777,216","268,435,456",15,0,5.78,28802.17
-"33,554,432","536,870,912",15,0,11.58,28861.69
-"67,108,864","1,073,741,824",15,0,23.17,28898.01
-"134,217,728","2,147,483,648",15,0,46.36,28915.72
diff --git a/bench/results/BB.csv b/bench/results/BB.csv
deleted file mode 100755
index f41f3cd2..00000000
--- a/bench/results/BB.csv
+++ /dev/null
@@ -1,18 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"2,048","32,768",15,7,0.02,186.22
-"4,096","65,536",15,7,0.02,380.64
-"8,192","131,072",15,7,0.02,759.87
-"16,384","262,144",15,7,0.02,1505.36
-"32,768","524,288",15,7,0.02,3009.92
-"65,536","1,048,576",15,7,0.03,5991.92
-"131,072","2,097,152",15,7,0.03,11780.19
-"262,144","4,194,304",15,7,0.05,20974.75
-"524,288","8,388,608",15,7,0.15,24501.33
-"1,048,576","16,777,216",15,7,0.34,26502.06
-"2,097,152","33,554,432",15,7,0.70,27566.30
-"4,194,304","67,108,864",15,7,1.42,28142.58
-"8,388,608","134,217,728",15,7,2.90,28393.64
-"16,777,216","268,435,456",15,7,5.81,28544.30
-"33,554,432","536,870,912",15,7,11.66,28617.29
-"67,108,864","1,073,741,824",15,7,23.44,28410.72
-"134,217,728","2,147,483,648",15,7,47.05,28275.85
diff --git a/bench/results/BC_affi.csv b/bench/results/BC_affi.csv
deleted file mode 100755
index a5d953fa..00000000
--- a/bench/results/BC_affi.csv
+++ /dev/null
@@ -1,34 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,0,62.81,21335.93
-"134,217,728","2,147,483,648",1,0,57.14,23453.94
-"134,217,728","2,147,483,648",2,0,57.13,23458.82
-"134,217,728","2,147,483,648",3,0,62.81,21330.01
-"134,217,728","2,147,483,648",4,0,57.11,23445.78
-"134,217,728","2,147,483,648",5,0,62.82,21336.52
-"134,217,728","2,147,483,648",6,0,62.81,21357.24
-"134,217,728","2,147,483,648",7,0,57.14,23446.67
-"134,217,728","2,147,483,648",8,0,62.81,21342.34
-"134,217,728","2,147,483,648",9,0,57.13,23437.67
-"134,217,728","2,147,483,648",10,0,57.14,23456.65
-"134,217,728","2,147,483,648",11,0,62.81,21339.10
-"134,217,728","2,147,483,648",12,0,66.42,19877.77
-"134,217,728","2,147,483,648",13,0,65.93,19834.39
-"134,217,728","2,147,483,648",14,0,67.10,19882.69
-"134,217,728","2,147,483,648",15,0,72.07,19805.54
-"2,048","32,768",2,0,0.03,191.65
-"4,096","65,536",2,0,0.03,287.50
-"8,192","131,072",2,0,0.03,767.89
-"16,384","262,144",2,0,0.03,799.03
-"32,768","524,288",2,0,0.04,2549.87
-"65,536","1,048,576",2,0,0.04,3530.37
-"131,072","2,097,152",2,0,0.04,6276.05
-"262,144","4,194,304",2,0,0.05,20870.75
-"524,288","8,388,608",2,0,0.07,20225.39
-"1,048,576","16,777,216",2,0,0.35,21390.89
-"2,097,152","33,554,432",2,0,0.80,23297.66
-"4,194,304","67,108,864",2,0,1.69,24324.57
-"8,388,608","134,217,728",2,0,3.48,23378.32
-"16,777,216","268,435,456",2,0,7.05,23628.05
-"33,554,432","536,870,912",2,0,14.21,23419.31
-"67,108,864","1,073,741,824",2,0,28.52,23474.30
-"134,217,728","2,147,483,648",2,0,57.14,23472.29
diff --git a/bench/results/B_D_affi.csv b/bench/results/B_D_affi.csv
deleted file mode 100755
index 0f7854a8..00000000
--- a/bench/results/B_D_affi.csv
+++ /dev/null
@@ -1,25 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,7,62.86,19963.24
-"134,217,728","2,147,483,648",1,7,57.20,23426.24
-"134,217,728","2,147,483,648",2,7,57.20,23426.84
-"134,217,728","2,147,483,648",3,7,62.87,21318.32
-"134,217,728","2,147,483,648",4,7,57.19,23428.55
-"134,217,728","2,147,483,648",5,7,62.87,19989.97
-"134,217,728","2,147,483,648",6,7,62.87,21318.29
-"134,217,728","2,147,483,648",7,7,57.21,23428.74
-"134,217,728","2,147,483,648",8,7,62.88,21317.80
-"134,217,728","2,147,483,648",9,7,57.19,23429.03
-"134,217,728","2,147,483,648",10,7,57.19,23430.83
-"134,217,728","2,147,483,648",11,7,62.88,21317.28
-"134,217,728","2,147,483,648",12,7,49.53,26927.22
-"134,217,728","2,147,483,648",13,7,49.57,26933.33
-"134,217,728","2,147,483,648",14,7,49.55,26923.98
-"134,217,728","2,147,483,648",15,7,49.59,26948.25
-"134,217,728","2,147,483,648",15,0,48.87,27234.60
-"134,217,728","2,147,483,648",15,1,49.17,27216.30
-"134,217,728","2,147,483,648",15,2,49.19,27212.34
-"134,217,728","2,147,483,648",15,3,49.84,27207.04
-"134,217,728","2,147,483,648",15,4,49.23,27179.73
-"134,217,728","2,147,483,648",15,5,48.67,27219.37
-"134,217,728","2,147,483,648",15,6,49.62,26942.26
-"134,217,728","2,147,483,648",15,7,49.54,26926.76
diff --git a/bench/results/CA_affi.csv b/bench/results/CA_affi.csv
deleted file mode 100755
index 98b918a0..00000000
--- a/bench/results/CA_affi.csv
+++ /dev/null
@@ -1,26 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,0,54.74,24498.34
-"134,217,728","2,147,483,648",0,1,54.74,24495.31
-"134,217,728","2,147,483,648",0,2,54.75,24497.10
-"134,217,728","2,147,483,648",0,3,54.74,24481.20
-"134,217,728","2,147,483,648",0,4,54.73,24495.34
-"134,217,728","2,147,483,648",0,5,54.74,24495.64
-"134,217,728","2,147,483,648",0,6,54.74,24495.90
-"134,217,728","2,147,483,648",0,7,54.74,24485.14
-"2,048","32,768",0,0,0.02,275.63
-"4,096","65,536",0,0,0.02,528.19
-"8,192","131,072",0,0,0.02,1051.23
-"16,384","262,144",0,0,0.02,2023.28
-"32,768","524,288",0,0,0.02,3890.43
-"65,536","1,048,576",0,0,0.03,8124.08
-"131,072","2,097,152",0,0,0.03,16216.00
-"262,144","4,194,304",0,0,0.07,20419.25
-"524,288","8,388,608",0,0,0.19,22044.91
-"1,048,576","16,777,216",0,0,0.40,23237.08
-"2,097,152","33,554,432",0,0,0.82,23900.71
-"4,194,304","67,108,864",0,0,1.69,24163.36
-"8,388,608","134,217,728",0,0,3.40,24344.22
-"16,777,216","268,435,456",0,0,6.82,24408.30
-"33,554,432","536,870,912",0,0,13.67,24456.58
-"67,108,864","1,073,741,824",0,0,27.36,24485.03
-"134,217,728","2,147,483,648",0,0,54.73,24496.14
diff --git a/bench/results/CB_affi.csv b/bench/results/CB_affi.csv
deleted file mode 100755
index 30eb3c02..00000000
--- a/bench/results/CB_affi.csv
+++ /dev/null
@@ -1,34 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,0,63.32,21115.32
-"134,217,728","2,147,483,648",0,1,58.73,22616.25
-"134,217,728","2,147,483,648",0,2,58.75,22549.50
-"134,217,728","2,147,483,648",0,3,63.30,21199.11
-"134,217,728","2,147,483,648",0,4,54.75,24497.23
-"134,217,728","2,147,483,648",0,5,54.74,24495.94
-"134,217,728","2,147,483,648",0,6,54.75,24496.62
-"134,217,728","2,147,483,648",0,7,54.74,24496.53
-"134,217,728","2,147,483,648",0,8,63.32,21189.85
-"134,217,728","2,147,483,648",0,9,58.74,22655.58
-"134,217,728","2,147,483,648",0,10,58.74,22716.60
-"134,217,728","2,147,483,648",0,11,63.31,21197.24
-"134,217,728","2,147,483,648",0,12,58.75,22597.88
-"134,217,728","2,147,483,648",0,13,63.28,21195.90
-"134,217,728","2,147,483,648",0,14,63.26,21188.03
-"134,217,728","2,147,483,648",0,15,58.86,22711.31
-"2,048","32,768",0,4,0.02,238.61
-"4,096","65,536",0,4,0.02,485.34
-"8,192","131,072",0,4,0.02,941.53
-"16,384","262,144",0,4,0.02,1922.75
-"32,768","524,288",0,4,0.02,3756.02
-"65,536","1,048,576",0,4,0.03,7506.86
-"131,072","2,097,152",0,4,0.03,14645.73
-"262,144","4,194,304",0,4,0.08,19733.07
-"524,288","8,388,608",0,4,0.19,22014.06
-"1,048,576","16,777,216",0,4,0.40,23040.64
-"2,097,152","33,554,432",0,4,0.83,23844.16
-"4,194,304","67,108,864",0,4,1.69,24155.24
-"8,388,608","134,217,728",0,4,3.40,24193.92
-"16,777,216","268,435,456",0,4,6.83,24418.50
-"33,554,432","536,870,912",0,4,13.67,24463.27
-"67,108,864","1,073,741,824",0,4,27.36,24484.79
-"134,217,728","2,147,483,648",0,4,54.75,24496.51
diff --git a/bench/results/CC.csv b/bench/results/CC.csv
deleted file mode 100755
index 5b405781..00000000
--- a/bench/results/CC.csv
+++ /dev/null
@@ -1,19 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,2,54.66,24498.47
-"2,048","32,768",0,2,0.03,144.72
-"4,096","65,536",0,2,0.03,196.35
-"8,192","131,072",0,2,0.03,1028.93
-"16,384","262,144",0,2,0.03,765.88
-"32,768","524,288",0,2,0.03,2497.77
-"65,536","1,048,576",0,2,0.03,3782.66
-"131,072","2,097,152",0,2,0.04,11154.85
-"262,144","4,194,304",0,2,0.04,20311.35
-"524,288","8,388,608",0,2,0.10,31235.86
-"1,048,576","16,777,216",0,2,0.32,24258.45
-"2,097,152","33,554,432",0,2,0.75,25957.08
-"4,194,304","67,108,864",0,2,1.60,24383.17
-"8,388,608","134,217,728",0,2,3.31,24805.42
-"16,777,216","268,435,456",0,2,6.74,24379.71
-"33,554,432","536,870,912",0,2,13.59,24558.78
-"67,108,864","1,073,741,824",0,2,27.25,24558.24
-"134,217,728","2,147,483,648",0,2,54.65,24513.57
diff --git a/bench/results/C_D_affi.csv b/bench/results/C_D_affi.csv
deleted file mode 100755
index d1f4278f..00000000
--- a/bench/results/C_D_affi.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",0,7,54.72,24491.40
-"134,217,728","2,147,483,648",1,7,54.72,24490.74
-"134,217,728","2,147,483,648",2,7,54.72,24491.14
-"134,217,728","2,147,483,648",3,7,54.72,24493.89
-"134,217,728","2,147,483,648",4,7,54.71,24490.40
-"134,217,728","2,147,483,648",5,7,54.73,24493.04
-"134,217,728","2,147,483,648",6,7,54.72,24492.51
-"134,217,728","2,147,483,648",7,7,54.71,24494.84
-"134,217,728","2,147,483,648",8,7,56.93,23539.90
-"134,217,728","2,147,483,648",9,7,56.93,23538.21
-"134,217,728","2,147,483,648",10,7,56.93,23541.97
-"134,217,728","2,147,483,648",11,7,56.94,21836.20
-"134,217,728","2,147,483,648",12,7,56.94,23538.32
-"134,217,728","2,147,483,648",13,7,56.92,23547.92
-"134,217,728","2,147,483,648",14,7,56.91,21913.73
-"134,217,728","2,147,483,648",15,7,56.91,23549.99
diff --git a/bench/results/D_A_affi.csv b/bench/results/D_A_affi.csv
deleted file mode 100755
index ff5439d2..00000000
--- a/bench/results/D_A_affi.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",4,0,45.74,29314.02
-"134,217,728","2,147,483,648",4,1,45.69,29315.77
-"134,217,728","2,147,483,648",4,2,45.76,29309.79
-"134,217,728","2,147,483,648",4,3,45.73,29315.84
-"134,217,728","2,147,483,648",4,4,45.74,29317.55
-"134,217,728","2,147,483,648",4,5,45.75,29309.67
-"134,217,728","2,147,483,648",4,6,45.74,29307.28
-"134,217,728","2,147,483,648",4,7,45.73,29317.66
diff --git a/bench/results/D_B_affi.csv b/bench/results/D_B_affi.csv
deleted file mode 100755
index 7227137f..00000000
--- a/bench/results/D_B_affi.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",4,0,62.9,21301.88
-"134,217,728","2,147,483,648",4,1,58.74,22806.47
-"134,217,728","2,147,483,648",4,2,58.74,22771.85
-"134,217,728","2,147,483,648",4,3,62.93,21304.81
-"134,217,728","2,147,483,648",4,4,46.74,28680.66
-"134,217,728","2,147,483,648",4,5,46.73,28683.25
-"134,217,728","2,147,483,648",4,6,46.73,28681.84
-"134,217,728","2,147,483,648",4,7,46.74,28681.1
-"134,217,728","2,147,483,648",4,8,62.9,21298.85
-"134,217,728","2,147,483,648",4,9,58.75,22612.52
-"134,217,728","2,147,483,648",4,10,58.74,22765.7
-"134,217,728","2,147,483,648",4,11,62.9,21304.81
-"134,217,728","2,147,483,648",4,12,58.74,22806.75
-"134,217,728","2,147,483,648",4,13,62.9,21284.54
-"134,217,728","2,147,483,648",4,14,62.9,21294.32
-"134,217,728","2,147,483,648",4,15,58.74,22770.65
diff --git a/bench/results/D_C_affi.csv b/bench/results/D_C_affi.csv
deleted file mode 100755
index c1868456..00000000
--- a/bench/results/D_C_affi.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",4,0,54.68,24490.73
-"134,217,728","2,147,483,648",4,1,54.69,24506.81
-"134,217,728","2,147,483,648",4,2,54.68,24468.7
-"134,217,728","2,147,483,648",4,3,54.68,24514.4
-"134,217,728","2,147,483,648",4,4,54.68,24515.3
-"134,217,728","2,147,483,648",4,5,54.69,24467.47
-"134,217,728","2,147,483,648",4,6,54.69,24482.49
-"134,217,728","2,147,483,648",4,7,54.68,24494.96
-"134,217,728","2,147,483,648",4,8,105.99,12649.06
-"134,217,728","2,147,483,648",4,9,105.85,12664.4
-"134,217,728","2,147,483,648",4,10,106,12645.67
-"134,217,728","2,147,483,648",4,11,106.21,12609.33
-"134,217,728","2,147,483,648",4,12,105.84,12666.78
-"134,217,728","2,147,483,648",4,13,105.85,12667.23
-"134,217,728","2,147,483,648",4,14,106.01,12646.66
-"134,217,728","2,147,483,648",4,15,0.51,12245.73
diff --git a/bench/results/D_D_affi.csv b/bench/results/D_D_affi.csv
deleted file mode 100755
index 6b8f3fff..00000000
--- a/bench/results/D_D_affi.csv
+++ /dev/null
@@ -1,18 +0,0 @@
-Message Size (bytes),Total Transport (bytes),Target Affinity,Initiator Affinity,Avg Latency(ms),Bandwidth(MB/s)
-"134,217,728","2,147,483,648",7,7,58.78,22790.95
-"134,217,728","2,147,483,648",0,7,49.9,26835.56
-"134,217,728","2,147,483,648",1,7,50.04,26783.6
-"134,217,728","2,147,483,648",2,7,51.19,26164.79
-"134,217,728","2,147,483,648",3,7,51.29,26112.61
-"134,217,728","2,147,483,648",4,7,48.77,27412.99
-"134,217,728","2,147,483,648",5,7,48.69,27491.33
-"134,217,728","2,147,483,648",6,7,57.2,23437.03
-"134,217,728","2,147,483,648",7,7,58.25,23011.38
-"134,217,728","2,147,483,648",0,7,49.93,26821.45
-"134,217,728","2,147,483,648",1,7,50.05,26779.6
-"134,217,728","2,147,483,648",2,7,51.16,26193.63
-"134,217,728","2,147,483,648",3,7,51.36,26111.74
-"134,217,728","2,147,483,648",4,7,48.75,27416
-"134,217,728","2,147,483,648",5,7,48.69,27491.31
-"134,217,728","2,147,483,648",6,7,57.22,21740.47
-"134,217,728","2,147,483,648",7,7,58.26,23000.13
diff --git a/bench/results/ray_rpc.csv b/bench/results/ray_rpc.csv
deleted file mode 100644
index c1e5f31a..00000000
--- a/bench/results/ray_rpc.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-size,avg_us,p50_us,p99_us,bw_gbps
-1024,321.0414529312402,313.80902510136366,488.6779934167862,0.006379238510481808
-4096,348.55153784155846,339.17196560651064,511.0709462314844,0.023502980508219272
-16384,381.16832822561264,374.3050619959831,549.4569195434451,0.08596726845732235
-65536,577.4983225855976,552.2669525817037,1670.3959554433823,0.22696516141061573
-262144,1027.6467196251217,958.6790110915899,1614.8729482665658,0.510183110584206
-1048576,1731.2549171037972,1366.1759439855814,2533.0260396003723,1.2113478952644994
-4194304,3272.672554012388,2896.21006231755,6399.256060831249,2.563228633954024
-16777216,8063.031900674105,6645.97493596375,13586.94804366678,4.161515471270144
-67108864,73681.16008583456,72670.11108342558,99878.37704829872,1.8216017207606887
diff --git a/bench/results/slime_rpc.csv b/bench/results/slime_rpc.csv
deleted file mode 100644
index 32b71a9c..00000000
--- a/bench/results/slime_rpc.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-size,avg_us,p50_us,p99_us,bw_gbps
-1024,52.21534846350551,51.33496597409248,71.66399154812098,0.0392221839030988
-4096,55.94716197811067,54.75303623825312,66.44800305366516,0.1464238704941838
-16384,67.23385164514184,66.52099546045065,77.82597094774246,0.48737353577403947
-65536,98.09479210525751,98.93800597637892,112.82600462436676,1.3361769487146407
-262144,141.2927381400215,141.50398783385754,154.55996617674828,3.710650716390174
-1048576,399.80457397177815,418.485957197845,435.4199627414346,5.245442740102408
-4194304,1104.8975097946823,1083.2069674506783,1260.1850321516395,7.592204639468157
-16777216,4280.842423904687,4272.751975804567,4447.276005521417,7.838277768092659
diff --git a/dlslime/peer_agent/_mailbox.py b/dlslime/peer_agent/_mailbox.py
index 4797c876..63caa9d1 100644
--- a/dlslime/peer_agent/_mailbox.py
+++ b/dlslime/peer_agent/_mailbox.py
@@ -56,13 +56,17 @@ def __init__(self, agent: "PeerAgent", stream_block_ms: int = 100):
 
     def start(self) -> None:
         """Start the stream listener thread."""
-        # Delete any stale stream messages from a previous run before listening.
-        prefix = (
-            f"{self._agent._redis_key_prefix}:" if self._agent._redis_key_prefix else ""
-        )
-        stream_key = f"{prefix}stream:{self._agent.alias}"
-        self._agent._redis_client.delete(stream_key)
-
+        # Do NOT delete the stream here. A peer whose connect_to runs before
+        # this agent registers will XADD qp_ready onto stream:<our-alias>
+        # before our listener exists; deleting here wipes that message. The
+        # peer marks _notified_peers after its first XADD and will not re-send,
+        # so losing the message stalls the handshake forever.
+        #
+        # Instead, _listen_loop reads from "0-0" and processes every message
+        # on the stream. Truly stale messages from a prior crashed session
+        # with the same alias fail endpoint.connect at handshake time — the
+        # listen loop catches that and continues, so later in-flight qp_ready
+        # from the current peer session still completes the handshake.
         self._thread = threading.Thread(target=self._listen_loop, daemon=True)
         self._thread.start()
         logger.info(
@@ -85,7 +89,11 @@ def _listen_loop(self) -> None:
         )
         stream_key = f"{prefix}stream:{self._agent.alias}"
 
-        # Start from the beginning of the (now-freshly-cleared) stream.
+        # Start from the beginning of the stream. We intentionally do not
+        # delete the stream on start — see the note in start() — so there may
+        # be messages from before our listener thread was running (including
+        # in-flight qp_ready from a peer whose connect_to fired before our
+        # registration). Processing from "0-0" picks those up.
         last_id = "0-0"
 
         logger.info(
diff --git a/docs/benchmark-rpc.md b/docs/benchmark-rpc.md
index a5b4253e..9e62aa94 100644
--- a/docs/benchmark-rpc.md
+++ b/docs/benchmark-rpc.md
@@ -1,18 +1,20 @@
 # SlimeRPC Benchmark
 
 This document describes the Python RPC microbenchmark added for comparing
-SlimeRPC against Ray on a single machine.
+SlimeRPC against Ray (and optionally Pulsing) on a single machine.
 
 ## What It Measures
 
 The benchmark measures round-trip latency and effective bandwidth for a raw
 bytes echo RPC across payload sizes from `1KB` up to `16MB` by default.
 
-It runs two implementations:
+It runs two implementations by default, with a third opt-in baseline:
 
 - `SlimeRPC`: RDMA-backed `PeerAgent` RPC echo between `bench-driver` and
   `bench-worker`
 - `Ray`: a local `EchoActor` baseline using the same payload sizes and metrics
+- `Pulsing` (optional, off by default): a `@pul.remote` actor echo using the
+  same payload sizes and metrics. Enable with `--with-pulsing`.
 
 The comparison script prints:
 
@@ -20,7 +22,9 @@ The comparison script prints:
 - p50 latency
 - p99 latency
 - effective round-trip bandwidth
-- Ray/Slime speedup
+- `S/Ray` = Ray avg latency / SlimeRPC avg latency (> 1 means SlimeRPC wins)
+- `S/Pul` = Pulsing avg latency / SlimeRPC avg latency (only shown when
+  Pulsing was enabled)
 
 ## Files
 
@@ -28,6 +32,7 @@ The comparison script prints:
 - `bench/python/rpc_bench_slime_worker.py`
 - `bench/python/rpc_bench_slime_driver.py`
 - `bench/python/rpc_bench_ray.py`
+- `bench/python/rpc_bench_pulsing.py`
 - `bench/python/rpc_bench_compare.py`
 
 ## Prerequisites
@@ -38,14 +43,25 @@ Before running the SlimeRPC side:
 2. Make sure Redis is reachable through NanoCtrl.
 3. Build and install DLSlime with Python bindings and RDMA support.
 
+For the optional Pulsing baseline, also install `pulsing`
+(`pip install pulsing`) in the same environment.
+
 ## Run
 
-Default run:
+Default run (SlimeRPC + Ray, Pulsing disabled):
 
 ```bash
 bash bench/python/run_rpc_bench.sh
 ```
 
+Include the Pulsing baseline:
+
+```bash
+bash bench/python/run_rpc_bench.sh --with-pulsing
+# or
+WITH_PULSING=1 bash bench/python/run_rpc_bench.sh
+```
+
 Specify control-plane address or buffer size:
 
 ```bash
@@ -64,12 +80,17 @@ CTRL=http://127.0.0.1:3000 BUF_MB=256 MAX_SIZE_MB=16 \
 
 ## Output
 
-The script writes:
+The script always writes:
 
 - `bench/results/slime_rpc.csv`
 - `bench/results/ray_rpc.csv`
 
-and then prints a merged comparison table.
+and, when `--with-pulsing` is passed, additionally writes:
+
+- `bench/results/pulsing_rpc.csv`
+
+It then prints a merged comparison table. The `S/Pul` column only appears in
+the table when the Pulsing CSV is present.
 
 ## Stability Notes
 
diff --git a/docs/imgs/assets/docs.png b/docs/imgs/assets/docs.png
new file mode 100644
index 00000000..245d3446
Binary files /dev/null and b/docs/imgs/assets/docs.png differ
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 5b29070f..a3f4a441 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -84,6 +84,7 @@ plugins:
             Overview: 总览
             Endpoint API: Endpoint API
             PeerAgent API: PeerAgent API
+            SlimeRPC: SlimeRPC
             Deployment: 部署
             DLSlimeCache Service: DLSlimeCache 服务
             SlimeRPC Benchmark: SlimeRPC 基准测试
@@ -118,6 +119,7 @@ nav:
       - Overview: guide/index.md
       - Endpoint API: guide/endpoint-api.md
       - PeerAgent API: guide/peeragent-api.md
+      - SlimeRPC: guide/slimerpc.md
       - Deployment: guide/deployment.md
       - DLSlimeCache Service: guide/dlslime-cache.md
       - SlimeRPC Benchmark: guide/benchmark-rpc.md
diff --git a/docs/src/api_reference.md b/docs/src/api_reference.md
index 5483803b..6866f5bc 100644
--- a/docs/src/api_reference.md
+++ b/docs/src/api_reference.md
@@ -32,6 +32,7 @@ from dlslime.logging import get_logger, set_log_level
 
 - [Endpoint API](guide/endpoint-api.md)
 - [PeerAgent API](guide/peeragent-api.md)
+- [SlimeRPC](guide/slimerpc.md)
 - [DLSlimeCache Service](guide/dlslime-cache.md)
 - [Versions](versions.md)
 - `dlslime.cache.CacheClient`
diff --git a/docs/src/api_reference.zh.md b/docs/src/api_reference.zh.md
index f0bd88f6..ab5528eb 100644
--- a/docs/src/api_reference.zh.md
+++ b/docs/src/api_reference.zh.md
@@ -30,5 +30,6 @@ from dlslime.logging import get_logger, set_log_level
 
 - [Endpoint API](guide/endpoint-api.md)
 - [PeerAgent API](guide/peeragent-api.md)
+- [SlimeRPC](guide/slimerpc.md)
 - [DLSlimeCache 服务](guide/dlslime-cache.md)
 - [版本](versions.md)
diff --git a/docs/src/guide/benchmark-rpc.md b/docs/src/guide/benchmark-rpc.md
index a5b4253e..9e62aa94 100644
--- a/docs/src/guide/benchmark-rpc.md
+++ b/docs/src/guide/benchmark-rpc.md
@@ -1,18 +1,20 @@
 # SlimeRPC Benchmark
 
 This document describes the Python RPC microbenchmark added for comparing
-SlimeRPC against Ray on a single machine.
+SlimeRPC against Ray (and optionally Pulsing) on a single machine.
 
 ## What It Measures
 
 The benchmark measures round-trip latency and effective bandwidth for a raw
 bytes echo RPC across payload sizes from `1KB` up to `16MB` by default.
 
-It runs two implementations:
+It runs two implementations by default, with a third opt-in baseline:
 
 - `SlimeRPC`: RDMA-backed `PeerAgent` RPC echo between `bench-driver` and
   `bench-worker`
 - `Ray`: a local `EchoActor` baseline using the same payload sizes and metrics
+- `Pulsing` (optional, off by default): a `@pul.remote` actor echo using the
+  same payload sizes and metrics. Enable with `--with-pulsing`.
 
 The comparison script prints:
 
@@ -20,7 +22,9 @@ The comparison script prints:
 - p50 latency
 - p99 latency
 - effective round-trip bandwidth
-- Ray/Slime speedup
+- `S/Ray` = Ray avg latency / SlimeRPC avg latency (> 1 means SlimeRPC wins)
+- `S/Pul` = Pulsing avg latency / SlimeRPC avg latency (only shown when
+  Pulsing was enabled)
 
 ## Files
 
@@ -28,6 +32,7 @@ The comparison script prints:
 - `bench/python/rpc_bench_slime_worker.py`
 - `bench/python/rpc_bench_slime_driver.py`
 - `bench/python/rpc_bench_ray.py`
+- `bench/python/rpc_bench_pulsing.py`
 - `bench/python/rpc_bench_compare.py`
 
 ## Prerequisites
@@ -38,14 +43,25 @@ Before running the SlimeRPC side:
 2. Make sure Redis is reachable through NanoCtrl.
 3. Build and install DLSlime with Python bindings and RDMA support.
 
+For the optional Pulsing baseline, also install `pulsing`
+(`pip install pulsing`) in the same environment.
+
 ## Run
 
-Default run:
+Default run (SlimeRPC + Ray, Pulsing disabled):
 
 ```bash
 bash bench/python/run_rpc_bench.sh
 ```
 
+Include the Pulsing baseline:
+
+```bash
+bash bench/python/run_rpc_bench.sh --with-pulsing
+# or
+WITH_PULSING=1 bash bench/python/run_rpc_bench.sh
+```
+
 Specify control-plane address or buffer size:
 
 ```bash
@@ -64,12 +80,17 @@ CTRL=http://127.0.0.1:3000 BUF_MB=256 MAX_SIZE_MB=16 \
 
 ## Output
 
-The script writes:
+The script always writes:
 
 - `bench/results/slime_rpc.csv`
 - `bench/results/ray_rpc.csv`
 
-and then prints a merged comparison table.
+and, when `--with-pulsing` is passed, additionally writes:
+
+- `bench/results/pulsing_rpc.csv`
+
+It then prints a merged comparison table. The `S/Pul` column only appears in
+the table when the Pulsing CSV is present.
 
 ## Stability Notes
 
diff --git a/docs/src/guide/index.md b/docs/src/guide/index.md
index 1891832f..900dcc44 100644
--- a/docs/src/guide/index.md
+++ b/docs/src/guide/index.md
@@ -8,5 +8,6 @@ than a one-off transfer library.
 - [Deployment](deployment.md): run NanoCtrl, Redis, DLSlimeCache, and examples in a predictable layout.
 - [Endpoint API](endpoint-api.md): use the low-level RDMA endpoint surface directly.
 - [PeerAgent API](peeragent-api.md): use control-plane discovery, named memory regions, and service-friendly I/O.
+- [SlimeRPC](slimerpc.md): define Python services and call them through PeerAgent-backed RDMA RPC.
 - [DLSlimeCache Service](dlslime-cache.md): service lifecycle and client flow.
 - [SlimeRPC Benchmark](benchmark-rpc.md): benchmark SlimeRPC against Ray.
diff --git a/docs/src/guide/index.zh.md b/docs/src/guide/index.zh.md
index 82617c27..61fb03ce 100644
--- a/docs/src/guide/index.zh.md
+++ b/docs/src/guide/index.zh.md
@@ -7,5 +7,6 @@
 - [部署](deployment.md)：以可复现方式运行 NanoCtrl、Redis、DLSlimeCache 和示例。
 - [Endpoint API](endpoint-api.md)：直接使用底层 RDMA endpoint 接口。
 - [PeerAgent API](peeragent-api.md)：使用控制面发现、命名 memory region 和服务化 I/O。
+- [SlimeRPC](slimerpc.md)：定义 Python 服务，并通过 PeerAgent-backed RDMA RPC 调用。
 - [DLSlimeCache 服务](dlslime-cache.md)：服务生命周期和客户端流程。
 - [SlimeRPC 基准测试](benchmark-rpc.md)：对比 SlimeRPC 与 Ray。
diff --git a/docs/src/guide/slimerpc.md b/docs/src/guide/slimerpc.md
new file mode 100644
index 00000000..ecb203cc
--- /dev/null
+++ b/docs/src/guide/slimerpc.md
@@ -0,0 +1,275 @@
+# SlimeRPC
+
+SlimeRPC is a typed RPC layer built on PeerAgent and the DLSlime RDMA data path.
+Use it when application logic should look like a Python service call, while
+connection setup, memory registration, and request/reply transport stay inside
+DLSlime.
+
+The public API is:
+
+```python
+from dlslime.rpc import (
+    method,
+    proxy,
+    serve,
+    serve_once,
+    wait_all,
+    RpcError,
+    RemoteRpcError,
+    RpcTimeoutError,
+    SoftRnrMonitor,
+)
+```
+
+## Requirements
+
+SlimeRPC requires:
+
+- NanoCtrl running and reachable by both peers.
+- Redis reachable through NanoCtrl.
+- Connected PeerAgents on both client and worker.
+- A DLSlime build with the C++ RPC session enabled. If the extension was built
+  without RPC support, `proxy()` raises an error asking you to rebuild with
+  `BUILD_RPC=ON`.
+
+```bash
+nanoctrl start
+python examples/python/rpc_example.py --ctrl http://127.0.0.1:3000
+```
+
+## Basic Service
+
+Define a service class and mark RPC-callable methods with `@method`:
+
+```python
+from dlslime.rpc import method
+
+
+class CalcService:
+    @method
+    def add(self, a: int, b: int) -> int:
+        return a + b
+
+    @method
+    def echo(self, msg: str) -> str:
+        return f"echo: {msg}"
+```
+
+By default, SlimeRPC serializes arguments and return values with `pickle`.
+Both sides must import the same service class definition, because method tags
+are assigned deterministically by sorted method name.
+
+## Connect PeerAgents
+
+SlimeRPC does not replace PeerAgent discovery. First create and connect the two
+agents:
+
+```python
+from dlslime import PeerAgent
+
+worker = PeerAgent(nanoctrl_url="http://127.0.0.1:3000", alias="worker:0")
+driver = PeerAgent(nanoctrl_url="http://127.0.0.1:3000", alias="driver:0")
+
+driver_conn = driver.connect_to("worker:0", ib_port=1, qp_num=1)
+worker_conn = worker.connect_to("driver:0", ib_port=1, qp_num=1)
+
+driver_conn.wait()
+worker_conn.wait()
+```
+
+## Worker Side
+
+`serve()` blocks and dispatches requests for one peer:
+
+```python
+from dlslime.rpc import serve
+
+serve(worker, CalcService(), peer="driver:0")
+```
+
+For examples or tests, run it in a daemon thread:
+
+```python
+import threading
+
+threading.Thread(
+    target=serve,
+    args=(worker, CalcService(), "driver:0"),
+    daemon=True,
+).start()
+```
+
+`serve()` can infer `peer` only when exactly one peer is connected. Pass
+`peer=...` explicitly in multi-peer services.
+
+Server dispatch settings:
+
+| Setting                    | Purpose                                                            |
+| -------------------------- | ------------------------------------------------------------------ |
+| `max_workers=`             | Overrides the handler dispatch pool size.                          |
+| `SLIME_RPC_SERVER_WORKERS` | Default handler pool size when `max_workers` is unset.             |
+| `@method(parallel=True)`   | Allows a handler to run concurrently on the same service instance. |
+
+Handlers are serialized by default under a per-service lock. This is safer for
+stateful services such as model runners, NCCL workers, and shared PyTorch
+objects.
+
+## Client Side
+
+Create a proxy from the client PeerAgent to the worker alias:
+
+```python
+from dlslime.rpc import proxy
+
+calc = proxy(driver, "worker:0", CalcService)
+
+future = calc.add(1, 2)
+assert future.wait(timeout=30.0) == 3
+
+echo_future = calc.echo("hello")
+assert echo_future.wait() == "echo: hello"
+```
+
+Every proxy call returns a future. Use `wait()` to receive the result:
+
+```python
+result = calc.add(1, 2).wait()
+```
+
+Wait for several calls in order:
+
+```python
+from dlslime.rpc import wait_all
+
+futures = [calc.add(i, i * 10) for i in range(5)]
+results = wait_all(futures, timeout=30.0)
+```
+
+## Channel Buffers
+
+SlimeRPC creates an RDMA mailbox channel per `(local_agent, peer_agent)` pair.
+The channel registers receive memory under names like:
+
+```text
+rpc:mailbox:<local_alias>:<peer_alias>
+```
+
+Buffer controls:
+
+| Setting                   |                          Default | Purpose                                     |
+| ------------------------- | -------------------------------: | ------------------------------------------- |
+| `agent._rpc_buffer_size`  |               `32_000_000` bytes | Per-inflight request/reply slot size.       |
+| `agent._rpc_max_inflight` | `SLIME_RPC_MAX_INFLIGHT` or `16` | Number of receive slots.                    |
+| `SLIME_RPC_MAX_INFLIGHT`  |                             `16` | Environment fallback for max inflight RPCs. |
+
+Set these before creating the proxy or starting `serve()`:
+
+```python
+driver._rpc_buffer_size = 64 * 1024 * 1024
+driver._rpc_max_inflight = 8
+worker._rpc_buffer_size = 64 * 1024 * 1024
+worker._rpc_max_inflight = 8
+```
+
+Client and worker should use compatible slot sizes for large payloads.
+
+## Raw Mode
+
+Use `@method(raw=True)` to bypass pickle. Raw handlers receive the channel, a
+request pointer, and request byte length:
+
+```python
+import ctypes
+from dlslime.rpc import method
+
+
+class RawEcho:
+    @method(raw=True)
+    def echo(self, channel, ptr, nbytes):
+        request = bytes((ctypes.c_char * nbytes).from_address(ptr))
+        return request.upper()
+```
+
+The client passes one `bytes` payload and receives `bytes`:
+
+```python
+raw = proxy(driver, "worker:0", RawEcho)
+response = raw.echo(b"hello").wait()
+assert response == b"HELLO"
+```
+
+This is the right mode for FlatBuffers, Cap'n Proto, protobuf bytes, or custom
+binary layouts. See `examples/python/rpc_flatbuf_example.py` for a complete
+FlatBuffers loopback.
+
+## In-Place Raw Replies
+
+For very small hot-path handlers, `@method(raw=True, inplace=True)` lets the
+handler write the reply directly into the registered send buffer:
+
+```python
+class InplaceService:
+    @method(raw=True, inplace=True)
+    def echo(self, req_ptr, req_nbytes, resp_ptr, resp_cap) -> int:
+        n = min(req_nbytes, resp_cap)
+        ctypes.memmove(resp_ptr, req_ptr, n)
+        return n
+```
+
+The handler must return the number of bytes written. While the handler runs,
+the session send lock is held, so use this only for very short handlers.
+
+## Error Handling
+
+Common client-visible errors:
+
+| Error             | Meaning                                        |
+| ----------------- | ---------------------------------------------- |
+| `RpcTimeoutError` | `future.wait(timeout=...)` timed out.          |
+| `RemoteRpcError`  | The remote handler raised an exception.        |
+| `RpcError`        | Base class for SlimeRPC client-visible errors. |
+
+```python
+from dlslime.rpc import RemoteRpcError, RpcTimeoutError
+
+try:
+    result = calc.add(1, 2).wait(timeout=1.0)
+except RpcTimeoutError:
+    ...
+except RemoteRpcError as exc:
+    print(exc.type_name, exc.message)
+```
+
+## One-Shot Serving
+
+`serve_once()` dispatches exactly one incoming call and returns. It is useful for
+tests and controlled loops:
+
+```python
+from dlslime.rpc import serve_once
+
+serve_once(worker, CalcService(), peer="driver:0")
+```
+
+## RNR Monitoring
+
+For RDMA receiver-not-ready debugging, use `SoftRnrMonitor`:
+
+```python
+from dlslime.rpc import SoftRnrMonitor
+
+monitor = SoftRnrMonitor()
+monitor.snapshot()
+
+# run workload
+
+print(monitor.delta())
+print(monitor.total_delta())
+```
+
+## Full Examples
+
+- `examples/python/rpc_example.py`: typed pickle RPC loopback.
+- `examples/python/rpc_flatbuf_example.py`: raw FlatBuffers RPC loopback.
+- `bench/python/rpc_bench_slime_worker.py`: benchmark worker.
+- `bench/python/rpc_bench_slime_driver.py`: benchmark driver.
diff --git a/docs/src/guide/slimerpc.zh.md b/docs/src/guide/slimerpc.zh.md
new file mode 100644
index 00000000..cf92e210
--- /dev/null
+++ b/docs/src/guide/slimerpc.zh.md
@@ -0,0 +1,201 @@
+# SlimeRPC
+
+SlimeRPC 是构建在 PeerAgent 和 DLSlime RDMA 数据面上的 typed RPC 层。它适合把应用逻辑写成
+Python 服务调用，同时把连接建立、内存注册和 request/reply 传输交给 DLSlime。
+
+公开 API：
+
+```python
+from dlslime.rpc import (
+    method,
+    proxy,
+    serve,
+    serve_once,
+    wait_all,
+    RpcError,
+    RemoteRpcError,
+    RpcTimeoutError,
+    SoftRnrMonitor,
+)
+```
+
+## 前置条件
+
+- NanoCtrl 正在运行，client 和 worker 都能访问。
+- Redis 可通过 NanoCtrl 发现。
+- 两侧 PeerAgent 已连接。
+- DLSlime 构建时启用了 C++ RPC session。若未启用，`proxy()` 会提示需要用 `BUILD_RPC=ON` 重新构建。
+
+```bash
+nanoctrl start
+python examples/python/rpc_example.py --ctrl http://127.0.0.1:3000
+```
+
+## 定义服务
+
+用 `@method` 标记可远程调用的方法：
+
+```python
+from dlslime.rpc import method
+
+
+class CalcService:
+    @method
+    def add(self, a: int, b: int) -> int:
+        return a + b
+
+    @method
+    def echo(self, msg: str) -> str:
+        return f"echo: {msg}"
+```
+
+默认使用 `pickle` 序列化参数和返回值。两端需要导入同一个 service class 定义，因为 method tag 会按方法名排序确定。
+
+## 连接 PeerAgent
+
+SlimeRPC 依赖 PeerAgent 连接，不替代 discovery 流程：
+
+```python
+from dlslime import PeerAgent
+
+worker = PeerAgent(nanoctrl_url="http://127.0.0.1:3000", alias="worker:0")
+driver = PeerAgent(nanoctrl_url="http://127.0.0.1:3000", alias="driver:0")
+
+driver_conn = driver.connect_to("worker:0", ib_port=1, qp_num=1)
+worker_conn = worker.connect_to("driver:0", ib_port=1, qp_num=1)
+
+driver_conn.wait()
+worker_conn.wait()
+```
+
+## Worker 侧
+
+`serve()` 会阻塞并处理某个 peer 的请求：
+
+```python
+from dlslime.rpc import serve
+
+serve(worker, CalcService(), peer="driver:0")
+```
+
+在示例或测试中可放到后台线程：
+
+```python
+import threading
+
+threading.Thread(
+    target=serve,
+    args=(worker, CalcService(), "driver:0"),
+    daemon=True,
+).start()
+```
+
+`serve()` 只有在恰好连接一个 peer 时才能省略 `peer`。多 peer 服务应显式传 `peer=...`。
+
+## Client 侧
+
+```python
+from dlslime.rpc import proxy, wait_all
+
+calc = proxy(driver, "worker:0", CalcService)
+
+future = calc.add(1, 2)
+assert future.wait(timeout=30.0) == 3
+
+futures = [calc.add(i, i * 10) for i in range(5)]
+results = wait_all(futures, timeout=30.0)
+```
+
+每次 proxy 调用都会返回 future，通过 `wait()` 获取结果。
+
+## Channel Buffer
+
+SlimeRPC 会为每个 `(local_agent, peer_agent)` 创建一个 RDMA mailbox channel，并注册类似下面的 MR：
+
+```text
+rpc:mailbox:<local_alias>:<peer_alias>
+```
+
+常用参数：
+
+| 设置                      |                           默认值 | 作用                                    |
+| ------------------------- | -------------------------------: | --------------------------------------- |
+| `agent._rpc_buffer_size`  |               `32_000_000` bytes | 每个 inflight request/reply slot 大小。 |
+| `agent._rpc_max_inflight` | `SLIME_RPC_MAX_INFLIGHT` 或 `16` | receive slot 数量。                     |
+| `SLIME_RPC_MAX_INFLIGHT`  |                             `16` | max inflight 的环境变量默认值。         |
+
+创建 proxy 或启动 `serve()` 前设置：
+
+```python
+driver._rpc_buffer_size = 64 * 1024 * 1024
+driver._rpc_max_inflight = 8
+worker._rpc_buffer_size = 64 * 1024 * 1024
+worker._rpc_max_inflight = 8
+```
+
+## Raw 模式
+
+`@method(raw=True)` 会跳过 pickle。handler 接收 channel、请求指针和请求字节数：
+
+```python
+import ctypes
+from dlslime.rpc import method
+
+
+class RawEcho:
+    @method(raw=True)
+    def echo(self, channel, ptr, nbytes):
+        request = bytes((ctypes.c_char * nbytes).from_address(ptr))
+        return request.upper()
+```
+
+客户端传入一个 `bytes`，返回值也是 `bytes`：
+
+```python
+raw = proxy(driver, "worker:0", RawEcho)
+response = raw.echo(b"hello").wait()
+```
+
+Raw 模式适合 FlatBuffers、protobuf bytes 或自定义二进制布局。完整示例见
+`examples/python/rpc_flatbuf_example.py`。
+
+## In-Place Raw 回复
+
+`@method(raw=True, inplace=True)` 允许 handler 直接把回复写入已注册的 send buffer：
+
+```python
+class InplaceService:
+    @method(raw=True, inplace=True)
+    def echo(self, req_ptr, req_nbytes, resp_ptr, resp_cap) -> int:
+        n = min(req_nbytes, resp_cap)
+        ctypes.memmove(resp_ptr, req_ptr, n)
+        return n
+```
+
+handler 返回写入的字节数。执行期间 session send lock 会被持有，因此只适合非常短的 hot-path handler。
+
+## 错误处理
+
+| 错误              | 含义                              |
+| ----------------- | --------------------------------- |
+| `RpcTimeoutError` | `future.wait(timeout=...)` 超时。 |
+| `RemoteRpcError`  | 远端 handler 抛出异常。           |
+| `RpcError`        | SlimeRPC 客户端可见错误基类。     |
+
+```python
+from dlslime.rpc import RemoteRpcError, RpcTimeoutError
+
+try:
+    result = calc.add(1, 2).wait(timeout=1.0)
+except RpcTimeoutError:
+    ...
+except RemoteRpcError as exc:
+    print(exc.type_name, exc.message)
+```
+
+## 示例
+
+- `examples/python/rpc_example.py`：typed pickle RPC loopback。
+- `examples/python/rpc_flatbuf_example.py`：raw FlatBuffers RPC loopback。
+- `bench/python/rpc_bench_slime_worker.py`：benchmark worker。
+- `bench/python/rpc_bench_slime_driver.py`：benchmark driver。