From 6765dc54bd4a0afd95c02d9559fe2aebcd4f39a1 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 01:20:38 +0800 Subject: [PATCH 01/20] gpu --- .gitignore | 4 +- benchmark/test_gpu.py | 25 +++ csrc/binding.cpp | 119 +++++++++++- csrc/blake3.h | 9 +- csrc/blake3_sm80.cu | 410 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 189 +++++++++++++++---- 6 files changed, 722 insertions(+), 34 deletions(-) create mode 100644 benchmark/test_gpu.py create mode 100644 csrc/blake3_sm80.cu diff --git a/.gitignore b/.gitignore index aebc9d7..dfa07fd 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ run.sh build/ dist/ -*.egg-info/ \ No newline at end of file +*.egg-info/ + +*.ncu-rep \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py new file mode 100644 index 0000000..c867025 --- /dev/null +++ b/benchmark/test_gpu.py @@ -0,0 +1,25 @@ +import flashashing as fh +import hashlib +import torch +import time + +GiB = 1024**3 + +s = "A" * GiB # 1 GiB +data = s.encode("utf-8") +print(len(data)) # 1073741824 + +t0 = time.perf_counter() +repeat = 20 +for i in range(repeat): + cv_hex = fh.blake3_gpu_sm80_hex(data) +torch.cuda.synchronize() + +t1 = time.perf_counter() +elapsed = t1 - t0 +print(f"Elapsed time for f{repeat}x BLAKE3 (GPU SM80): {elapsed:.3f} seconds") +print(f"Throughput: {repeat * len(data) / elapsed / (1024**2):.2f} MiB/s") +print("root CV (hex) =", cv_hex) + +expected_sha256_result = hashlib.sha256(data).hexdigest() +print(f"SHA256 Expected: {expected_sha256_result}") \ No newline at end of file diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 2da163f..54bb3d6 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -76,6 +76,109 @@ static std::string blake3_hash_naive(py::object obj) { } } + +struct PyBytesView { + const uint8_t* ptr = nullptr; + size_t len = 0; + std::string storage; + py::object keep_alive; +}; + +static PyBytesView get_bytes_view(py::object obj) { + PyBytesView v; + + // bytes -> 复制到 storage + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // str -> 按 UTF-8 编码复制;如果你想按“原始字节”处理,建议在 Python 侧先 .encode() + if (py::isinstance(obj)) { + v.storage = obj.cast(); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // bytearray -> 正确姿势:转成 py::bytes 再拿 std::string(不要用迭代器) + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // 任意 buffer(memoryview / numpy 等) + if (PyObject_CheckBuffer(obj.ptr())) { + py::buffer buf = py::reinterpret_borrow(obj); + py::buffer_info info = buf.request(/*writable=*/false); + + // 一维 C 连续:零拷贝,注意保活 + if (info.ndim == 1 && + info.strides.size() == 1 && + info.strides[0] == static_cast(info.itemsize)) { + v.ptr = reinterpret_cast(info.ptr); + v.len = static_cast(info.size) * static_cast(info.itemsize); + v.keep_alive = obj; // 保活底层内存 + return v; + } + + // 非连续:用 tobytes() 拷贝成线性字节 + py::object tobytes = obj.attr("tobytes")(); + v.storage = static_cast(py::bytes(tobytes)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + throw std::invalid_argument( + "blake3_gpu_root_* expects bytes/str/bytearray/memoryview/numpy buffer."); +} + +static std::string cv_words_to_bytes_le(const std::array& cv) { + std::string out; + out.resize(32); + uint8_t* p = reinterpret_cast(&out[0]); + for (int i = 0; i 
< 8; ++i) { + uint32_t w = cv[i]; + p[4*i + 0] = static_cast( w & 0xFF); + p[4*i + 1] = static_cast((w >> 8) & 0xFF); + p[4*i + 2] = static_cast((w >> 16) & 0xFF); + p[4*i + 3] = static_cast((w >> 24) & 0xFF); + } + return out; +} + +struct GilRelease { + py::gil_scoped_release rel; +}; + +static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { + auto v = get_bytes_view(obj); + std::array root{}; + { + GilRelease _g; + blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); + } + std::string b = cv_words_to_bytes_le(root); + return py::bytes(b); +} + +static std::string blake3_gpu_root_hex(py::object obj) { + auto v = get_bytes_view(obj); + std::array root{}; + { + GilRelease _g; + blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); + } + std::string b = cv_words_to_bytes_le(root); + return bytes_to_hex(reinterpret_cast(b.data()), b.size()); +} + + PYBIND11_MODULE(flashashing, m) { m.doc() = "SHA-256 and Blake3 bindings (pybind11)"; @@ -84,9 +187,23 @@ PYBIND11_MODULE(flashashing, m) { "Compute SHA-256 of a str/bytes and return hex string."); m.def("hash_simd", &hash_hex_simd4, py::arg("data"), - "SIMD enhanced sha256."), + "SIMD enhanced sha256."); m.def("blake3_hash_naive", &blake3_hash_naive, py::arg("data"), "Compute BLAKE3 hash (single-threaded)."); + + m.def("blake3_gpu_sm80", + &blake3_gpu_root_cv_bytes, + py::arg("data"), + R"pbdoc( +Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. +NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. +)pbdoc"); + m.def("blake3_gpu_sm80_hex", + &blake3_gpu_root_hex, + py::arg("data"), + R"pbdoc( +Return the hex string of the *root chaining value* (CV) computed on GPU. +)pbdoc"); } \ No newline at end of file diff --git a/csrc/blake3.h b/csrc/blake3.h index 55aaa43..c1e27de 100644 --- a/csrc/blake3.h +++ b/csrc/blake3.h @@ -2,6 +2,9 @@ #include #include +#include +#include +#include namespace flashashing { @@ -23,4 +26,8 @@ class Blake3 { std::string bytes_to_hex(const uint8_t *data, size_t len); -} // namespace flashashing \ No newline at end of file +} // namespace flashashing + +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0); \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm80.cu new file mode 100644 index 0000000..53c3575 --- /dev/null +++ b/csrc/blake3_sm80.cu @@ -0,0 +1,410 @@ + +#include +#include +#include +#include + +#define WARP_SIZE 32 + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; +// 轮常量/消息置换表…(略) +// TODO: 按 BLAKE3/BLAKE2s 规范补齐 + +// ---- 小工具 ---- +__device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { + // 使用 funnel shift 更快(算力>=Volta都有) + return __funnelshift_r(x, x, n); +} + +__device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B + const uint32_t cv[8], // 8×u32 + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; 
i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + // 混入计数、block_len、flags,和 block_words(简化占位) + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + +#pragma unroll + for (int i = 0; i < 16; ++i) { + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? (1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============ +// Each block has 512 threads +// 1 warp process 32 chunk -> 32 KiB +// NUM_WARPS = 512 / 32 = 16 +// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 16 = 512 B shared memory in sum + __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflict + + // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1 + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id; + const uint64_t chunk_counter = base_chunk_counter + global_warp_id; + + // index + const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the start of this warp + // each thread process one chunk + const uint64_t chunk_idx = warp_chunk_base + lane_id; + + // edge processing + int valid = total_chunks - warp_chunk_base; + if (valid <= 0) return; // TODO: will this affect warp shfl? 
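        // NOTE (answering the TODO above): `valid` depends only on warp_chunk_base,
        // which is identical for all 32 lanes of a warp, so this early return is
        // warp-uniform -- whole warps exit together, and the __ballot_sync /
        // __shfl_down_sync calls below never mix exited and live lanes of one warp.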
+ if (valid > WARP_SIZE) valid = WARP_SIZE; + + // compute idx for this thread + const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + + uint32_t cv[8] = {0}; // 8 x u32 + bool active = lane_id < valid; + if (active) { + const uint64_t chunk_counter = base_chunk_counter + chunk_idx; + blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + } + + // take care: we cannot use general reduce + // 0-1-2-3-4-...-31, keep this sequential + unsigned mask = __ballot_sync(0xFFFFFFFFu, active); + // step = 1,2,4,8,16 + for (int step = 1; step < WARP_SIZE; step <<= 1) { + int partner_lane = lane_id + step; + + // neighbor cv + uint32_t neighbor_cv[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + } + + // the left be parent, and make sure `the right` is valid + if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { + blake3_parent_cv(cv, neighbor_cv, cv); + } + __syncwarp(mask); + + // in the next level, reduce half of active threads + if (lane_id >= (valid & ~(step))) + active = false; + } + + // now, lane 0 holds the root + if (lane_id == 0) { +#pragma unroll + for (int j = 0 ; j < 8; ++j) + cv_smem[warp_id][j] = cv[j]; + } + __syncthreads(); + + // after all these things, we have finished + // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge + // and now, we are going to implement higher-level merge + // we have 16 warps, each warp has a root cv + // so we are going to execute another logN steps + + // 16 -> 8 -> 4 -> 2 -> 1 + for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); +#pragma unroll + for (int j=0;j<8;++j) + cv_smem[warp_id][j] = p[j]; // write back to shared memory + } + __syncthreads(); + } + + // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep + if (tid == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv_smem[0][j]; + } + +} // blake3_block_reduce_kernel + + +// ============ Tiny kernel ============ +// In big kernel, it will consume 512 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root +// And this tiny kernel is designed to process these 2048 root +template +__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int n) { + extern __shared__ uint32_t smem[]; // dynamic shared memory + uint32_t* tile = smem; // -> [tile_n][8] + + const int tid = threadIdx.x; + const int b = blockIdx.x; + const int B = gridDim.x; + + const int start = (int)((1ll * n * b) / B); + const int end = (int)((1ll * n * (b+1)) / B); + int tile_n = end - start; + + if (tile_n <= 0) return; // border + + const int words = tile_n * 8; + for (int w = tid; w < words; w += NUM_THREADS) { + tile[w] = in_cv32[start * 8 + w]; + } + __syncthreads(); + + int cur = tile_n; + while (cur > 1) { + const int pairs = cur >> 1; // floor(cur/2) + // process pairs + for (int i = tid; i < pairs; i += NUM_THREADS) { + const uint32_t* L = &tile[(2*i) * 8]; + const uint32_t* R = &tile[(2*i+1) * 8]; + uint32_t p[8]; + blake3_parent_cv(L, R, p); +#pragma unroll + for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + } + __syncthreads(); + + // even situation: + if ((cur & 1) && tid == 0) { + uint32_t* dst = &tile[pairs * 8]; + uint32_t* src = &tile[(cur - 1) * 8]; +#pragma unroll + for (int j=0;j<8;++j) + dst[j] = src[j]; + } + __syncthreads(); + + cur = pairs + (cur & 1); + } + + // write output + if (tid == 0) { + uint32_t* out = &out_cv32[b * 8]; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = tile[j]; + } +} + +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + + uint8_t* d_bytes = nullptr; + uint32_t* d_words = nullptr; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // TODO: use thrust + cudaMalloc(&d_bytes, bytes_len); + cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); + d_words = reinterpret_cast(d_bytes); + + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + + // launch big kernel + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + 
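        // NOTE: host_root is pageable stack memory, so this cudaMemcpyAsync may
        // fall back to a synchronous staged copy; either way, the synchronize
        // below is what guarantees host_root is valid before it is read.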
CUDA_CHECK(cudaStreamSynchronize(stream)); + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + const int B = (num_blocks >= 8) ? 8 : num_blocks; + uint32_t* d_midCV = nullptr; + cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); + + { + dim3 grid(B); + dim3 block(512); // 你指定 “每个 block 512 线程” + // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 + const int tile = (num_blocks + B - 1) / B; + const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); + + blake3_pair_reduce_kernel<512> + <<>>(d_blockCV, d_midCV, num_blocks); + CUDA_CHECK(cudaGetLastError()); + } + + // second round + uint32_t* d_root = nullptr; + cudaMalloc(&d_root, 8u * sizeof(uint32_t)); + + { + dim3 grid(1); + dim3 block(B); + const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); + + // generate kernel during compile time + switch (B) { + case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; + case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; + case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; + case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; + case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; + case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; + case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; + default: { + blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); + } break; + } + CUDA_CHECK(cudaGetLastError()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_root)); + CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/setup.py b/setup.py index eb3a2e4..749c743 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,166 @@ +# setup.py from setuptools import setup -from pybind11.setup_helpers import Pybind11Extension, build_ext -import sys -import sysconfig - -import numpy as np - -cxx_std = 17 -extra_compile_args = [] -extra_link_args = [] - -# linux environment, default -extra_compile_args += ["-O2", "-ffast-math", "-march=native", "-fopenmp", "-Wall", "-Wextra", "-Wpedantic", "-mavx2", "-mfma"] -extra_link_args += ["-fopenmp"] - -ext_modules = [ - Pybind11Extension( - "flashashing", - sources=[ - "csrc/sha256_base.cpp", - "csrc/sha256_simd.cpp", - "csrc/blake3_base.cpp", - "csrc/binding.cpp" - ], - include_dirs=[np.get_include()], - cxx_std=cxx_std, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, - ) +from setuptools.command.build_ext import build_ext +from pybind11.setup_helpers import Pybind11Extension +import pybind11, numpy as np +import sys, os, shutil + +def find_in_path(name, path): + for d in path.split(os.pathsep): + p = os.path.join(d, name) + if os.path.exists(p): + return os.path.abspath(p) + return None + +def locate_cuda(): + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + nvcc = None + if cuda_home: + nvcc = os.path.join(cuda_home, "bin", "nvcc") + else: + nvcc = 
find_in_path("nvcc", os.environ.get("PATH", "")) + if nvcc: + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + if not nvcc or not os.path.exists(nvcc): + raise RuntimeError("nvcc not found. Set CUDA_HOME/CUDA_PATH or add nvcc to PATH.") + cuda_include = os.path.join(cuda_home, "include") + if sys.platform.startswith("win"): + cuda_libdir = os.path.join(cuda_home, "lib", "x64") + else: + cuda_libdir = os.path.join(cuda_home, "lib64") + return {"home": cuda_home, "nvcc": nvcc, "include": cuda_include, "libdir": cuda_libdir} + +CUDA = locate_cuda() + +CXX_STD = 17 + +arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;86;89").split(";") +NVCC_ARCH_FLAGS = [] +for a in arch_list: + a = a.strip() + if a: + NVCC_ARCH_FLAGS += ["-gencode", f"arch=compute_{a},code=sm_{a}"] + +COMMON_DEFINES = [] +COMMON_INCLUDES = [np.get_include(), pybind11.get_include(), pybind11.get_include(user=True), CUDA["include"]] +COMMON_LIB_DIRS = [CUDA["libdir"]] +COMMON_LIBS = ["cudart"] +RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] + +CXX_FLAGS = [ + f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math" ] +LINK_FLAGS = [] + +# OpenMP +if sys.platform.startswith("win"): + # MSVC + CXX_FLAGS += ["/openmp"] +else: + CXX_FLAGS += ["-fopenmp"] + LINK_FLAGS += ["-fopenmp"] + +NVCC_FLAGS = [ + f"-std=c++{CXX_STD}", + "-O3", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + "-lineinfo", +] + NVCC_ARCH_FLAGS + +if not sys.platform.startswith("win"): + NVCC_FLAGS += ["-Xcompiler", "-fopenmp"] +else: + # MSVC 的 NVCC 透传 + NVCC_FLAGS += ["-Xcompiler", "/openmp", "-Xcompiler", "/MD", "-Xcompiler", "/O2"] + +# give .cu to nvcc +from distutils.unixccompiler import UnixCCompiler + +class BuildExtNVCC(build_ext): + def build_extensions(self): + + self.compiler.src_extensions.append(".cu") + + original_compile = self.compiler.compile + + def nvcc_compile(sources, output_dir=None, macros=None, include_dirs=None, + debug=0, extra_preargs=None, extra_postargs=None, depends=None): + cxx_sources, cu_sources = [], [] + for s in sources: + (cu_sources if os.path.splitext(s)[1] == ".cu" else cxx_sources).append(s) + + objects = [] + if cxx_sources: + + postargs = extra_postargs.get("cxx", []) if isinstance(extra_postargs, dict) else extra_postargs + objects += original_compile( + cxx_sources, output_dir, macros, include_dirs, debug, extra_preargs, postargs, depends + ) + + if cu_sources: + + for src in cu_sources: + obj = self.compiler.object_filenames([src], output_dir=output_dir)[0] + cmd = [CUDA["nvcc"], "-c", src, "-o", obj] + NVCC_FLAGS + + incs = include_dirs or [] + for inc in incs: + cmd += ["-I", inc] + + if macros: + for m in macros: + if isinstance(m, tuple): + name, val = m + cmd += ["-D%s=%s" % (name, val)] + else: + cmd += ["-D%s" % m] + + if isinstance(extra_postargs, dict): + cmd += extra_postargs.get("nvcc", []) + elif extra_postargs: + cmd += extra_postargs + + os.makedirs(os.path.dirname(obj), exist_ok=True) + self.spawn(cmd) + objects.append(obj) + return objects + + self.compiler.compile = nvcc_compile + + for ext in self.extensions: + if not sys.platform.startswith("win"): + ext.runtime_library_dirs = list(set((ext.runtime_library_dirs or []) + RPATH)) + build_ext.build_extensions(self) + +# ---------- 扩展模块 ---------- +sources = [ + "csrc/sha256_base.cpp", + "csrc/sha256_simd.cpp", + "csrc/blake3_base.cpp", + "csrc/blake3_sm80.cu", + "csrc/binding.cpp", +] + +ext = Pybind11Extension( + "flashashing", + sources=sources, + 
include_dirs=COMMON_INCLUDES, + library_dirs=COMMON_LIB_DIRS, + libraries=COMMON_LIBS, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": [] + }, + extra_link_args=LINK_FLAGS, + define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES], +) setup( name="flashashing", version="0.1.0", - description="High performance hashing (SHA-256, BLAKE3) implementation with pybind11", - ext_modules=ext_modules, - cmdclass={"build_ext": build_ext}, + description="High performance hashing (SHA-256, BLAKE3) with CUDA + pybind11", + ext_modules=[ext], + cmdclass={"build_ext": BuildExtNVCC}, zip_safe=False, ) From 4f31a2c55551965a2e5f048565f021c4551554ae Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 14:03:27 +0800 Subject: [PATCH 02/20] finish v1 building --- .gitignore | 1 + benchmark/test_gpu.py | 8 +++++--- csrc/blake3_sm80.cu | 41 ++++++++++++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index dfa07fd..5f07045 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ run.sh +profile.sh **.__pycache__ diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index c867025..75fcef9 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -2,13 +2,16 @@ import hashlib import torch import time +import blake3 -GiB = 1024**3 +GiB = 1024*1024*1024 # bytes -> 1 GiB s = "A" * GiB # 1 GiB data = s.encode("utf-8") print(len(data)) # 1073741824 +std_hex = blake3.blake3(data).hexdigest() + t0 = time.perf_counter() repeat = 20 for i in range(repeat): @@ -21,5 +24,4 @@ print(f"Throughput: {repeat * len(data) / elapsed / (1024**2):.2f} MiB/s") print("root CV (hex) =", cv_hex) -expected_sha256_result = hashlib.sha256(data).hexdigest() -print(f"SHA256 Expected: {expected_sha256_result}") \ No newline at end of file +print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm80.cu index 53c3575..a58ad5f 100644 --- a/csrc/blake3_sm80.cu +++ b/csrc/blake3_sm80.cu @@ -15,20 +15,21 @@ } \ } while(0) -__constant__ uint32_t BLAKE3_IV[8] = { +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u }; -// 轮常量/消息置换表…(略) -// TODO: 按 BLAKE3/BLAKE2s 规范补齐 // ---- 小工具 ---- -__device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { - // 使用 funnel shift 更快(算力>=Volta都有) +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif } -__device__ void blake3_compress_words_7r( +__host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B const uint32_t cv[8], // 8×u32 uint64_t chunk_counter, // 64-bit @@ -60,7 +61,7 @@ __device__ void blake3_compress_words_7r( } // 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) -__device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ #pragma unroll for (int i = 0; i < 8; ++i) out_cv[i] = st[i] ^ st[8+i]; @@ -288,6 +289,23 @@ __global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, } } +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + uint32_t zero_block[16] = {0}; + uint32_t st[16]; + 
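    // NOTE: per the BLAKE3 spec, the ROOT compression must reuse the root node's
    // own block words and block_len, not an all-zero 64-byte block; this
    // zero-block call is only consistent with the placeholder compression above
    // and needs revisiting once the real 7 rounds land (a spec-correct sketch is
    // appended at the end of this excerpt).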
blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, std::array* root_out = nullptr, cudaStream_t stream = 0) { @@ -333,6 +351,11 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + if (root_out) *root_out = host_root; else { // 简单打印 @@ -394,6 +417,10 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + if (root_out) { *root_out = host_root; } else { From 2e71051367a9533af36f6c54a46876e20bc0bab5 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 21:17:46 +0800 Subject: [PATCH 03/20] coalsced gmem access --- README.md | 7 +- csrc/blake3_sm80.cu | 636 ++++++++++++++++++++++++++++++----------- csrc/blake3_sm80_v1.cu | 454 +++++++++++++++++++++++++++++ setup.py | 12 +- 4 files changed, 942 insertions(+), 167 deletions(-) create mode 100644 csrc/blake3_sm80_v1.cu diff --git a/README.md b/README.md index ef873a4..854f865 100644 --- a/README.md +++ b/README.md @@ -19,4 +19,9 @@ python benchmark/test_script.py + 10.3: Implement `SHA256-SIMD` version, still some flaws: only allow short string processing (`len < 55`) + 10.2: Consultation with doctor, invite to repo, add `BLAKE3` basic implementations -+ 10.1: Provide basic template for project, implementing basic sha256 in C++. \ No newline at end of file ++ 10.1: Provide basic template for project, implementing basic sha256 in C++. 
+ +# GPU kernel performance on File Compress hashing + ++ 10.5 - v1 - [commit:`4f31a2c55551965a2e5f048565f021c4551554ae`]: 1709.34 MiB/s ++ 10.5 - v2 - []: 1961.36 MiB/s \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm80.cu index a58ad5f..bbc2ae3 100644 --- a/csrc/blake3_sm80.cu +++ b/csrc/blake3_sm80.cu @@ -3,8 +3,10 @@ #include #include #include +#include #define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) #define CUDA_CHECK(expr) do { \ cudaError_t _e = (expr); \ @@ -29,9 +31,18 @@ __host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { #endif } +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + __host__ __device__ void blake3_compress_words_7r( - const uint32_t block_words[16], // 64B - const uint32_t cv[8], // 8×u32 + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory uint64_t chunk_counter, // 64-bit uint32_t block_len, // [0..64] uint32_t flags, // CHUNK_START/END/PARENT/ROOT… @@ -46,14 +57,18 @@ __host__ __device__ void blake3_compress_words_7r( for (int i = 0; i < 8; ++i) out_state[8+i] = BLAKE3_IV[i]; - // 混入计数、block_len、flags,和 block_words(简化占位) out_state[12] ^= (uint32_t)chunk_counter; out_state[13] ^= (uint32_t)(chunk_counter >> 32); out_state[14] ^= block_len; out_state[15] ^= flags; + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + #pragma unroll - for (int i = 0; i < 16; ++i) { + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); out_state[i] ^= block_words[i]; // 做一点点搅动(占位) out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); @@ -115,14 +130,13 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 blake3_state_to_cv(st, out_cv); } -// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============ -// Each block has 512 threads -// 1 warp process 32 chunk -> 32 KiB -// NUM_WARPS = 512 / 32 = 16 -// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) template // pad shared memory + const int CHUNK_SIZE=1024, + const int CHUNKS_PER_BLOCK=64, + const int PAD_CHUNK=4, + const int PAD_CV=4> // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, int chunk_len_bytes, @@ -130,169 +144,451 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, int total_chunks) { // NUM_WARPS also stands for NUM_CHUKNS per block constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total - // 8 x 4 x 16 = 512 B shared memory in sum - __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflict + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB 
shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] - // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1 const int tid = threadIdx.x; + const int bx = blockIdx.x; const int warp_id = tid / WARP_SIZE; const int lane_id = tid % WARP_SIZE; - const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id; - const uint64_t chunk_counter = base_chunk_counter + global_warp_id; + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 - // index - const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the start of this warp - // each thread process one chunk - const uint64_t chunk_idx = warp_chunk_base + lane_id; + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. + uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; s_u32[4*idx0 + 1] = v0.y; // when load into shared mem, do manually + s_u32[4*idx0 + 2] = v0.z; s_u32[4*idx0 + 3] = v0.w; + s_u32[4*idx1 + 0] = v1.x; s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } - // edge processing - int valid = total_chunks - warp_chunk_base; - if (valid <= 0) return; // TODO: will this affect warp shfl? 
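    // NOTE: the load pattern in STAGE 1 above is what buys the v2 speedup: per
    // iteration each warp owns one chunk row, and lane k issues two 16-byte loads
    // at uint4 indices k and k+32, so a warp covers its 1 KiB chunk with fully
    // coalesced 128-bit transactions instead of v1's per-thread 1 KiB strided walk.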
- if (valid > WARP_SIZE) valid = WARP_SIZE; + __syncthreads(); // sync all warps - // compute idx for this thread - const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks - uint32_t cv[8] = {0}; // 8 x u32 - bool active = lane_id < valid; - if (active) { - const uint64_t chunk_counter = base_chunk_counter + chunk_idx; - blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 } + // __syncthreads(); - // take care: we cannot use general reduce - // 0-1-2-3-4-...-31, keep this sequential - unsigned mask = __ballot_sync(0xFFFFFFFFu, active); - // step = 1,2,4,8,16 - for (int step = 1; step < WARP_SIZE; step <<= 1) { - int partner_lane = lane_id + step; + auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { + const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); + }; - // neighbor cv - uint32_t neighbor_cv[8]; -#pragma unroll - for (int j = 0; j < 8; ++j) { - neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; + bool have_pass0 = false, have_pass1 = false; + + // ------- pass = 0 ------- + { + const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 + const int right = left + 1; // neighbor: 1,3,5,...,31 + const int pair_idx = left >> 1; // 0..15 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && left < pass0_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && right < pass0_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; } - // the left be parent, and make sure `the right` is valid - if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { - blake3_parent_cv(cv, neighbor_cv, cv); + // merge two neighbor + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && left < pass0_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; } - __syncwarp(mask); + __syncwarp(); + } + + // ---- pass 1 ---- + { + const int left = 32 + (warp_id << 1); // 32,34,...,62 + const int right = left + 1; // 33,35,...,63 + const int pair_idx = left >> 1; // 16..31 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; - // in the next level, reduce half of active threads - if (lane_id >= (valid & ~(step))) - active = false; + if (lane_id == 0 && (left - 32) < pass1_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 
&& (right - 32) < pass1_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // TODO: here we may have some issue: overflow and border situation + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && (left - 32) < pass1_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // write to the right position + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); } - // now, lane 0 holds the root - if (lane_id == 0) { -#pragma unroll - for (int j = 0 ; j < 8; ++j) - cv_smem[warp_id][j] = cv[j]; + __syncthreads(); + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ +void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ +void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; 
// 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... + for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 __syncthreads(); - // after all these things, we have finished - // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge - // and now, we are going to implement higher-level merge - // we have 16 warps, each warp has a root cv - // so we are going to execute another logN steps + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } - // 16 -> 8 -> 4 -> 2 -> 1 - for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { - if (warp_id < stride && lane_id == 0) { - uint32_t p[8]; - blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); -#pragma unroll - for (int j=0;j<8;++j) - cv_smem[warp_id][j] = p[j]; // write back to shared memory + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); } - __syncthreads(); + lane_valid = true; } - // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep - if (tid == 0) { - uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; -#pragma unroll + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll for (int j = 0; j < 8; ++j) - out[j] = cv_smem[0][j]; + lane_cv[j] = 0u; } -} // blake3_block_reduce_kernel + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } -// ============ Tiny kernel ============ -// In big kernel, it will consume 512 KiB each block -// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root -// And this tiny kernel is designed to process these 2048 root -template -__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, - uint32_t* __restrict__ out_cv32, - int n) { - extern __shared__ uint32_t smem[]; // dynamic shared memory - uint32_t* tile = smem; // -> [tile_n][8] - - const int tid = threadIdx.x; - const int b = blockIdx.x; - const int B = gridDim.x; - - const int start = (int)((1ll * n * b) / B); - const int end = (int)((1ll * n * (b+1)) / B); - int tile_n = end - start; - - if (tile_n <= 0) return; // border - - const int words = tile_n * 8; - for (int w = tid; w < words; w += NUM_THREADS) { - tile[w] = in_cv32[start * 8 + w]; + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; } __syncthreads(); - int cur = tile_n; - while (cur > 1) { - const int pairs = cur >> 1; // floor(cur/2) - // process pairs - for (int i = tid; i < pairs; i += NUM_THREADS) { - const uint32_t* L = &tile[(2*i) * 8]; - const uint32_t* R = &tile[(2*i+1) * 8]; + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { uint32_t p[8]; - blake3_parent_cv(L, R, p); -#pragma unroll - for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; } __syncthreads(); - - // even situation: - if ((cur & 1) && tid == 0) { - uint32_t* dst = &tile[pairs * 8]; - uint32_t* src = &tile[(cur - 1) * 8]; -#pragma unroll - for (int j=0;j<8;++j) - dst[j] = src[j]; + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = 
warp_roots[valid_warps-1][j]; } __syncthreads(); - - cur = pairs + (cur & 1); + valid_warps = (valid_warps + 1) >> 1; } - // write output - if (tid == 0) { - uint32_t* out = &out_cv32[b * 8]; -#pragma unroll + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll for (int j = 0; j < 8; ++j) - out[j] = tile[j]; + out[j] = warp_roots[0][j]; } } constexpr uint32_t FLAG_ROOT = 8; inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { - uint32_t zero_block[16] = {0}; + const uint32_t zero_block[16] = {0}; uint32_t st[16]; blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); // 写出前 32 字节(state[0..7],小端) @@ -315,14 +611,28 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, std::abort(); } + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 constexpr int NUM_THREADS = 512; // for big kernel constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 - constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; - const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks uint8_t* d_bytes = nullptr; uint32_t* d_words = nullptr; // alias @@ -333,18 +643,19 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); d_words = reinterpret_cast(d_bytes); - cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB - // launch big kernel + // ============= launch big kernel ============= dim3 grid_big(num_blocks); dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); if (num_blocks == 1) { std::array host_root{}; @@ -371,49 +682,44 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, } // the first round of tiny kernel - const int B = (num_blocks >= 8) ? 
8 : num_blocks; - uint32_t* d_midCV = nullptr; - cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); - + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 { - dim3 grid(B); - dim3 block(512); // 你指定 “每个 block 512 线程” - // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 - const int tile = (num_blocks + B - 1) / B; - const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); - - blake3_pair_reduce_kernel<512> - <<>>(d_blockCV, d_midCV, num_blocks); + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // second round - uint32_t* d_root = nullptr; - cudaMalloc(&d_root, 8u * sizeof(uint32_t)); - + uint32_t* d_root_cv = nullptr; { - dim3 grid(1); - dim3 block(B); - const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); - - // generate kernel during compile time - switch (B) { - case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; - case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; - case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; - case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; - case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; - case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; - case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; - default: { - blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); - } break; - } + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } std::array host_root{}; - CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -430,8 +736,8 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, } // clear - CUDA_CHECK(cudaFree(d_root)); - CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); CUDA_CHECK(cudaFree(d_blockCV)); CUDA_CHECK(cudaFree(d_bytes)); } \ No newline at end of file diff --git a/csrc/blake3_sm80_v1.cu b/csrc/blake3_sm80_v1.cu new file mode 100644 index 0000000..5f08a97 --- /dev/null +++ b/csrc/blake3_sm80_v1.cu @@ -0,0 +1,454 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 
0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> 16 x 4 bytes = 64 B = 4 x 16 bytes = 4 x 128 bits + const uint32_t cv[8], // 8×u32 + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (illustrative: bit1)
+
+        blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st);
+        blake3_state_to_cv(st, cv);
+    }
+
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_cv[i] = cv[i];
+}
+
+__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){
+    uint32_t msg[16];
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[i] = L[i];
+    }
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[8+i] = R[i];
+    }
+    uint32_t st[16];
+    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st);
+    blake3_state_to_cv(st, out_cv);
+}
+
+// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============
+// Each block has 512 threads
+// 1 warp processes 32 chunks -> 32 KiB
+// NUM_WARPS = 512 / 32 = 16
+// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB
+template <const int NUM_THREADS = 512, const int CHUNK_SIZE = 1024, const int PADSIZE = 0> // pad shared memory
+__global__ void blake3_block_reduce_kernel(uint32_t* d_input,
+                                           uint32_t* block_cvs,
+                                           int chunk_len_bytes,
+                                           uint64_t base_chunk_counter,
+                                           int total_chunks) {
+    // NUM_WARPS also stands for NUM_CHUNKS per block
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16
+
+    // 8 x u32 -> one chain value; we have `NUM_WARPS` chain values in total
+    // 8 x 4 x 16 = 512 B shared memory in sum
+    __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflicts
+
+    // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id;
+    const uint64_t chunk_counter = base_chunk_counter + global_warp_id;
+
+    // index
+    const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the first chunk of this warp
+    // each thread processes one chunk
+    const uint64_t chunk_idx = warp_chunk_base + lane_id;
+
+    // edge processing
+    int valid = total_chunks - warp_chunk_base;
+    if (valid <= 0) return; // TODO: will this affect warp shfl?
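+    // Note: `valid` depends only on warp_chunk_base, which is identical across all
+    // 32 lanes of a warp, so this early return always retires whole warps at once.
+    // The __shfl_down_sync / __ballot_sync calls below therefore only run in warps
+    // where every lane is still resident; whole-warp exits cannot desynchronize the
+    // shuffles (only per-lane divergence could).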
+ if (valid > WARP_SIZE) valid = WARP_SIZE; + + // compute idx for this thread + const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + + uint32_t cv[8] = {0}; // 8 x u32 + bool active = lane_id < valid; + if (active) { + const uint64_t chunk_counter = base_chunk_counter + chunk_idx; + blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + } + + // Stage 2: merging to parent, in O(logN) depth + + // take care: we cannot use general reduce + // 0-1-2-3-4-...-31, keep this sequential + unsigned mask = __ballot_sync(0xFFFFFFFFu, active); + // step = 1,2,4,8,16 + for (int step = 1; step < WARP_SIZE; step <<= 1) { + int partner_lane = lane_id + step; + + // neighbor cv + uint32_t neighbor_cv[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + } + + // the left be parent, and make sure `the right` is valid + if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { + blake3_parent_cv(cv, neighbor_cv, cv); + } + __syncwarp(mask); + + // in the next level, reduce half of active threads + if (lane_id >= (valid & ~(step))) + active = false; + } + + // now, lane 0 holds the root + if (lane_id == 0) { +#pragma unroll + for (int j = 0 ; j < 8; ++j) + cv_smem[warp_id][j] = cv[j]; + } + __syncthreads(); + + // after all these things, we have finished + // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge + // and now, we are going to implement higher-level merge + // we have 16 warps, each warp has a root cv + // so we are going to execute another logN steps + + // 16 -> 8 -> 4 -> 2 -> 1 + for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); +#pragma unroll + for (int j=0;j<8;++j) + cv_smem[warp_id][j] = p[j]; // write back to shared memory + } + __syncthreads(); + } + + // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep + if (tid == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv_smem[0][j]; + } + +} // blake3_block_reduce_kernel + + +// ============ Tiny kernel ============ +// In big kernel, it will consume 512 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root +// And this tiny kernel is designed to process these 2048 root +template +__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int n) { + extern __shared__ uint32_t smem[]; // dynamic shared memory + uint32_t* tile = smem; // -> [tile_n][8] + + const int tid = threadIdx.x; + const int b = blockIdx.x; + const int B = gridDim.x; + + const int start = (int)((1ll * n * b) / B); + const int end = (int)((1ll * n * (b+1)) / B); + int tile_n = end - start; + + if (tile_n <= 0) return; // border + + const int words = tile_n * 8; + for (int w = tid; w < words; w += NUM_THREADS) { + tile[w] = in_cv32[start * 8 + w]; + } + __syncthreads(); + + int cur = tile_n; + while (cur > 1) { + const int pairs = cur >> 1; // floor(cur/2) + // process pairs + for (int i = tid; i < pairs; i += NUM_THREADS) { + const uint32_t* L = &tile[(2*i) * 8]; + const uint32_t* R = &tile[(2*i+1) * 8]; + uint32_t p[8]; + blake3_parent_cv(L, R, p); +#pragma unroll + for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + } + __syncthreads(); + + // even situation: + if ((cur & 1) && tid == 0) { + uint32_t* dst = &tile[pairs * 8]; + uint32_t* src = &tile[(cur - 1) * 8]; +#pragma unroll + for (int j=0;j<8;++j) + dst[j] = src[j]; + } + __syncthreads(); + + cur = pairs + (cur & 1); + } + + // write output + if (tid == 0) { + uint32_t* out = &out_cv32[b * 8]; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = tile[j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + + uint8_t* d_bytes = nullptr; + uint32_t* d_words = nullptr; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // TODO: use thrust + cudaMalloc(&d_bytes, bytes_len); + cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); + d_words = 
reinterpret_cast(d_bytes); + + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + + // launch big kernel + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + const int B = (num_blocks >= 8) ? 8 : num_blocks; + uint32_t* d_midCV = nullptr; + cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); + + { + dim3 grid(B); + dim3 block(512); // 你指定 “每个 block 512 线程” + // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 + const int tile = (num_blocks + B - 1) / B; + const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); + + blake3_pair_reduce_kernel<512> + <<>>(d_blockCV, d_midCV, num_blocks); + CUDA_CHECK(cudaGetLastError()); + } + + // second round + uint32_t* d_root = nullptr; + cudaMalloc(&d_root, 8u * sizeof(uint32_t)); + + { + dim3 grid(1); + dim3 block(B); + const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); + + // generate kernel during compile time + switch (B) { + case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; + case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; + case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; + case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; + case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; + case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; + case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; + default: { + blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); + } break; + } + CUDA_CHECK(cudaGetLastError()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_root)); + CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/setup.py b/setup.py index 749c743..b2fee0d 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,12 @@ def locate_cuda(): COMMON_LIBS = ["cudart"] RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] +debug = False + CXX_FLAGS = [ - f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math" + f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" +] if not debug else [ + f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", 
"-ffast-math", "-march=native", "-mavx2", "-mfma" ] LINK_FLAGS = [] @@ -66,6 +70,12 @@ def locate_cuda(): "--expt-relaxed-constexpr", "--use_fast_math", "-lineinfo", +] + NVCC_ARCH_FLAGS if not debug else [ + f"-std=c++{CXX_STD}", + "-g", "-O0", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + "-G", "-lineinfo", ] + NVCC_ARCH_FLAGS if not sys.platform.startswith("win"): From fd142cc71197483ecb0bf74c10782931a2b7c259 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 21:30:54 +0800 Subject: [PATCH 04/20] update scores --- README.md | 4 ++-- benchmark/test_gpu.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 854f865..a99d68b 100644 --- a/README.md +++ b/README.md @@ -23,5 +23,5 @@ python benchmark/test_script.py # GPU kernel performance on File Compress hashing -+ 10.5 - v1 - [commit:`4f31a2c55551965a2e5f048565f021c4551554ae`]: 1709.34 MiB/s -+ 10.5 - v2 - []: 1961.36 MiB/s \ No newline at end of file ++ 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s ++ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 75fcef9..b009526 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -12,11 +12,18 @@ std_hex = blake3.blake3(data).hexdigest() +# 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 +for _ in range(2): + fh.blake3_gpu_sm80_hex(data) +torch.cuda.synchronize() + +# 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) +repeat = 5 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() -repeat = 20 -for i in range(repeat): +for _ in range(repeat): cv_hex = fh.blake3_gpu_sm80_hex(data) torch.cuda.synchronize() +t1 = time.perf_counter() t1 = time.perf_counter() elapsed = t1 - t0 From 329d5425de9558e9e43216c2b89bed43674b7d83 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Mon, 6 Oct 2025 12:19:27 +0800 Subject: [PATCH 05/20] update kernel to SOTA --- .gitignore | 3 +- benchmark/test_gpu.py | 26 +++--- csrc/binding.cpp | 50 ++++++----- csrc/blake3.h | 5 +- csrc/{blake3_sm80.cu => blake3_sm70.cu} | 37 +++++++-- setup.py | 105 ++++++------------------ 6 files changed, 104 insertions(+), 122 deletions(-) rename csrc/{blake3_sm80.cu => blake3_sm70.cu} (96%) diff --git a/.gitignore b/.gitignore index 5f07045..ee53a7a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ build/ dist/ *.egg-info/ -*.ncu-rep \ No newline at end of file +*.ncu-rep +*.nsys-rep \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index b009526..4e49107 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -1,34 +1,40 @@ +import torch import flashashing as fh import hashlib -import torch import time import blake3 GiB = 1024*1024*1024 # bytes -> 1 GiB -s = "A" * GiB # 1 GiB -data = s.encode("utf-8") -print(len(data)) # 1073741824 +cpu = torch.empty(GiB * 1, dtype=torch.uint8, pin_memory=True) +cpu[:] = ord('A') + +# 一次性 H2D(可重用) +d = torch.empty_like(cpu, device='cuda') +d.copy_(cpu, non_blocking=True) +torch.cuda.synchronize() + +stream = torch.cuda.current_stream().cuda_stream -std_hex = blake3.blake3(data).hexdigest() +# std_hex = blake3.blake3(data).hexdigest() # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 for _ in range(2): - fh.blake3_gpu_sm80_hex(data) + fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) repeat = 5 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): - cv_hex = fh.blake3_gpu_sm80_hex(data) + 
cv_hex = fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() t1 = time.perf_counter() t1 = time.perf_counter() elapsed = t1 - t0 -print(f"Elapsed time for f{repeat}x BLAKE3 (GPU SM80): {elapsed:.3f} seconds") -print(f"Throughput: {repeat * len(data) / elapsed / (1024**2):.2f} MiB/s") +print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds") +print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s") print("root CV (hex) =", cv_hex) -print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file +# print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 54bb3d6..d32b172 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -156,23 +156,29 @@ struct GilRelease { py::gil_scoped_release rel; }; -static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { - auto v = get_bytes_view(obj); - std::array root{}; - { - GilRelease _g; - blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); - } - std::string b = cv_words_to_bytes_le(root); - return py::bytes(b); -} +// static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { +// auto v = get_bytes_view(obj); +// std::array root{}; +// { +// GilRelease _g; +// blake3_block_reduce_sm70(v.ptr, static_cast(v.len), &root, /*stream=*/0); +// } +// std::string b = cv_words_to_bytes_le(root); +// return py::bytes(b); +// } + +static std::string blake3_gpu_root_hex(uint64_t device_ptr, + uint64_t nbytes, + uint64_t stream_int = 0) { + auto d_data = reinterpret_cast(device_ptr); + auto stream = reinterpret_cast(stream_int); + cudaSetDevice(0); + cudaFree(0); // attach -static std::string blake3_gpu_root_hex(py::object obj) { - auto v = get_bytes_view(obj); std::array root{}; { GilRelease _g; - blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); + blake3_block_reduce_sm70(d_data, nbytes, &root, stream); } std::string b = cv_words_to_bytes_le(root); return bytes_to_hex(reinterpret_cast(b.data()), b.size()); @@ -193,16 +199,16 @@ PYBIND11_MODULE(flashashing, m) { py::arg("data"), "Compute BLAKE3 hash (single-threaded)."); - m.def("blake3_gpu_sm80", - &blake3_gpu_root_cv_bytes, - py::arg("data"), - R"pbdoc( -Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. -NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. -)pbdoc"); - m.def("blake3_gpu_sm80_hex", +// m.def("blake3_gpu_sm70", +// &blake3_gpu_root_cv_bytes, +// py::arg("data"), +// R"pbdoc( +// Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. +// NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. +// )pbdoc"); + m.def("blake3_gpu_sm70_hex", &blake3_gpu_root_hex, - py::arg("data"), + py::arg("d_data"), py::arg("nbytes"), py::arg("stream")=0, R"pbdoc( Return the hex string of the *root chaining value* (CV) computed on GPU. 
)pbdoc"); diff --git a/csrc/blake3.h b/csrc/blake3.h index c1e27de..52e762a 100644 --- a/csrc/blake3.h +++ b/csrc/blake3.h @@ -28,6 +28,9 @@ std::string bytes_to_hex(const uint8_t *data, size_t len); } // namespace flashashing -void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + +// ============== GPU implementations ================ +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, std::array* root_out = nullptr, cudaStream_t stream = 0); \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm70.cu similarity index 96% rename from csrc/blake3_sm80.cu rename to csrc/blake3_sm70.cu index bbc2ae3..03a3969 100644 --- a/csrc/blake3_sm80.cu +++ b/csrc/blake3_sm70.cu @@ -602,7 +602,8 @@ inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out3 } // wrapper function -void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, std::array* root_out = nullptr, cudaStream_t stream = 0) { if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { @@ -611,6 +612,22 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, std::abort(); } + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + int optin = 0, deflt = 0; cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); @@ -625,6 +642,7 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, blake3_cv_block_reduce_kernel<32, 2048, 0>, cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 constexpr int NUM_THREADS = 512; // for big kernel @@ -633,16 +651,17 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks - - uint8_t* d_bytes = nullptr; - uint32_t* d_words = nullptr; // alias + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 - // TODO: use thrust - cudaMalloc(&d_bytes, bytes_len); - cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); - d_words = reinterpret_cast(d_bytes); + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
+ // TODO: use thrust cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB // ============= launch big kernel ============= @@ -739,5 +758,5 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, CUDA_CHECK(cudaFree(d_mid_out)); CUDA_CHECK(cudaFree(d_root_cv)); CUDA_CHECK(cudaFree(d_blockCV)); - CUDA_CHECK(cudaFree(d_bytes)); + // CUDA_CHECK(cudaFree(d_bytes)); } \ No newline at end of file diff --git a/setup.py b/setup.py index b2fee0d..e047cc4 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,9 @@ # setup.py from setuptools import setup from setuptools.command.build_ext import build_ext -from pybind11.setup_helpers import Pybind11Extension import pybind11, numpy as np import sys, os, shutil +from torch.utils.cpp_extension import BuildExtension, CUDAExtension def find_in_path(name, path): for d in path.split(os.pathsep): @@ -84,93 +84,40 @@ def locate_cuda(): # MSVC 的 NVCC 透传 NVCC_FLAGS += ["-Xcompiler", "/openmp", "-Xcompiler", "/MD", "-Xcompiler", "/O2"] -# give .cu to nvcc -from distutils.unixccompiler import UnixCCompiler - -class BuildExtNVCC(build_ext): - def build_extensions(self): - - self.compiler.src_extensions.append(".cu") - - original_compile = self.compiler.compile - - def nvcc_compile(sources, output_dir=None, macros=None, include_dirs=None, - debug=0, extra_preargs=None, extra_postargs=None, depends=None): - cxx_sources, cu_sources = [], [] - for s in sources: - (cu_sources if os.path.splitext(s)[1] == ".cu" else cxx_sources).append(s) - - objects = [] - if cxx_sources: - - postargs = extra_postargs.get("cxx", []) if isinstance(extra_postargs, dict) else extra_postargs - objects += original_compile( - cxx_sources, output_dir, macros, include_dirs, debug, extra_preargs, postargs, depends - ) - - if cu_sources: - - for src in cu_sources: - obj = self.compiler.object_filenames([src], output_dir=output_dir)[0] - cmd = [CUDA["nvcc"], "-c", src, "-o", obj] + NVCC_FLAGS - - incs = include_dirs or [] - for inc in incs: - cmd += ["-I", inc] - - if macros: - for m in macros: - if isinstance(m, tuple): - name, val = m - cmd += ["-D%s=%s" % (name, val)] - else: - cmd += ["-D%s" % m] - - if isinstance(extra_postargs, dict): - cmd += extra_postargs.get("nvcc", []) - elif extra_postargs: - cmd += extra_postargs - - os.makedirs(os.path.dirname(obj), exist_ok=True) - self.spawn(cmd) - objects.append(obj) - return objects - - self.compiler.compile = nvcc_compile - - for ext in self.extensions: - if not sys.platform.startswith("win"): - ext.runtime_library_dirs = list(set((ext.runtime_library_dirs or []) + RPATH)) - build_ext.build_extensions(self) - # ---------- 扩展模块 ---------- sources = [ "csrc/sha256_base.cpp", "csrc/sha256_simd.cpp", "csrc/blake3_base.cpp", - "csrc/blake3_sm80.cu", + "csrc/blake3_sm70.cu", "csrc/binding.cpp", ] -ext = Pybind11Extension( - "flashashing", - sources=sources, - include_dirs=COMMON_INCLUDES, - library_dirs=COMMON_LIB_DIRS, - libraries=COMMON_LIBS, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": [] - }, - extra_link_args=LINK_FLAGS, - define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES], -) +# ext = Pybind11Extension( +# "flashashing", +# sources=sources, +# include_dirs=COMMON_INCLUDES, +# library_dirs=COMMON_LIB_DIRS, +# libraries=COMMON_LIBS, +# extra_compile_args={ +# "cxx": CXX_FLAGS, +# "nvcc": [] +# }, +# extra_link_args=LINK_FLAGS, +# define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES], +# ) setup( name="flashashing", - 
version="0.1.0", - description="High performance hashing (SHA-256, BLAKE3) with CUDA + pybind11", - ext_modules=[ext], - cmdclass={"build_ext": BuildExtNVCC}, - zip_safe=False, + ext_modules=[ + CUDAExtension( + "flashashing", + sources=sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + ) + ], + cmdclass={"build_ext": BuildExtension}, ) From 9781ad9759d13e0e482162d57c68b65c946f4ff9 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Mon, 6 Oct 2025 17:13:41 +0800 Subject: [PATCH 06/20] v4 kernel --- .gitignore | 6 +- README.md | 4 +- backup_deprecated/blake3_sm70_v1.cu | 765 ++++++++++++++++++ {csrc => backup_deprecated}/blake3_sm80_v1.cu | 0 csrc/blake3_sm70.cu | 419 +++++++--- 5 files changed, 1074 insertions(+), 120 deletions(-) create mode 100644 backup_deprecated/blake3_sm70_v1.cu rename {csrc => backup_deprecated}/blake3_sm80_v1.cu (100%) diff --git a/.gitignore b/.gitignore index ee53a7a..47c25b7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,8 @@ dist/ *.egg-info/ *.ncu-rep -*.nsys-rep \ No newline at end of file +*.nsys-rep + +*.ptx + +*.o \ No newline at end of file diff --git a/README.md b/README.md index a99d68b..c22203f 100644 --- a/README.md +++ b/README.md @@ -24,4 +24,6 @@ python benchmark/test_script.py # GPU kernel performance on File Compress hashing + 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s -+ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s \ No newline at end of file ++ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s ++ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s ++ 10.6 - v4 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v1.cu b/backup_deprecated/blake3_sm70_v1.cu new file mode 100644 index 0000000..b170812 --- /dev/null +++ b/backup_deprecated/blake3_sm70_v1.cu @@ -0,0 +1,765 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 
8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? (1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
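+    // Caveat: 64 KiB of statically declared __shared__ exceeds the 48 KiB static
+    // allocation cap CUDA enforces per block; a footprint this large normally has
+    // to be dynamic shared memory, opted in via
+    // cudaFuncAttributeMaxDynamicSharedMemorySize (as the wrapper below already
+    // does for the tiny reduce kernels).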
+    __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256]
+
+    const int tid = threadIdx.x;
+    const int bx = blockIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;   // 256
+    constexpr int VEC_ELEMS = 4; // uint4
+    constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1 KiB)
+    constexpr int WARPS_PER_CTA = NUM_WARPS; // 16
+
+    // ============== STAGE 1: Coalesced Global Memory Loading ==============
+    const int tile_id = blockIdx.x;
+    const int tile_base = tile_id * CHUNKS_PER_BLOCK; // first chunk this block loads
+
+    int valid_chunks = total_chunks - tile_base;
+    if (valid_chunks <= 0) {
+        return; // overflow
+    }
+    if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK;
+
+    for (int ldt = 0; ldt < 4; ldt++) {
+        // each warp loads 4 chunks
+        int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> chunk to load
+        int chunk_global = tile_base + chunk_local;      // global chunk idx
+
+        // the pointer into shared memory
+        uint32_t* s_u32 = &chunk_smem[chunk_local][0];
+
+        // read from global memory only when the chunk is valid;
+        // otherwise fill the row with zeros
+        if (chunk_local < valid_chunks) {
+            const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK;
+
+            // move 16 bytes -> 128 bits at a time;
+            // each thread loads 2 x 16 bytes,
+            // so 32 threads load 32 x 2 x 16 = 1024 B
+            const uint4* __restrict__ g4 = reinterpret_cast<const uint4*>(g_u32);
+            uint4* __restrict__ s4 = reinterpret_cast<uint4*>(s_u32);
+
+            // idx = lane_id (0..31) and lane_id+32 (32..63)
+            int idx0 = lane_id;              // 0..31
+            int idx1 = lane_id + WARP_SIZE;  // 32..63
+
+            // thread 0  -> 0, 32
+            // thread 1  -> 1, 33
+            // ...
+            // thread 31 -> 31, 63
+            // so the global memory access is coalesced
+
+            // note: the indices are in units of 16-byte uint4 elements, so
+            // thread 0 covers bytes [0, 16) and [32*16, 32*16 + 16),
+            // thread 1 covers bytes [16, 32) and [33*16, 33*16 + 16), etc.
+
+            uint4 v0 = g4[idx0]; // still, this step loads from gmem, in 4-element (16 B) alignment.
+ uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { + const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); + }; + + uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; + bool have_pass0 = false, have_pass1 = false; + + // ------- STAGE 2 pass = 0 ------- + { + const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 + const int right = left + 1; // neighbor: 1,3,5,...,31 + const int pair_idx = left >> 1; // 0..15 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && left < pass0_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && right < pass0_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // merge two neighbor + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && left < pass0_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + // ---- STAGE 2 pass 1 ---- + { + const int left = 32 + (warp_id << 1); // 32,34,...,62 + const int right = left + 1; // 33,35,...,63 + const int pair_idx = left >> 1; // 16..31 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && (left - 32) < pass1_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && (right - 32) < pass1_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // TODO: here we may have some issue: overflow and border situation + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && (left - 32) < pass1_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // write 
to the right position + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + __syncthreads(); + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
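+    // Alignment note: in_cv32 comes from cudaMalloc (at least 256 B aligned) and
+    // each CV is a 32 B record, so the two uint4 loads per CV below are always
+    // 16 B aligned on the global side. The shared-memory side is written
+    // word-by-word instead, because a nonzero PAD makes each row (8 + PAD) words
+    // wide, and a row start is then no longer guaranteed to be 16 B aligned for
+    // uint4 stores.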
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
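+    // Shape of the whole reduction for a 1 GiB input (from the constants above and
+    // the tiny-kernel launches below, CHUNKS_PER_BLOCK = 64 and TILE = 2048):
+    // 2^30 / 1024 = 1,048,576 leaf chunks; 64 chunks per block -> 16,384 block CVs;
+    // tiny-kernel round 1 reduces 16,384 / 2048 tiles to 8 CVs; round 2 reduces
+    // those 8 to the single root CV.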
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/csrc/blake3_sm80_v1.cu b/backup_deprecated/blake3_sm80_v1.cu similarity index 100% rename from csrc/blake3_sm80_v1.cu rename to backup_deprecated/blake3_sm80_v1.cu diff --git a/csrc/blake3_sm70.cu b/csrc/blake3_sm70.cu index 03a3969..4a98b49 100644 --- a/csrc/blake3_sm70.cu +++ b/csrc/blake3_sm70.cu @@ -22,6 +22,30 @@ __host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u }; +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool 
is_root_chunk = false) { + uint32_t f = 0; + if (block_idx_in_chunk == 0) f |= FLAG_CHUNK_START; + if (block_idx_in_chunk == nblocks_in_chunk - 1) f |= FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + // ---- 小工具 ---- __host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { #if defined(__CUDA_ARCH__) @@ -82,37 +106,206 @@ __host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16 out_cv[i] = st[i] ^ st[8+i]; } -// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV -// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 -__device__ void blake3_leaf_cv(const uint32_t* chunk_words, - int chunk_len_bytes, - uint64_t chunk_counter, - uint32_t out_cv[8]) +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) { - uint32_t cv[8]; - // 初始 cv = IV -#pragma unroll - for (int i = 0; i < 8; ++i) - cv[i] = BLAKE3_IV[i]; + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} - const int nblocks = (chunk_len_bytes + 63) / 64; // ceil +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. 
for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 for (int b = 0; b < nblocks; ++b) { - uint32_t st[16]; - const uint32_t* block = chunk_words + b*16; - const int remain = chunk_len_bytes - b*64; - const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? flags : 0u; + + // ===== 7 rounds ===== + #pragma unroll 1 // 不要unroll + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + int gi = (li & 3); // 0..3 + uint32_t mx = msg_rk(m_lane, r, 2*gi + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } - const uint32_t flags = - ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) - ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (示意:bit1) + // ---- 对角步 ---- + { + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); - blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); - blake3_state_to_cv(st, cv); + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + int gi = (li_diag & 3); // 0..3 + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + int li_undo = (rq << 2) | ((q - rq) & 3); + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) } -#pragma unroll - for (int i = 0; i < 8; ++i) - out_cv[i] = cv[i]; + // 由 lane0 / lane16 收集 8×u32 输出 + + // This will trigger problem! + // if (li == 0) { // only thread 0 and thread 16 will do this. + // #pragma unroll + // for (int j = 0; j < 8; ++j) { + // out_cv[j] = __shfl_sync(mask16, cv_word, j, 16); + // } + // } + + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } } __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ @@ -135,8 +328,8 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 template // pad shared memory + const int PAD_CHUNK=0, + const int PAD_CV=0> // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, int chunk_len_bytes, @@ -212,10 +405,15 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. 
uint4 v1 = g4[idx1]; - s_u32[4*idx0 + 0] = v0.x; s_u32[4*idx0 + 1] = v0.y; // when load into shared mem, do manually - s_u32[4*idx0 + 2] = v0.z; s_u32[4*idx0 + 3] = v0.w; - s_u32[4*idx1 + 0] = v1.x; s_u32[4*idx1 + 1] = v1.y; - s_u32[4*idx1 + 2] = v1.z; s_u32[4*idx1 + 3] = v1.w; + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; } else { uint4* s4 = reinterpret_cast(s_u32); int idx0 = lane_id; @@ -237,103 +435,92 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, const int parents1 = (pass1_valid + 1) >> 1; parents_count = parents0 + parents1; // ≤ 32 } - // __syncthreads(); - - auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { - const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 - const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); - blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); - }; - - uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; - bool have_pass0 = false, have_pass1 = false; - - // ------- pass = 0 ------- - { - const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 - const int right = left + 1; // neighbor: 1,3,5,...,31 - const int pair_idx = left >> 1; // 0..15 - - uint32_t left_cv[8], right_cv[8]; - - bool have_left = false, have_right = false; + __syncthreads(); - if (lane_id == 0 && left < pass0_valid) { - compute_leaf_cv_from_row(left, left_cv); - have_left = true; - } - if (lane_id == 1 && right < pass0_valid) { - compute_leaf_cv_from_row(right, right_cv); - have_right = true; + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. 
+ const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); } - // merge two neighbor - unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); - if (lane_id == 0 && left < pass0_valid) { + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; uint32_t parent[8]; - if (have_right) { - uint32_t rcv[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly #pragma unroll - for (int j = 0; j < 8; ++j) - rcv[j] = __shfl_sync(mask, right_cv[j], 1); - blake3_parent_cv(left_cv, rcv, parent); - } else { - #pragma unroll - for (int j = 0; j < 8; ++j) - parent[j] = left_cv[j]; // 奇数晋级 + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; } - // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id #pragma unroll - for (int j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) cv_smem[pair_idx][j] = parent[j]; } - __syncwarp(); - } - // ---- pass 1 ---- - { - const int left = 32 + (warp_id << 1); // 32,34,...,62 - const int right = left + 1; // 33,35,...,63 - const int pair_idx = left >> 1; // 16..31 + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass - uint32_t left_cv[8], right_cv[8]; + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); - bool have_left = false, have_right = false; + // if (bx == 0) printf("Finish 1 big pass\n"); - if (lane_id == 0 && (left - 32) < pass1_valid) { - compute_leaf_cv_from_row(left, left_cv); - have_left = true; - } - if (lane_id == 1 && (right - 32) < pass1_valid) { - compute_leaf_cv_from_row(right, right_cv); - have_right = true; - } + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); - // TODO: here we may have some issue: overflow and border situation - unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); - if (lane_id == 0 && (left - 32) < pass1_valid) { - uint32_t parent[8]; - if (have_right) { - uint32_t rcv[8]; - #pragma unroll - for (int j = 0; j < 8; ++j) - rcv[j] = __shfl_sync(mask, right_cv[j], 1); - blake3_parent_cv(left_cv, rcv, parent); - } else { - #pragma unroll - for (int j = 0; j < 8; ++j) - parent[j] = left_cv[j]; // 奇数晋级 - } - // write to the right position - #pragma unroll - for (int j = 0; j < 8; ++j) - cv_smem[pair_idx][j] = parent[j]; - } - __syncwarp(); - } + // if (bx == 0) printf("Finish 2 big pass\n"); __syncthreads(); + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. 
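+    // A sketch of the pairwise merge used in STAGE 3 below (same names as the
+    // real code, which additionally masks inactive lanes and handles odd counts):
+    //   for (int step = 1; step < WARP_SIZE; step <<= 1) {
+    //       uint32_t nbr[8];
+    //       for (int j = 0; j < 8; ++j)
+    //           nbr[j] = __shfl_down_sync(mask, cv[j], step);  // right sibling
+    //       if ((lane_id % (step << 1)) == 0)
+    //           blake3_parent_cv(cv, nbr, cv);                 // parent(L, R)
+    //   }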
+
+    // ============== STAGE 3: Block-Reduce ==============
+    // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv
+    // we will only use warp 0 to handle this
@@ -391,16 +578,14 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
     }
 } // blake3_block_reduce_kernel
 
-__device__ __forceinline__
-void load_cv_g2r(const uint32_t* g, uint32_t r[8]) {
+__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) {
     const uint4* g4 = reinterpret_cast<const uint4*>(g);
     uint4 a = g4[0], b = g4[1];
     r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w;
     r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w;
 }
 
-__device__ __forceinline__
-void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
+__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     uint4* g4 = reinterpret_cast<uint4*>(g);
     g4[0] = make_uint4(r[0],r[1],r[2],r[3]);
     g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
@@ -585,8 +770,6 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     }
 }
 
-constexpr uint32_t FLAG_ROOT = 8;
-
 inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
     const uint32_t zero_block[16] = {0};
     uint32_t st[16];
@@ -652,7 +835,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data,
     const uint64_t total_chunks = bytes_len / CHUNK_SIZE;
     const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks
     CUDA_CHECK(cudaFuncSetAttribute(
-        blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE>,
+        blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE, 0, 0>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 100));
 
     uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
@@ -669,7 +852,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data,
     dim3 block_big(NUM_THREADS);
     uint64_t base_chunk_counter = 0ull;
 
-    blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE>
+    blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE, 0, 0>
         <<<grid_big, block_big>>>(
             d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks);
 
From cdae83c6be9184fd6d9949f29578f88b12f76d27 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Mon, 6 Oct 2025 18:48:58 +0800
Subject: [PATCH 07/20] resolve bank conflict

---
 README.md                           |   6 +-
 backup_deprecated/blake3_sm70_v1.cu | 119 ++--
 backup_deprecated/blake3_sm70_v2.cu | 939 ++++++++++++++++++++++++++++
 benchmark/test_gpu.py               |   2 +-
 csrc/blake3_sm70.cu                 |  52 +-
 5 files changed, 1046 insertions(+), 72 deletions(-)
 create mode 100644 backup_deprecated/blake3_sm70_v2.cu

diff --git a/README.md b/README.md
index c22203f..0945c67 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,9 @@ python benchmark/test_script.py
 
 # GPU kernel performance on File Compress hashing
 
+> These logs were measured on an RTX 4090 Laptop GPU, where compute and memory bandwidth are limited; a later section shows the same benchmark on other machines.
+ + 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s -+ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s -+ 10.6 - v4 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s \ No newline at end of file ++ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) ++ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v1.cu b/backup_deprecated/blake3_sm70_v1.cu index b170812..867b6af 100644 --- a/backup_deprecated/blake3_sm70_v1.cu +++ b/backup_deprecated/blake3_sm70_v1.cu @@ -31,6 +31,18 @@ __host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { #endif } +#define B3_G(a,b,c,d, mx, my) \ + do { \ + a = a + b + (mx); \ + d = rotr32(d ^ a, 16); \ + c = c + d; \ + b = rotr32(b ^ c, 12); \ + a = a + b + (my); \ + d = rotr32(d ^ a, 8); \ + c = c + d; \ + b = rotr32(b ^ c, 7); \ + } while (0) + __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { #if defined(__CUDA_ARCH__) const uint4 v = *reinterpret_cast(src); @@ -40,6 +52,23 @@ __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t ds #endif } +__host__ __device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + __host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B -> shared memory const uint32_t cv[8], // 8×u32 -> shared memory @@ -48,31 +77,49 @@ __host__ __device__ void blake3_compress_words_7r( uint32_t flags, // CHUNK_START/END/PARENT/ROOT… uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) { - // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 - // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[i] = cv[i]; -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[8+i] = BLAKE3_IV[i]; - - out_state[12] ^= (uint32_t)chunk_counter; - out_state[13] ^= (uint32_t)(chunk_counter >> 32); - out_state[14] ^= block_len; - out_state[15] ^= flags; - - // so far, the block_words are still pointers. 
- // now we load it into kernel, as pointed out by ncu profile - uint32_t block_reg_1[4]; - -#pragma unroll - for (int i = 0; i < 16; i += 4) { // the gap is 4 - // load_u128_u32x4(block_words + i, block_reg_1); - out_state[i] ^= block_words[i]; - // 做一点点搅动(占位) - out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + // v[0..7]=cv;v[8..11]=IV;v[12..15]=IV^t0^t1^b^flags + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; + + uint32_t v8 = BLAKE3_IV[0], v9 = BLAKE3_IV[1]; + uint32_t v10= BLAKE3_IV[2], v11 = BLAKE3_IV[3]; + uint32_t v12= BLAKE3_IV[4] ^ (uint32_t)chunk_counter; + uint32_t v13= BLAKE3_IV[5] ^ (uint32_t)(chunk_counter >> 32); + uint32_t v14= BLAKE3_IV[6] ^ block_len; + uint32_t v15= BLAKE3_IV[7] ^ flags; + + // 7 轮 +#pragma unroll 1 + for (int r = 0; r < 7; ++r) { + // 取本轮 16 个消息字(按 BLAKE3 调度表) + const uint8_t* SR = B3_MSG_SCHEDULE[r]; // 0..15 + const uint32_t m0 = block_words[SR[0]], m1 = block_words[SR[1]]; + const uint32_t m2 = block_words[SR[2]], m3 = block_words[SR[3]]; + const uint32_t m4 = block_words[SR[4]], m5 = block_words[SR[5]]; + const uint32_t m6 = block_words[SR[6]], m7 = block_words[SR[7]]; + const uint32_t m8 = block_words[SR[8]], m9 = block_words[SR[9]]; + const uint32_t m10 = block_words[SR[10]], m11 = block_words[SR[11]]; + const uint32_t m12 = block_words[SR[12]], m13 = block_words[SR[13]]; + const uint32_t m14 = block_words[SR[14]], m15 = block_words[SR[15]]; + + // 列步:四个 G + B3_G(v0, v4, v8, v12, m0, m1); + B3_G(v1, v5, v9, v13, m2, m3); + B3_G(v2, v6, v10, v14, m4, m5); + B3_G(v3, v7, v11, v15, m6, m7); + + // 对角步:四个 G + B3_G(v0, v5, v10, v15, m8, m9); + B3_G(v1, v6, v11, v12, m10, m11); + B3_G(v2, v7, v8, v13, m12, m13); + B3_G(v3, v4, v9, v14, m14, m15); } + + // 输出 16×u32 状态(后续由调用者做 state_to_cv) + out_state[0]=v0; out_state[1]=v1; out_state[2]=v2; out_state[3]=v3; + out_state[4]=v4; out_state[5]=v5; out_state[6]=v6; out_state[7]=v7; + out_state[8]=v8; out_state[9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12;out_state[13]=v13;out_state[14]=v14; out_state[15]=v15; } // 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) @@ -615,22 +662,6 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, std::abort(); } - // int dev = -1; - // cudaGetDevice(&dev); - // printf("[dbg] my runtime current device = %d\n", dev); - - // cudaPointerAttributes attr{}; - // auto st = cudaPointerGetAttributes(&attr, d_data); - // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", - // (int)st, (int)attr.type, attr.device); - - // cudaPointerAttributes attr{}; - // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); - // if (attr.type != cudaMemoryTypeDevice) { - // fprintf(stderr, "d_data is not device memory!\n"); - // std::abort(); - // } - int optin = 0, deflt = 0; cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); @@ -654,8 +685,12 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 0; + constexpr int pad_cv = 0; + CUDA_CHECK(cudaFuncSetAttribute( - blake3_block_reduce_kernel, + blake3_block_reduce_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100)); uint8_t* d_bytes = const_cast(d_data); @@ -672,7 +707,7 @@ void 
blake3_block_reduce_sm70(const uint8_t* d_data, dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); diff --git a/backup_deprecated/blake3_sm70_v2.cu b/backup_deprecated/blake3_sm70_v2.cu new file mode 100644 index 0000000..fc418c2 --- /dev/null +++ b/backup_deprecated/blake3_sm70_v2.cu @@ -0,0 +1,939 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. 
+ // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. 
for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? flags : 0u; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t 
out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. 
+ uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); + } + + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + uint32_t parent[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly + #pragma unroll + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; + } + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + + __syncwarp(); // NOTICE: this is necessary! 
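+        // (A plausible reading of the NOTICE above: lane 0 of the warp has just
+        // written cv_smem while lanes 1..31 ran ahead; __syncwarp() reconverges
+        // the warp and orders those shared-memory writes before the second
+        // do_big_pass call re-enters the shuffle-heavy leaf code. Visibility to
+        // *other* warps still comes from the __syncthreads() after both passes.)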
+ }; // do_big_pass + + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); + + // if (bx == 0) printf("Finish 1 big pass\n"); + + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); + + // if (bx == 0) printf("Finish 2 big pass\n"); + + __syncthreads(); + + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
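+    // A sketch of one way to follow the note above (hypothetical `s_blockCV` /
+    // `s_cap` statics, not part of this file): cache the scratch buffer across
+    // calls so the cudaMalloc below is paid only when the input grows.
+    //   static uint32_t* s_blockCV = nullptr; static size_t s_cap = 0;
+    //   const size_t need = (size_t)num_blocks * 8u * sizeof(uint32_t);
+    //   if (s_cap < need) { cudaFree(s_blockCV);
+    //                       CUDA_CHECK(cudaMalloc(&s_blockCV, need)); s_cap = need; }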
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 4e49107..a8ecbd1 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -24,7 +24,7 @@ torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) -repeat = 5 # 1GiB × 5 已经很重,按机器调整 +repeat = 3 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): cv_hex = fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) diff --git a/csrc/blake3_sm70.cu b/csrc/blake3_sm70.cu index 4a98b49..bb70888 100644 --- a/csrc/blake3_sm70.cu +++ b/csrc/blake3_sm70.cu @@ -35,8 +35,8 @@ enum : uint32_t { __device__ __noinline__ uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { uint32_t f = 0; - if (block_idx_in_chunk == 0) f |= FLAG_CHUNK_START; - if 
(block_idx_in_chunk == nblocks_in_chunk - 1) f |= FLAG_CHUNK_END; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root return f; } @@ -231,8 +231,16 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row v ^= (li == 14) ? block_len: 0u; v ^= (li == 15) ? flags : 0u; + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + // ===== 7 rounds ===== - #pragma unroll 1 // 不要unroll + #pragma unroll 4 for (int r = 0; r < 7; ++r) { // inside this loop, each lane will do one job // 16 lane will execute 16 x 2 operations @@ -246,20 +254,15 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) - int gi = (li & 3); // 0..3 - uint32_t mx = msg_rk(m_lane, r, 2*gi + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 2*gi + 1, mask16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); v = G_update_role(v, vb, vc, vd, mx, my, role); } // ---- 对角步 ---- { - // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) - int q = (li & 3); - int rq = (li >> 2); - int li_diag = (rq << 2) | ((q + rq) & 3); - // 在“对角置换域”取到当前 v 值 uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); @@ -269,14 +272,14 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) - int gi = (li_diag & 3); // 0..3 - uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 8 + 2*gi + 1, mask16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) - int li_undo = (rq << 2) | ((q - rq) & 3); + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); } @@ -292,15 +295,6 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row } // 由 lane0 / lane16 收集 8×u32 输出 - - // This will trigger problem! - // if (li == 0) { // only thread 0 and thread 16 will do this. 
- // #pragma unroll - // for (int j = 0; j < 8; ++j) { - // out_cv[j] = __shfl_sync(mask16, cv_word, j, 16); - // } - // } - #pragma unroll for (int j = 0; j < 8; ++j) { uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 @@ -328,7 +322,7 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 template // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, @@ -834,8 +828,12 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr int pad_cv = 0; + CUDA_CHECK(cudaFuncSetAttribute( - blake3_block_reduce_kernel, + blake3_block_reduce_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100)); uint8_t* d_bytes = const_cast(d_data); @@ -852,7 +850,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); From 4be8258f5e82aab4e57e8b70a604ebb9361d8aa0 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Mon, 6 Oct 2025 19:46:55 +0800 Subject: [PATCH 08/20] tune size --- README.md | 3 ++- csrc/blake3_sm70.cu | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0945c67..576c3e7 100644 --- a/README.md +++ b/README.md @@ -28,4 +28,5 @@ python benchmark/test_script.py + 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s + 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) -+ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s \ No newline at end of file ++ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s ++ 10.6 - v5 - [commit:]: 34261.37 MiB/s \ No newline at end of file diff --git a/csrc/blake3_sm70.cu b/csrc/blake3_sm70.cu index bb70888..e61b85b 100644 --- a/csrc/blake3_sm70.cu +++ b/csrc/blake3_sm70.cu @@ -331,7 +331,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, int total_chunks) { // NUM_WARPS also stands for NUM_CHUKNS per block constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 - constexpr int CHUNKS_PROCEED = 64; + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); @@ -820,11 +820,11 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - constexpr int NUM_THREADS = 512; // for big kernel - constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 - constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const 
uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks From 4c388a3a80fdb4c59f9c96da9dac613971b27cd4 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Tue, 7 Oct 2025 13:34:52 +0800 Subject: [PATCH 09/20] finish basic sm70_sm80 dev --- README.md | 5 +- backup_deprecated/blake3_sm70_v2.cu | 18 +- .../blake3_sm80_v3.cu | 151 ++- benchmark/perf.txt | 1 + benchmark/test_gpu.py | 8 +- csrc/binding.cpp | 4 +- csrc/blake3.h | 2 +- csrc/blake3_sm70_sm80.cu | 993 ++++++++++++++++++ requirements.txt | 2 + setup.py | 65 +- 10 files changed, 1168 insertions(+), 81 deletions(-) rename csrc/blake3_sm70.cu => backup_deprecated/blake3_sm80_v3.cu (90%) create mode 100644 benchmark/perf.txt create mode 100644 csrc/blake3_sm70_sm80.cu diff --git a/README.md b/README.md index 576c3e7..c435acf 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,9 @@ Proposal (from Google Drive): https://docs.google.com/document/d/10B3_nT8xF49vLg conda create -n ceg5206 python=3.12 pip install -r requirements.txt +# clone the cutlass repository +git clone https://github.com/NVIDIA/cutlass.git ~/cutlass --depth 1 + # install the cpp source file python setup.py install @@ -29,4 +32,4 @@ python benchmark/test_script.py + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s + 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) + 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s -+ 10.6 - v5 - [commit:]: 34261.37 MiB/s \ No newline at end of file ++ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v2.cu b/backup_deprecated/blake3_sm70_v2.cu index fc418c2..e61b85b 100644 --- a/backup_deprecated/blake3_sm70_v2.cu +++ b/backup_deprecated/blake3_sm70_v2.cu @@ -322,7 +322,7 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 template // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, @@ -331,7 +331,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, int total_chunks) { // NUM_WARPS also stands for NUM_CHUKNS per block constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 - constexpr int CHUNKS_PROCEED = 64; + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); @@ -820,16 +820,20 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - constexpr int NUM_THREADS = 512; // for big kernel - constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 - constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr 
int pad_cv = 0; + CUDA_CHECK(cudaFuncSetAttribute( - blake3_block_reduce_kernel, + blake3_block_reduce_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100)); uint8_t* d_bytes = const_cast(d_data); @@ -846,7 +850,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); diff --git a/csrc/blake3_sm70.cu b/backup_deprecated/blake3_sm80_v3.cu similarity index 90% rename from csrc/blake3_sm70.cu rename to backup_deprecated/blake3_sm80_v3.cu index e61b85b..fdf1716 100644 --- a/csrc/blake3_sm70.cu +++ b/backup_deprecated/blake3_sm80_v3.cu @@ -1,10 +1,22 @@ +#include "cute/numeric/int.hpp" #include #include -#include #include #include +#include +#include +#include + +#if __CUDA_ARCH__ >= 800 + #include +#endif + +#include +#include +#include + #define WARP_SIZE 32 #define LDST128BITS(value) (reinterpret_cast(&(value))[0]) @@ -17,6 +29,20 @@ } \ } while(0) + +using namespace cute; + +using vec_t = cute::uint128_t; // one time loading 16 B + +#if __CUDA_ARCH__ >= 800 + // SM80 branch + using Atom = cute::Copy_Atom, vec_t>; +#else + // Volta/Turing(SM70/75)general copy + using Atom = cute::Copy_Atom, vec_t>; + // 也可用更保守的:using Atom = cute::Copy_Atom, vec_t>; +#endif + __host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u @@ -317,11 +343,13 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 blake3_state_to_cv(st, out_cv); } + + // ============ Big kernel: 16 WARPS in total ============ // grid: (chunks / 64), thread: (512,) -template 32 KB const int PAD_CHUNK=16, const int PAD_CV=0> // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, @@ -340,19 +368,19 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, // 8 x 4 x 64 = 2 KiB shared memory in sum __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4, 16B + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
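+    // Why the padding matters here (assuming 32 four-byte shared-memory banks):
+    // with a bare 256-word row stride, lanes li and li+16 read the same word
+    // offset of two adjacent rows and hit the same bank; PAD_CHUNK=16 makes the
+    // stride 272 ≡ 16 (mod 32), shifting the second sub-warp onto the other 16 banks.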
- __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK]; // [64][256+PAD] const int tid = threadIdx.x; const int bx = blockIdx.x; const int warp_id = tid / WARP_SIZE; const int lane_id = tid % WARP_SIZE; - constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - constexpr int VEC_ELEMS = 4; // uint4 - constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) - constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 - // ============== STAGE 1: Coalsced Global Memory Loading ============== const int tile_id = blockIdx.x; const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading @@ -363,59 +391,82 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + // use CuTe for loading + + auto make_smem_tensor = [&](int chunk_local /* The chunk id on shared memory */) { + // smem index:chunk_smem[chunk_local][0] + // length: (WORDS_PER_CHUNK + PAD_CHUNK) + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + vec_t* s_vec = reinterpret_cast(s_u32); // each element is 16B + return make_tensor(make_smem_ptr(s_vec), + make_shape(Int{}), // 64 x 4 = 256, one row of shared memory + make_stride(Int<1>{})); // continuous + }; + + // shared memory 256 items = 64 x vec_t(4) + // thread layout: 32 threads + // thread 0: 0, 32 + // thread 1: 1, 33 + // thread 32: 31,63 + // so we load data in coalsced mode + using Atom = cute::Copy_Atom, vec_t>; + + // thread layout. (lane,0)->idx=2*lane,(lane,1)->idx=2*lane+1 + auto thr_layout = make_layout( + make_shape(Int<2>{}), + make_stride(Int<1>{}) + ); + + auto tile_layout = make_layout( + make_shape(Int{}, Int<2>{}), // (32,2) + make_stride(Int<1>{}, Int{}) // idx = lane + k2*32 + ); + + // build the real copy + TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout); + + // this sentence will load + // auto thr_copy = local_partition(tcopy, lane_id); + + for (int ldt = 0; ldt < 4; ldt++) { // each warp load 4 chunks int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk int chunk_global = tile_base + chunk_local; // global chunk idx // the pointer for shared memory - uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + auto smem_vec1d = make_smem_tensor(chunk_local); // 64 x uint128_t // only read from global, when it's valid // or, we fill it with 0 - if (chunk_local < valid_chunks) { - const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; - - // move 16 bytes -> 128 bits each time - // each thread will load 2 x 16 bytes - // so 32 threads will load 32 x 2 x 16 = 1024 B - const uint4* __restrict__ g4 = reinterpret_cast(g_u32); - uint4* __restrict__ s4 = reinterpret_cast(s_u32); - - // idx = lane_id (0..31) 与 lane_id+32 (32..63) - int idx0 = lane_id; // 0..31 - int idx1 = lane_id + WARP_SIZE; // 32..63 - - // thread 0 -> 0, 32 - // thread 1 -> 1, 33 - // ... - // thread 31 -> 31, 63 - // so the global memory access is coalsced - - // notice, we load 16 bytes a time. the index is compressed - // tid 0 -> 0, tid 1 -> 16 - // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 - - uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. 
- uint4 v1 = g4[idx1]; - - s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually - s_u32[4*idx0 + 1] = v0.y; - s_u32[4*idx0 + 2] = v0.z; - s_u32[4*idx0 + 3] = v0.w; - - s_u32[4*idx1 + 0] = v1.x; - s_u32[4*idx1 + 1] = v1.y; - s_u32[4*idx1 + 2] = v1.z; - s_u32[4*idx1 + 3] = v1.w; + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + const vec_t* g_vec_base = reinterpret_cast(g_u32); // 16 B each time + + // the src is 64 x 4 = 256, so is dest + auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base), + make_shape(Int{}), // 64 x uint128_t + make_stride(Int<1>{})); + bool g_valid = (chunk_local < valid_chunks); + + auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout); + auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout); + + // now we will load this + auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id); + auto tCs = local_partition(smem_vec2d, tile_layout, lane_id); + + // launch the copy inst. + if (g_valid) { + // gmem load → smem store:twice 16B + copy(tcopy, tCg, tCs); } else { - uint4* s4 = reinterpret_cast(s_u32); - int idx0 = lane_id; - int idx1 = lane_id + WARP_SIZE; - s4[idx0] = make_uint4(0u,0u,0u,0u); - s4[idx1] = make_uint4(0u,0u,0u,0u); + for (int i = 0; i < size(tCs); ++i) { + *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); + } } } + cp_async_fence(); + cp_async_wait<0>(); __syncthreads(); // sync all warps diff --git a/benchmark/perf.txt b/benchmark/perf.txt new file mode 100644 index 0000000..8656b7e --- /dev/null +++ b/benchmark/perf.txt @@ -0,0 +1 @@ +10.6 V100: 53927.17 MiB/s \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index a8ecbd1..52b0977 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -6,7 +6,7 @@ GiB = 1024*1024*1024 # bytes -> 1 GiB -cpu = torch.empty(GiB * 1, dtype=torch.uint8, pin_memory=True) +cpu = torch.empty(GiB * 1, dtype=torch.uint8) cpu[:] = ord('A') # 一次性 H2D(可重用) @@ -20,14 +20,14 @@ # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 for _ in range(2): - fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) + fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) -repeat = 3 # 1GiB × 5 已经很重,按机器调整 +repeat = 5 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): - cv_hex = fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) + cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() t1 = time.perf_counter() diff --git a/csrc/binding.cpp b/csrc/binding.cpp index d32b172..43290c4 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -178,7 +178,7 @@ static std::string blake3_gpu_root_hex(uint64_t device_ptr, std::array root{}; { GilRelease _g; - blake3_block_reduce_sm70(d_data, nbytes, &root, stream); + blake3_block_reduce_sm70_sm80(d_data, nbytes, &root, stream); } std::string b = cv_words_to_bytes_le(root); return bytes_to_hex(reinterpret_cast(b.data()), b.size()); @@ -206,7 +206,7 @@ PYBIND11_MODULE(flashashing, m) { // Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. // NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. 
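// A hedged note on terminology (sketch only): in this codebase the 32-byte
// output is derived from a root CV by one extra compression with FLAG_ROOT
// set, taking the little-endian bytes of st[0..7], e.g.
//   uint32_t st[16];
//   blake3_compress_words_7r(block, root_cv, 0ull, 64u, FLAG_ROOT, st);
// which is what blake3_digest32_from_root_cv() in csrc/blake3_sm70_sm80.cu
// does; the official BLAKE3 XOF instead sets ROOT on the root node's own
// final block compression.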
// )pbdoc");
-    m.def("blake3_gpu_sm70_hex",
+    m.def("blake3_gpu_sm70_sm80_hex",
          &blake3_gpu_root_hex,
          py::arg("d_data"), py::arg("nbytes"), py::arg("stream")=0,
          R"pbdoc(
diff --git a/csrc/blake3.h b/csrc/blake3.h
index 52e762a..27509c9 100644
--- a/csrc/blake3.h
+++ b/csrc/blake3.h
@@ -30,7 +30,7 @@ std::string bytes_to_hex(const uint8_t *data, size_t len);
 
 // ============== GPU implementations ================
 
-void blake3_block_reduce_sm70(const uint8_t* d_data,
+void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
                        uint64_t bytes_len,
                        std::array<uint32_t, 8>* root_out = nullptr,
                        cudaStream_t stream = 0);
\ No newline at end of file
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
new file mode 100644
index 0000000..95b282f
--- /dev/null
+++ b/csrc/blake3_sm70_sm80.cu
@@ -0,0 +1,993 @@
+
+#include "cute/numeric/int.hpp"
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+
+#include <cstdlib>
+#include <array>
+
+#if __CUDA_ARCH__ >= 800
+    #include <cute/arch/copy_sm80.hpp>
+#endif
+
+#include <cute/tensor.hpp>
+#include <cute/layout.hpp>
+#include <cute/atom/copy_atom.hpp>
+
+#define WARP_SIZE 32
+#define LDST128BITS(value) (reinterpret_cast<float4 *>(&(value))[0])
+
+#define CUDA_CHECK(expr) do {                                           \
+        cudaError_t _e = (expr);                                        \
+        if (_e != cudaSuccess) {                                        \
+            fprintf(stderr, "CUDA error %s at %s:%d: %s\n",             \
+                    #expr, __FILE__, __LINE__, cudaGetErrorString(_e)); \
+            std::abort();                                               \
+        }                                                               \
+    } while(0)
+
+
+using namespace cute;
+
+using vec_t = cute::uint128_t;   // one 16 B load at a time
+
+#if __CUDA_ARCH__ >= 800
+    // SM80 branch
+    using Atom = cute::Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<vec_t>, vec_t>;
+#else
+    // Volta/Turing (SM70/75): generic copy
+    using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
+#endif
+
+__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = {
+    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
+    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+};
+
+enum : uint32_t {
+    FLAG_CHUNK_START        = 1u << 0,
+    FLAG_CHUNK_END          = 1u << 1,
+    FLAG_PARENT             = 1u << 2,
+    FLAG_ROOT               = 1u << 3,
+    FLAG_KEYED_HASH         = 1u << 4,
+    FLAG_DERIVE_KEY_CONTEXT = 1u << 5,
+    FLAG_DERIVE_KEY_MATERIAL= 1u << 6,
+};
+
+__device__ __noinline__
+uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) {
+    uint32_t f = 0;
+    f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START;
+    f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END;
+    if (is_root_chunk) f |= FLAG_ROOT;  // only when the whole message is this single chunk, i.e. this chunk is the root
+    return f;
+}
+
+__device__ __forceinline__
+uint32_t blake3_parent_flags(bool is_root_parent) {
+    return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0);
+}
+
+// ---- small helpers ----
+__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) {
+#if defined(__CUDA_ARCH__)
+    return __funnelshift_r(x, x, n);
+#else
+    return (x >> n) | (x << (32 - n));  // host path
+#endif
+}
+
+__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) {
+#if defined(__CUDA_ARCH__)
+    const uint4 v = *reinterpret_cast<const uint4*>(src);
+    dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w;
+#else
+    std::memcpy(dst, src, 16);
+#endif
+}
+
+__host__ __device__ void blake3_compress_words_7r(
+    const uint32_t block_words[16],  // 64B -> shared memory
+    const uint32_t cv[8],            // 8 x u32 -> shared memory
+    uint64_t chunk_counter,          // 64-bit
+    uint32_t block_len,              // [0..64]
+    uint32_t flags,                  // CHUNK_START/END/PARENT/ROOT...
+    uint32_t out_state[16])          // returns the 16 x u32 state vector (per the spec)
+{
+    // TODO: implement the 7 rounds of the BLAKE3 G function (based on BLAKE2s)
+    // placeholder for now: mixes IV+cv into out_state; replace with the real implementation
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_state[i] = cv[i];
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_state[8+i] = BLAKE3_IV[i];
+
+    out_state[12] ^= (uint32_t)chunk_counter;
+    out_state[13] ^= (uint32_t)(chunk_counter >> 32);
+    out_state[14] ^= block_len;
+    out_state[15] ^= flags;
+
+    // so far the block_words are still pointers;
+    // now we load them into registers, as pointed out by the ncu profile
+    uint32_t block_reg_1[4];
+
+#pragma unroll
+    for (int i = 0; i < 16; i += 4) {  // the gap is 4
+        // load_u128_u32x4(block_words + i, block_reg_1);
+        out_state[i] ^= block_words[i];
+        // a little stirring (placeholder)
+        out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31);
+    }
+}
+
+// derive the new CV from out_state (per the spec: state[0..7] ^ state[8..15])
+__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_cv[i] = st[i] ^ st[8+i];
+}
+
+// swap-table
+// BLAKE3 message schedule: rows are P^r, r=0..6.
+// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2)
+// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf
+__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = {
+    // r = 0: identity
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+    // r = 1: P
+    { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 },
+    // r = 2: P∘P
+    { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 },
+    // r = 3
+    { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 },
+    // r = 4
+    { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 },
+    // r = 5
+    { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 },
+    // r = 6
+    { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 },
+};
+
+// get the round-r, index-k message word; it is broadcast from the lane with li = k
+__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) {
+    int src = B3_MSG_SCHEDULE[round][k];
+    return __shfl_sync(mask16, m_lane, src, 16);
+}
+
+__device__ __noinline__
+uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d,
+                       uint32_t mx, uint32_t my, int role)
+{
+    // run the 32-bit BLAKE2s G sequence to compute a', b', c', d', then return the value for the current role
+    uint32_t a = v_self, b = v_b, c = v_c, d = v_d;
+
+    // a = a + b + mx;
+    // d ^= a;
+    // d >>>= 16
+    a = a + b + mx;
+    d ^= a;
+    d = rotr32(d, 16);
+
+    // c = c + d;
+    // b ^= c;
+    // b >>>= 12
+    c = c + d;
+    b ^= c;
+    b = rotr32(b, 12);
+
+    // a = a + b + my;
+    // d ^= a;
+    // d >>>= 8
+    a = a + b + my;
+    d ^= a;
+    d = rotr32(d, 8);
+
+    // c = c + d;
+    // b ^= c;
+    // b >>>= 7
+    c = c + d;
+    b ^= c;
+    b = rotr32(b, 7);
+
+    // role choice:
+    switch (role) {
+        case 0:  return a;
+        case 1:  return b;
+        case 2:  return c;
+        default: return d;
+    }
+}
+
+// notice that this function processes 2 chunks each time.
+// - chunk_words_row: current chunk
+// - out_cv: written by lane 0 or lane 16
+__device__ __noinline__
+void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row,  // 256 x u32 -> 1024 bytes, from shared memory
+                                                                                   // so chunks_row advances by a gap of 2
+                                    int chunk_len_bytes,
+                                    uint64_t chunk_counter,
+                                    uint32_t out_cv[8],
+                                    unsigned mask16) {
+    // compute indices
+    int lane = threadIdx.x & 31;  // lane_id: 0-31
+    int sub  = lane >> 4;         // 0/1
+    int li   = lane & 15;         // 0..15, logical lane id; e.g. lane 16 maps to li=0
+    int role = li & 3;            // a/b/c/d role
+    int base = (sub << 4);        // 0 or 16, the absolute base
+
+    const int nblocks = (chunk_len_bytes + 63) >> 6;  // ceil(chunk_len/64)
+
+    int warp_id = threadIdx.x / WARP_SIZE;
+
+    // initialize
+    uint32_t cv_word = 0;
+    if (li < 8) cv_word = BLAKE3_IV[li];
+
+    // process all blocks;
+    // here a 1024-byte chunk has 1024 / 64 = 16 blocks,
+    // each block is 64 B -> 16 x u32
+    for (int b = 0; b < nblocks; ++b) {
+        // each lane holds one u32;
+        // 16 lanes hold 16 x 4 = 64 B -> one block,
+        // the other 16 lanes hold the opposite 64 B
+        const uint32_t m_lane = chunk_words_row[b * 16 + li];
+
+        // initialize v: v[0..7] = cv, v[8..15] = IV, then v[12..15] ^= t/len/flags
+        // first prepare the initial value for this lane's own index:
+        uint32_t v = (li < 8)
+                   ? cv_word             // v[i] (i<8)
+                   : BLAKE3_IV[li - 8];  // v[8..15] <- IV
+
+        // counter / length / flags (per the BLAKE3 spec)
+        const uint32_t t0 = (uint32_t)chunk_counter;
+        const uint32_t t1 = (uint32_t)(chunk_counter >> 32);
+        const int remain = chunk_len_bytes - (b << 6);
+        const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain;
+
+        const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false);
+
+        // XOR the corresponding fields only into indices 12..15 (branch-free, via predicate masks)
+        v ^= (li == 12) ? t0        : 0u;
+        v ^= (li == 13) ? t1        : 0u;
+        v ^= (li == 14) ? block_len : 0u;
+        v ^= (li == 15) ? flags     : 0u;
+
+        // write the index as li = q + 4*rq; the diagonal step first permutes li' = (rq<<2) | ((q+rq)&3)
+        int q  = (li & 3);
+        int rq = (li >> 2);
+        int li_diag = (rq << 2) | ((q + rq) & 3);
+        int li_undo = (rq << 2) | ((q - rq) & 3);
+        int gi_col  = q;              // 0..3
+        int gi_diag = (li_diag & 3);  // 0..3
+
+        // ===== 7 rounds =====
+        #pragma unroll 4
+        for (int r = 0; r < 7; ++r) {
+            // inside this loop each lane does one job:
+            // 16 lanes execute 16 x 2 G-updates,
+            // which would be 8 G operations in sequential code
+
+            // ---- column step (quartets: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}) ----
+            {
+                // fetch b/c/d of the same quartet (based on the current v)
+                uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16);
+
+                // this quartet's i ∈ {0,1,2,3}; the column step uses msg indices 0..7 (in pairs)
+
+                uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16);
+
+                v = G_update_role(v, vb, vc, vd, mx, my, role);
+            }
+
+            // ---- diagonal step ----
+            {
+                // read the current v in the diagonally-permuted domain
+                uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16);
+
+                // fetch the same four neighbours as in the column step, inside that domain
+                uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16);
+
+                // the 4 diagonal G's use the second half of this round's message pairs (indices 8..15)
+
+                uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16);
+
+                uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role);
+
+                // permute back to the original position: li_undo = (rq<<2) | ((q - rq) & 3)
+
+                // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16);
+                v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16);
+            }
+        }  // 7 rounds end
+
+        // derive the new CV: cv[i] = v[i] ^ v[i+8] (effective only for li=0..7)
+        uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16);
+        if (li < 8) {
+            cv_word = v ^ vip8_all;
+        }
+
+        // continue with the next block (the 16 blocks are serial within this function)
+    }
+
+    // lane0 / lane16 collect the 8 x u32 output
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        uint32_t wj = __shfl_sync(mask16, cv_word, j, 16);  // all 16 lanes must call this
+        if (li == 0) out_cv[j] = wj;                        // only li==0 stores
+    }
+}
+
+__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){
+    uint32_t msg[16];
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[i] = L[i];
+    }
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[8+i] = R[i];
+    }
+    uint32_t st[16];
+    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st);
+    blake3_state_to_cv(st, out_cv);
+}
+
+
+
+// ============ Big kernel: 16 WARPS in total ============
+// grid: (chunks / 64), thread: (512,)
+template <const int NUM_THREADS,
+          const int CHUNK_SIZE,
+          const int CHUNKS_PER_BLOCK,  // 32 chunks -> 32 KB
+          const int PAD_CHUNK=16,
+          const int PAD_CV=0>          // pad shared memory
+__global__ void blake3_block_reduce_kernel(uint32_t* d_input,
+                                           uint32_t* block_cvs,
+                                           int chunk_len_bytes,
+                                           uint64_t base_chunk_counter,
+                                           int total_chunks) {
+    // NUM_WARPS also stands for the number of chunks per block
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE;  // consider it as 16
+    constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK;
+
+    static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned");
+    static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4");
+
+    // 8 x u32 -> one chain value; we have `NUM_WARPS` chain values in total
+    // 8 x 4 x 64 = 2 KiB shared memory in sum
+    __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV];  // avoid bank conflicts
+
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;  // 256
+    constexpr int VEC_ELEMS = 4;                     // uint4, 16B
+    constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS);  // 64 (per 1 KiB)
+    constexpr int WARPS_PER_CTA = NUM_WARPS;  // 16
+
+    // 4 bytes x 256 x 64 = 64 KiB shared memory.
+    __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK];  // [64][256+PAD]
+
+    const int tid = threadIdx.x;
+    const int bx = blockIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    // ============== STAGE 1: Coalesced Global Memory Loading ==============
+    const int tile_id = blockIdx.x;
+    const int tile_base = tile_id * CHUNKS_PER_BLOCK;  // the first chunk this block loads
+
+    int valid_chunks = total_chunks - tile_base;
+    if (valid_chunks <= 0) {
+        return;  // overflow
+    }
+    if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK;
+
+    // use CuTe for loading
+
+    auto make_smem_tensor = [&](int chunk_local /* chunk id within shared memory */) {
+        // smem index: chunk_smem[chunk_local][0]
+        // row length: (WORDS_PER_CHUNK + PAD_CHUNK)
+        uint32_t* s_u32 = &chunk_smem[chunk_local][0];
+        vec_t* s_vec = reinterpret_cast<vec_t*>(s_u32);  // each element is 16B
+        return make_tensor(make_smem_ptr(s_vec),
+                           make_shape(Int<VEC_PER_CHUNK>{}),  // 64 x 4 = 256, one row of shared memory
+                           make_stride(Int<1>{}));            // contiguous
+    };
+
+    // shared memory: 256 words = 64 x vec_t(4)
+    // thread layout: 32 threads
+    //   thread 0:  0, 32
+    //   thread 1:  1, 33
+    //   thread 31: 31, 63
+    // so the loads are coalesced
+    using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
+
+    // thread layout. (lane,0)->idx=2*lane, (lane,1)->idx=2*lane+1
+    auto thr_layout = make_layout(
+        make_shape(Int<2>{}),
+        make_stride(Int<1>{})
+    );
+
+    auto tile_layout = make_layout(
+        make_shape(Int<WARP_SIZE>{}, Int<2>{}),   // (32,2)
+        make_stride(Int<1>{}, Int<WARP_SIZE>{})   // idx = lane + k2*32
+    );
+
+    // build the actual copy
+    TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout);
+
+    // (unused) this would build the per-thread copy partition
+    // auto thr_copy = local_partition(tcopy, lane_id);
+
+
+    for (int ldt = 0; ldt < 4; ldt++) {  // each warp loads 4 chunks
+        int chunk_local = ldt * WARPS_PER_CTA + warp_id;  // ldt*16 + warp -> start chunk
+        int chunk_global = tile_base + chunk_local;       // global chunk idx
+
+        // the pointer for shared memory
+        auto smem_vec1d = make_smem_tensor(chunk_local);  // 64 x uint128_t
+
+        // only read from global when it's valid,
+        // otherwise fill with 0
+        const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK;
+        const vec_t* g_vec_base = reinterpret_cast<const vec_t*>(g_u32);  // 16 B at a time
+
+        // the src is 64 x 4 = 256, and so is the dest
+        auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base),
+                                      make_shape(Int<VEC_PER_CHUNK>{}),  // 64 x uint128_t
+                                      make_stride(Int<1>{}));
+        bool g_valid = (chunk_local < valid_chunks);
+
+        auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout);
+        auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout);
+
+        // partition what each lane will move
+        auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id);
+        auto tCs = local_partition(smem_vec2d, tile_layout, lane_id);
+
+        // launch the copy inst.
+        if (g_valid) {
+            // gmem load -> smem store: two 16B transfers
+            copy(tcopy, tCg, tCs);
+        } else {
+            for (int i = 0; i < size(tCs); ++i) {
+                *reinterpret_cast<vec_t*>(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0);
+            }
+        }
+    }
+    cp_async_fence();
+    cp_async_wait<0>();
+
+    __syncthreads();  // sync all warps
+
+    // ============== STAGE 2: Compress leaves into 64 chain values ==============
+    const int pass0_valid = min(32, valid_chunks);      // pass0 covers chunks [0, 31]
+    const int pass1_valid = max(0, valid_chunks - 32);  // pass1 covers chunks [32, 63]
+
+    __shared__ int parents_count;
+    if (threadIdx.x == 0) {
+        const int parents0 = (pass0_valid + 1) >> 1;
+        const int parents1 = (pass1_valid + 1) >> 1;
+        parents_count = parents0 + parents1;  // <= 32
+    }
+    __syncthreads();
+
+    // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n");
+
+    // written by each warp's lane0 and lane16,
+    // to decrease the register usage.
+    __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV];  // 2 KiB
+
+    // lambda: compress one pass of chunks
+    auto do_big_pass = [&](int base /* 0 or 32 */, int pass_valid) {
+        // left=base+2*warp_id, right=left+1
+        const int left  = base + (warp_id << 1);  // base + 0,2,4,6,...
+        const int right = left + 1;
+        const int left_rel  = left - base;        // 0..31
+        const int right_rel = right - base;       // 1..32
+        const bool has_left  = (left_rel < pass_valid);
+        const bool has_right = (right_rel < pass_valid);
+
+        // const int lane_id = threadIdx.x & 31;
+        const int sub = lane_id >> 4;  // sub-warp: 0 or 1; lanes 0-15: sub-warp 0, lanes 16-31: sub-warp 1
+        const int li  = lane_id & 15;  // 0..15
+        const unsigned mask16 = (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u);
+
+        const int chunk_local = left + sub;  // sub=0 -> left, sub=1 -> right
+        const bool active = (sub==0 ? (left - base) < pass_valid
+                                    : (right - base) < pass_valid);
+
+        // uint32_t my_cv[8];
+
+        // the left and right sub-warps execute the same code;
+        // the index is distinguished by computation,
+        // to avoid warp divergence
+        if (active) {
+            // chunk_local identifies the left or the right chunk, so do not worry.
+            const uint32_t* row = &chunk_smem[chunk_local][0];
+            const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local);
+            blake3_leaf_cv_simd16_onechunk(row,
+                                           chunk_len_bytes,
+                                           cc,
+                                           &tmp_cv[chunk_local][0],
+                                           mask16);
+        }
+
+        __syncwarp();  // make sure both halves have written into `tmp_cv`
+
+        // now we have computed the two chunks' CVs;
+        // merge them into one parent CV
+        if (lane_id == 0 && has_left) {
+            const uint32_t* lcv = &tmp_cv[left][0];
+            uint32_t parent[8];
+            if ((right - base) < pass_valid) {
+                const uint32_t* rcv = &tmp_cv[right][0];
+                blake3_parent_cv(lcv, rcv, parent);
+            } else {  // odd count: promote the left CV directly
+                #pragma unroll
+                for (int j = 0 ; j < 8; ++j)
+                    parent[j] = lcv[j];
+            }
+
+            // one warp compresses 2 chunks and yields one parent CV
+            const int pair_idx = (base >> 1) + warp_id;  // warp_id, or 16 + warp_id
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                cv_smem[pair_idx][j] = parent[j];
+        }
+
+        __syncwarp();  // NOTICE: this is necessary!
+    };  // do_big_pass
+
+    // big pass 1: chunks 0-31
+    do_big_pass(/*base=*/0, pass0_valid);
+
+    // if (bx == 0) printf("Finish 1 big pass\n");
+
+    // big pass 2: chunks 32-63
+    do_big_pass(/*base=*/32, pass1_valid);
+
+    // if (bx == 0) printf("Finish 2 big pass\n");
+
+    __syncthreads();
+
+    // printf("Stage 2 done!!!\n");
+
+    // right now we have got 32 chain values;
+    // a warp-reduce merges them.
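+    // Worked example of the tree shape so far (illustrative, assuming a
+    // full 64-chunk tile): STAGE 2 produced 64 leaf CVs and paired them
+    // into 32 parent CVs in cv_smem. STAGE 3 below keeps pairing
+    // 32 -> 16 -> 8 -> 4 -> 2 -> 1 with __shfl_down_sync, so each CTA
+    // emits exactly one block-level CV; an unpaired left node is promoted
+    // unchanged, matching the BLAKE3 tree rule for odd counts.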
+
+    // ============== STAGE 3: Block-Reduce ==============
+    // 32 - 16 - 8 - 4 - 2 - 1, to get one block root CV
+    // only warp 0 handles this
+    if (warp_id == 0) {
+        uint32_t cv[8] = {0,0,0,0,0,0,0,0};
+
+        const bool active_lane = (lane_id < parents_count);
+        if (active_lane) {
+            #pragma unroll
+            for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j];
+        }
+
+        // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1
+        unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane);
+        int cur_n = parents_count;  // number of valid nodes in the current layer (updated per level)
+
+        for (int step = 1; step < WARP_SIZE; step <<= 1) {
+            // right neighbour
+            uint32_t nbr[8];
+            #pragma unroll
+            for (int j = 0; j < 8; ++j) {
+                nbr[j] = __shfl_down_sync(mask, cv[j], step);
+            }
+
+            // safety checks
+            const bool do_pair =
+                (lane_id % (step << 1) == 0) &&  // this lane is a left node
+                (lane_id + step < cur_n)     &&  // the right node is within this layer's valid range
+                (lane_id < cur_n);               // the left node itself must also be valid
+
+            if (do_pair) {
+                blake3_parent_cv(cv, nbr, cv);   // parent(left, right) -> cv
+            }
+
+            cur_n = (cur_n + 1) >> 1;
+            __syncwarp(mask);
+        }
+
+        // 3) write back to global memory
+        if (lane_id == 0 && parents_count > 0) {
+            const int tile_id = blockIdx.x;
+            uint32_t* out = block_cvs + (size_t)tile_id * 8;  // 8 x 4 = 32 B
+
+            // two different ways to write
+            #if 0
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                out[j] = cv[j];
+            #else
+            // block_cvs must come from cudaMalloc (16B-aligned)
+            reinterpret_cast<uint4*>(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]);
+            reinterpret_cast<uint4*>(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]);
+            #endif
+        }
+    }
+}  // blake3_block_reduce_kernel
+
+__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) {
+    const uint4* g4 = reinterpret_cast<const uint4*>(g);
+    uint4 a = g4[0], b = g4[1];
+    r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w;
+    r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w;
+}
+
+__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
+    uint4* g4 = reinterpret_cast<uint4*>(g);
+    g4[0] = make_uint4(r[0],r[1],r[2],r[3]);
+    g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
+}
+
+// ============ Tiny kernel ============
+// The big kernel consumes 64 KiB per block.
+// For a 1 GiB corpus it produces 1 x 1024 x 1024 / 64 roots = 16384 roots,
+// and this tiny kernel is designed to reduce those 16384 roots.
+template <const int NUM_THREADS, const int TILE_CVS, const int PAD>
+__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32,
+                                              uint32_t* __restrict__ out_cv32,
+                                              int N)
+{
+    extern __shared__ __align__(16) uint32_t smem[];  // dynamic SMEM; needs >= TILE_CVS*8*4 bytes
+    // viewed as 2D: [TILE_CVS][8+PAD]
+    uint32_t* cv_tile = smem;
+
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;  // 0..15
+    const int lane_id = tid % WARP_SIZE;  // 0..31
+
+    // start of the tile this block is responsible for
+    const int tile_start = blockIdx.x * TILE_CVS;
+    if (tile_start >= N) return;
+
+    // when N == 8 this is simply 8
+    const int tile_n = min(TILE_CVS, N - tile_start);  // actual number of CVs in this tile (<= 2048)
+
+    // ---------------- Stage 1: coalesced loading into SMEM ----------------
+    // each thread moves several CVs: i = tid, tid+blockDim, ...
+    for (int i = tid; i < tile_n; i += NUM_THREADS) {  // note: i starts at tid, not 0
+        const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8;
+        uint32_t* s = cv_tile + (size_t)i * (8 + PAD);
+        // two 16B transfers
+        const uint4* g4 = reinterpret_cast<const uint4*>(g);
+        uint4* s4 = reinterpret_cast<uint4*>(s);
+        // s4[0] = g4[0];
+        // s4[1] = g4[1];
+
+        // in case the smem address is not 16B-aligned
+        uint4 v0 = g4[0];
+        uint4 v1 = g4[1];
+
+        s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w;
+        s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w;
+    }
+    // no need to zero the tail when tile_n < TILE_CVS; later stages honour the valid range
+    __syncthreads();
+
+    // ---------------- Stage 2: per-thread 4 -> 1 (keeping adjacent pairs) ----------------
+    // there are reduced_n0 = ceil(tile_n / 4) lane-roots in total
+    const int reduced_n0 = (tile_n + 3) >> 1 >> 1;  // equivalent to (tile_n+3)/4
+    uint32_t lane_cv[8];    // the lane-root this thread produces
+    bool lane_valid = false;
+
+    // start index of this thread's 4 inputs
+    int base4 = tid << 2;  // tid*4
+    if (base4 < tile_n) {
+        // read up to 4 adjacent CVs: idx = base4 + 0,1,2,3
+        uint32_t a[8], b[8], c[8], d[8];
+        const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
+        load_cv_g2r(s0, a);
+
+        int remain = tile_n - base4;
+
+        if (remain >= 2) {
+            const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
+            load_cv_g2r(s1, b);
+        }
+        if (remain >= 3) {
+            const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD);
+            load_cv_g2r(s2, c);
+        }
+        if (remain >= 4) {
+            const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD);
+            load_cv_g2r(s3, d);
+        }
+
+        // two levels of adjacent pairing (odd nodes are promoted)
+        if (remain == 1) {
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                lane_cv[j] = a[j];
+        } else if (remain == 2) {
+            blake3_parent_cv(a, b, lane_cv);
+        } else if (remain == 3) {
+            uint32_t p01[8];
+            blake3_parent_cv(a, b, p01);
+            blake3_parent_cv(p01, c, lane_cv);  // (0,1)->p01, (p01,c)->lane_cv
+        } else {  // remain >= 4
+            uint32_t p01[8], p23[8];
+            blake3_parent_cv(a, b, p01);
+            blake3_parent_cv(c, d, p23);
+            blake3_parent_cv(p01, p23, lane_cv);
+        }
+        lane_valid = true;
+    }
+
+    // ---------------- Stage 3: 32 -> 1 adjacent pairing within each warp ----------------
+    // each warp owns one contiguous segment: warp_base = warp_id*32
+    const int warp_base = warp_id * WARP_SIZE;
+    const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base));  // valid entries in this warp's segment
+
+    // keep lane_cv in registers for the reduction; invalid lanes are culled via the mask
+    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4));  // existence check only
+    int cur_n = cur_n_w;
+
+    // mark out-of-segment lanes invalid (avoids out-of-bounds reads)
+    bool active_lane = (lane_id < cur_n_w);
+
+    // clear the values of invalid lanes to 0 (they are never used)
+    if (!active_lane) {
+        #pragma unroll
+        for (int j = 0; j < 8; ++j)
+            lane_cv[j] = 0u;
+    }
+
+    // pair level by level: steps 1,2,4,8,16 (warp-reduce)
+    for (int step = 1; step < WARP_SIZE; step <<= 1) {
+        // fetch the right neighbour
+        uint32_t nbr[8];
+        #pragma unroll
+        for (int j = 0; j < 8; ++j)
+            nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step);
+
+        const bool do_pair =
+            active_lane &&
+            ((lane_id % (step<<1)) == 0) &&
+            (lane_id + step < cur_n);
+
+        if (do_pair) {
+            blake3_parent_cv(lane_cv, nbr, lane_cv);
+        }
+
+        cur_n = (cur_n + 1) >> 1;
+        // __syncwarp();
+    }
+
+    // each segment's result lives in lane0; write the 16 warp-roots into the first 16 SMEM rows
+    __shared__ uint32_t warp_roots[WARP_SIZE/2][8];  // 16 x 8
+    if (lane_id == 0 && cur_n_w > 0) {
+        #pragma unroll
+        for (int j=0;j<8;++j)
+            warp_roots[warp_id][j] = lane_cv[j];
+    }
+    __syncthreads();
+
+    // ---------------- Stage 4: 16 -> 1 adjacent pairing within the CTA ----------------
+    // number of valid warps: ceil(reduced_n0 / 32)
+    int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE;  // 0..16
+    if (valid_warps == 0) return;
+
+    // lane 0 of each warp does the computation; the other lanes idle
+    for (int stride = (valid_warps >> 1); stride >= 1; stride >>= 1) {
+        if (warp_id < stride && lane_id == 0) {
+            uint32_t p[8];
+            blake3_parent_cv(&warp_roots[2*warp_id][0],
+                             &warp_roots[2*warp_id + 1][0], p);
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                warp_roots[warp_id][j] = p[j];
+        }
+        __syncthreads();
+        // odd node is promoted
+        if ((valid_warps & 1) && warp_id==0 && lane_id==0) {
+            #pragma unroll
+            for (int j=0;j<8;++j)
+                warp_roots[stride][j] = warp_roots[valid_warps-1][j];
+        }
+        __syncthreads();
+        valid_warps = (valid_warps + 1) >> 1;
+    }
+
+    // write back this block's root
+    if (threadIdx.x == 0) {
+        uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8;
+        #pragma unroll
+        for (int j = 0; j < 8; ++j)
+            out[j] = warp_roots[0][j];
+    }
+}
+
+inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
+    const uint32_t zero_block[16] = {0};
+    uint32_t st[16];
+    blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st);
+    // write out the first 32 bytes (state[0..7], little-endian)
+    for (int i = 0; i < 8; ++i) {
+        uint32_t w = st[i];
+        out32[4*i+0] = (uint8_t)( w        & 0xFF);
+        out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF);
+        out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF);
+        out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF);
+    }
+}
+
+// wrapper function
+void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
+                                   uint64_t bytes_len,
+                                   std::array<uint32_t, 8>* root_out = nullptr,
+                                   cudaStream_t stream = 0) {
+    if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) {
+        fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n",
+                (unsigned long long)bytes_len);
+        std::abort();
+    }
+
+    // int dev = -1;
+    // cudaGetDevice(&dev);
+    // printf("[dbg] my runtime current device = %d\n", dev);
+
+    // cudaPointerAttributes attr{};
+    // auto st = cudaPointerGetAttributes(&attr, d_data);
+    // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n",
+    //        (int)st, (int)attr.type, attr.device);
+
+    // cudaPointerAttributes attr{};
+    // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data));
+    // if (attr.type != cudaMemoryTypeDevice) {
+    //     fprintf(stderr, "d_data is not device memory!\n");
+    //     std::abort();
+    // }
+
+    int optin = 0, deflt = 0;
+    cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
+    cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
+
+    const int dyn_smem = 64 * 1024;  // 64 KiB
+
+    // opt these kernels in to the larger dynamic shared-memory carve-out
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_cv_block_reduce_kernel<512, 2048, 0>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_cv_block_reduce_kernel<32, 2048, 0>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+
+
+    constexpr int CHUNKS_PER_BLOCK = 32;                      // 16 * 32 = 512
+    constexpr int CHUNK_SIZE = 1024;                          // 1 KiB
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;           // 256
+    constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64;  // for the big kernel: 512 or 256
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE;  // 16 or 8
+    const int chunk_len_bytes = CHUNK_SIZE;  // 1 KiB per chunk
+    const uint64_t total_chunks = bytes_len / CHUNK_SIZE;
+    const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK);  // e.g. 16384 blocks
+
+    constexpr int pad_chunk = 16;
+    constexpr int pad_cv = 0;
+
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_block_reduce_kernel<NUM_THREADS, CHUNK_SIZE, CHUNKS_PER_BLOCK, pad_chunk, pad_cv>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 100));
+
+    uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
+    uint32_t* d_words = reinterpret_cast<uint32_t*>(d_bytes);  // alias
+    uint32_t* d_blockCV = nullptr;  // num_blocks x 8 u32
+
+    // here we cut the largest bottleneck: do not allocate GPU memory here, do it in PyTorch.
+    // TODO: use thrust
+    cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t));  // 512 KiB
+
+    // ============= launch the big kernel =============
+    dim3 grid_big(num_blocks);
+    dim3 block_big(NUM_THREADS);
+    uint64_t base_chunk_counter = 0ull;
+
+    blake3_block_reduce_kernel<NUM_THREADS, CHUNK_SIZE, CHUNKS_PER_BLOCK, pad_chunk, pad_cv>
+        <<<grid_big, block_big, 0, stream>>>(
+            d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks);
+
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    if (num_blocks == 1) {
+        std::array<uint32_t, 8> host_root{};
+        CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t),
+                                   cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+
+        // final post-processing
+        uint8_t digest32[32];
+        blake3_digest32_from_root_cv(host_root.data(), digest32);
+
+        if (root_out) *root_out = host_root;
+        else {
+            // simple printout
+            printf("root CV:");
+            for (int i=0;i<8;++i)
+                printf(" %08x", host_root[i]);
+            printf("\n");
+        }
+
+        CUDA_CHECK(cudaFree(d_blockCV));
+        CUDA_CHECK(cudaFree(d_bytes));
+        return;
+    }
+
+    // first round of the tiny kernel
+    // 1) reduce 16384 outputs -> 8
+    uint32_t* d_mid_out = nullptr;  // num_blocks x 8 u32
+    {
+        const int N = 16384;  // total number
+        const int TILE = 2048;
+        const int grid = (N + TILE - 1) / TILE;  // = 8
+        const int block = 512;
+        const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t);  // 2048 x 8 x 4 = 64 KiB
+
+        cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t));
+
+        blake3_cv_block_reduce_kernel<512, 2048, 0>
+            <<<grid, block, smem_bytes, stream>>>(d_blockCV /*in: 16384 x 8 x 4*/,
+                                                  d_mid_out /*out: 8 x 8*/, N);
+        CUDA_CHECK(cudaGetLastError());
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    // second round
+    uint32_t* d_root_cv = nullptr;
+    {
+        const int N = 8;
+        const int TILE = 2048;  // any value >= N works
+        const int grid = 1;
+        const int block = 32;   // 32 threads are enough
+        const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t);  // 8 x 8 x 4 = 8 x 32 B
+
+        cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t));
+
+        blake3_cv_block_reduce_kernel<32, 2048, 0>
+            <<<grid, block, smem_bytes, stream>>>(d_mid_out /*in: 8 x 8*/,
+                                                  d_root_cv /*out: 1 x 8*/, N);
+        CUDA_CHECK(cudaGetLastError());
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    std::array<uint32_t, 8> host_root{};
+    CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    // final post-processing
+    uint8_t digest32[32];
+    blake3_digest32_from_root_cv(host_root.data(), digest32);
+
+    if (root_out) {
+        *root_out = host_root;
+    } else {
+        printf("root CV:");
+        for (int i=0;i<8;++i) printf(" %08x", host_root[i]);
+        printf("\n");
+    }
+
+    // cleanup
+    CUDA_CHECK(cudaFree(d_mid_out));
+    CUDA_CHECK(cudaFree(d_root_cv));
+    CUDA_CHECK(cudaFree(d_blockCV));
+    // CUDA_CHECK(cudaFree(d_bytes));  // this memory is managed by torch, we do not free it.
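+    // Size check (illustrative, not asserted by this code): with a 1 GiB
+    // input there are 2^20 chunks, so the big kernel emits
+    // 2^20 / CHUNKS_PER_BLOCK block CVs; one 2048-way tiny-kernel pass and
+    // one final pass then reduce that to a single root CV. Note the
+    // hard-coded N = 16384 above assumes that big-kernel output count.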
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b62068c..68e8d6d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,5 @@ pandas==2.3.3
 pip-chill==1.0.3
 pybind11==3.0.1
 tomli==2.0.1
+torch
+blake3
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e047cc4..599e2f0 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,44 @@ def locate_cuda():
         cuda_libdir = os.path.join(cuda_home, "lib64")
     return {"home": cuda_home, "nvcc": nvcc, "include": cuda_include, "libdir": cuda_libdir}
 
+def locate_cutlass_cute():
+    """
+    Return the CUTLASS/CuTe include root (the directory containing cutlass/ and cute/).
+    The environment variables CUTLASS_HOME and CUTE_HOME take priority.
+    """
+    cand = []
+    if "CUTE_HOME" in os.environ:
+        cand.append(os.path.join(os.environ["CUTE_HOME"], "include"))
+        cand.append(os.environ["CUTE_HOME"])
+    if "CUTLASS_HOME" in os.environ:
+        cand.append(os.path.join(os.environ["CUTLASS_HOME"], "include"))
+        cand.append(os.environ["CUTLASS_HOME"])
+
+    cand += [
+        "/usr/local/include",                               # common on Linux
+        os.path.expanduser("~/cutlass/include"),            # cloned into the home directory
+        os.path.expanduser("~/CUTLASS/include"),
+        os.path.expanduser("~/third_party/cutlass/include"),
+        os.path.abspath("third_party/cutlass/include"),
+    ]
+
+    def ok(p):
+        return p and os.path.isdir(p) and \
+               os.path.isdir(os.path.join(p, "cute"))       # CuTe lives in cutlass/include/cute
+
+    for p in cand:
+        if ok(p):
+            return os.path.abspath(p)
+
+    raise RuntimeError(
+        "Cannot find CUTLASS/CuTe include path.\n"
+        "Set CUTLASS_HOME or CUTE_HOME to the repository root (the path that contains 'include/cute')."
+    )
+
 CUDA = locate_cuda()
+CUTLASS_INCLUDE = locate_cutlass_cute()
+
+print(f"Found cutlass include: {CUTLASS_INCLUDE}")
 
 CXX_STD = 17
 
@@ -42,7 +79,13 @@ def locate_cuda():
     NVCC_ARCH_FLAGS += ["-gencode", f"arch=compute_{a},code=sm_{a}"]
 
 COMMON_DEFINES = []
-COMMON_INCLUDES = [np.get_include(), pybind11.get_include(), pybind11.get_include(user=True), CUDA["include"]]
+COMMON_INCLUDES = [
+    np.get_include(),
+    pybind11.get_include(),
+    pybind11.get_include(user=True),
+    CUDA["include"],
+    CUTLASS_INCLUDE,
+]
 COMMON_LIB_DIRS = [CUDA["libdir"]]
 COMMON_LIBS = ["cudart"]
 RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else []
@@ -89,24 +132,10 @@ def locate_cuda():
     "csrc/sha256_base.cpp",
     "csrc/sha256_simd.cpp",
     "csrc/blake3_base.cpp",
-    "csrc/blake3_sm70.cu",
+    "csrc/blake3_sm70_sm80.cu",
     "csrc/binding.cpp",
 ]
 
-# ext = Pybind11Extension(
-#     "flashashing",
-#     sources=sources,
-#     include_dirs=COMMON_INCLUDES,
-#     library_dirs=COMMON_LIB_DIRS,
-#     libraries=COMMON_LIBS,
-#     extra_compile_args={
-#         "cxx": CXX_FLAGS,
-#         "nvcc": []
-#     },
-#     extra_link_args=LINK_FLAGS,
-#     define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES],
-# )
-
 setup(
     name="flashashing",
     ext_modules=[
@@ -117,6 +146,10 @@ def locate_cuda():
                 "cxx": CXX_FLAGS,
                 "nvcc": NVCC_FLAGS,
             },
+            include_dirs=COMMON_INCLUDES,
+            library_dirs=COMMON_LIB_DIRS,
+            libraries=COMMON_LIBS,
+            extra_link_args=LINK_FLAGS + (["-Wl,-rpath," + RPATH[0]] if RPATH else []),
         )
     ],
     cmdclass={"build_ext": BuildExtension},

From 263756c78e23f2e5e255e9b4088f8bba427590be Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Tue, 7 Oct 2025 14:09:26 +0800
Subject: [PATCH 10/20] bug fix for SM70
---
 README.md                | 2 ++
 csrc/blake3_sm70_sm80.cu | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c435acf..0a6cf25 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@ pip install -r requirements.txt
 # clone the cutlass repository
 git clone https://github.com/NVIDIA/cutlass.git ~/cutlass --depth 1
 
+export FLASHASHING_CUDA_ARCH_LIST=  # your arch here
+
 # install the cpp source file
 python setup.py install
 
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 95b282f..989eafa 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -9,7 +9,7 @@
 #include <cute/tensor.hpp>
 #include <cute/layout.hpp>
 
-#if __CUDA_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     #include <cute/arch/copy_sm80.hpp>
 #endif
 
@@ -34,7 +34,7 @@ using namespace cute;
 
 using vec_t = cute::uint128_t;   // one 16 B load at a time
 
-#if __CUDA_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     // SM80 branch
     using Atom = cute::Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<vec_t>, vec_t>;
 #else
@@ -408,7 +408,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
     //   thread 1:  1, 33
     //   thread 31: 31, 63
     // so the loads are coalesced
-    using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
+    // using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
 
     // thread layout. (lane,0)->idx=2*lane, (lane,1)->idx=2*lane+1
     auto thr_layout = make_layout(
@@ -464,8 +464,11 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
             }
         }
     }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     cp_async_fence();
     cp_async_wait<0>();
+#endif
 
     __syncthreads();  // sync all warps
 

From 7f8e35dbe17c4e26eec478d699884f96a31c4e13 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Tue, 7 Oct 2025 18:33:36 +0800
Subject: [PATCH 11/20] debug
---
 benchmark/perf.txt       |   3 +-
 benchmark/test_gpu.py    |  51 ++++++++-----
 csrc/blake3_sm70_sm80.cu | 150 ++++++++++++++++++++++++---------------
 setup.py                 |   9 +--
 4 files changed, 134 insertions(+), 79 deletions(-)

diff --git a/benchmark/perf.txt b/benchmark/perf.txt
index 8656b7e..2920e89 100644
--- a/benchmark/perf.txt
+++ b/benchmark/perf.txt
@@ -1 +1,2 @@
-10.6 V100: 53927.17 MiB/s
\ No newline at end of file
+10.7 V100: 54819.25 MiB/s
+     RTX 4090: 150585.83 MiB/s

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 52b0977..33e14a0 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -4,6 +4,9 @@
 import time
 import blake3
 
+check_accuracy = True
+check_perf = False
+
 GiB = 1024*1024*1024  # bytes -> 1 GiB
 
 cpu = torch.empty(GiB * 1, dtype=torch.uint8)
@@ -18,23 +21,37 @@
 
 # std_hex = blake3.blake3(data).hexdigest()
 
-# 2) warm-up: trigger JIT/driver initialization so the first run isn't slow
-for _ in range(2):
-    fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-torch.cuda.synchronize()
-
-# 3) actual timing: this measures end-to-end throughput (including H2D)
-repeat = 5  # 1 GiB x 5 is already heavy; adjust per machine
-t0 = time.perf_counter()
-for _ in range(repeat):
+if check_accuracy:
+    # 2) GPU version
     cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-torch.cuda.synchronize()
-t1 = time.perf_counter()
-
-t1 = time.perf_counter()
-elapsed = t1 - t0
-print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds")
-print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s")
-print("root CV (hex) =", cv_hex)
+    torch.cuda.synchronize()
+    print("GPU BLAKE3 Result: ", cv_hex)
+
+    # 1) CPU version
+    std_hex = blake3.blake3(cpu.numpy()).hexdigest()
+    print("std BLAKE3 Expected: ", std_hex)
+
+    assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!"
+    print("GPU BLAKE3 result matches CPU result!")
+
+if check_perf:
+    # 2) warm-up: trigger JIT/driver initialization so the first run isn't slow
+    for _ in range(2):
+        fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+    torch.cuda.synchronize()
+
+    # 3) actual timing: this measures end-to-end throughput (including H2D)
+    repeat = 5  # 1 GiB x 5 is already heavy; adjust per machine
+    t0 = time.perf_counter()
+    for _ in range(repeat):
+        cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+    torch.cuda.synchronize()
+    t1 = time.perf_counter()
+
+    elapsed = t1 - t0
+    print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds")
+    print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s")
+    print("root CV (hex) =", cv_hex)
 
 # print(f"std BLAKE3 Expected: {std_hex}")
\ No newline at end of file
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 989eafa..a9f3ec9 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -10,7 +10,7 @@
 #include <cute/layout.hpp>
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    #include <cute/arch/copy_sm80.hpp>
+#include <cute/arch/copy_sm80.hpp>
 #endif
 
@@ -365,7 +365,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
 
     // 8 x u32 -> one chain value; we have `NUM_WARPS` chain values in total
     // 8 x 4 x 64 = 2 KiB shared memory in sum
-    __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV];  // avoid bank conflicts
+    __shared__ __align__(16) uint32_t cv_smem[CHUNKS_PER_BLOCK / 2][8 + PAD_CV];  // avoid bank conflicts
 
     constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;  // 256
     constexpr int VEC_ELEMS = 4;                     // uint4, 16B
@@ -427,7 +427,6 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
 
     // (unused) this would build the per-thread copy partition
     // auto thr_copy = local_partition(tcopy, lane_id);
-
     for (int ldt = 0; ldt < 4; ldt++) {  // each warp loads 4 chunks
         int chunk_local = ldt * WARPS_PER_CTA + warp_id;  // ldt*16 + warp -> start chunk
@@ -456,8 +455,12 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
 
         // launch the copy inst.
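        // NOTE on the pairing below (per the CuTe cp.async API):
        // cp_async_fence() commits the async copies issued so far as one
        // group, and cp_async_wait<0>() blocks until every committed group
        // has landed in shared memory. Moving the fence/wait inside the
        // loop, as this patch does, is safe but waits once per
        // warp-iteration instead of once for all four chunk loads.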
if (g_valid) { - // gmem load → smem store:twice 16B + // gmem load → smem store: 16B x 2 copy(tcopy, tCg, tCs); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + cp_async_fence(); + cp_async_wait<0>(); +#endif } else { for (int i = 0; i < size(tCs); ++i) { *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); @@ -465,33 +468,34 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - cp_async_fence(); - cp_async_wait<0>(); -#endif - __syncthreads(); // sync all warps - // ============== STAGE 2: Compress Leaf to 64 chain value ============== - const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks - const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("Stage 1 finish processing\n"); + } +#endif + + // ============== STAGE 2: Compress Leaf to `CHUNKS_PER_BLOCK` chain value ============== + const int pass0_valid = min(CHUNKS_PER_BLOCK / 2, valid_chunks); // pass0 cover [0, CHUNKS_PER_BLOCK / 2] chunks + const int pass1_valid = max(0, valid_chunks - (CHUNKS_PER_BLOCK / 2)); // pass1 cover [CHUNKS_PER_BLOCK / 2, CHUNKS_PER_BLOCK] chunks __shared__ int parents_count; if (threadIdx.x == 0) { - const int parents0 = (pass0_valid + 1) >> 1; - const int parents1 = (pass1_valid + 1) >> 1; + const int parents0 = (pass0_valid + 1) >> 1; // 8 or 16 + const int parents1 = (pass1_valid + 1) >> 1; // 8 or 16 parents_count = parents0 + parents1; // ≤ 32 } __syncthreads(); - // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); - // this is for each warp's lane0 and lane16 written // to decrease the register usage. __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB // lambda function: compress this thing - auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // 64 chunks - 16 warps, 1 warp -> 4 chunks -> 2pass + // 32 chunks - 8 warps, 1 warp -> 4 chunks -> 2pass + auto do_big_pass = [&](int base /*0 or chunks/2 */, int pass_valid) { // left=base+2*warp_id, right=left+1 const int left = base + (warp_id << 1); // base + 0,2,4,6,... const int right = left + 1; @@ -499,33 +503,51 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, const int right_rel = right - base; // 1..32 const bool has_left = (left_rel < pass_valid); const bool has_right = (right_rel < pass_valid); - +#if defined(DBG_KERNEL) && DBG_KERNEL + if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { + printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); + } +#endif + // const int lane_id = threadIdx.x & 31; const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 const int li = lane_id & 15; // 0..15 - const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const unsigned full = __activemask(); + const unsigned submask = (sub==0 ? 0x0000FFFFu : 0xFFFF0000u) & full; + const unsigned mask16 = (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); const int chunk_local = left + sub; // sub=0→left, sub=1→right - const bool active = (sub==0 ? (left - base) < pass_valid - : (right - base) < pass_valid); + const bool valid_sub = (sub==0 ? 
(left_rel < pass_valid) : (right_rel < pass_valid)); // uint32_t my_cv[8]; // the left-sub-warp and right-sub-warp will execute the same code // distinguish the index by computing, // to avoid warp-divergence - if (active) { - // the chunk local identifies the left or right chunk, so do not worry. - const uint32_t* row = &chunk_smem[chunk_local][0]; - const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); - blake3_leaf_cv_simd16_onechunk(row, - chunk_len_bytes, - cc, - &tmp_cv[chunk_local][0], - mask16); + + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = valid_sub ? &chunk_smem[chunk_local][0] : &chunk_smem[0][0]; + uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr; + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (!valid_sub) { + printf("tile %d warp %d not valid in base %d, pass_valid %d\n", blockIdx.x, threadIdx.x/32, base, pass_valid); } +#endif + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + out, + submask); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("tile %d warp %d ready\n", blockIdx.x, threadIdx.x/32); + } +#endif - __syncwarp(); // make sure two warps written into `tmp_cv` + __syncwarp(full); // make sure two warps written into `tmp_cv` // now we have compute 2 chunks' cv // merge it to a parent cv @@ -548,46 +570,59 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, cv_smem[pair_idx][j] = parent[j]; } - __syncwarp(); // NOTICE: this is necessary! + __syncwarp(full); // NOTICE: this is necessary! }; // do_big_pass - // big-pass 1: computing 0-31 chunks - do_big_pass(/*base=*/0, pass0_valid); + // big-pass 1: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(0, pass0_valid); - // if (bx == 0) printf("Finish 1 big pass\n"); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass0_valid != 0) { + printf("Stage 2 - pass 1 finish processing\n"); + } +#endif - // big-pass 2: computing 32-63 chunks - do_big_pass(/*base=*/32, pass1_valid); + // big-pass 2: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(CHUNKS_PER_BLOCK / 2, pass1_valid); - // if (bx == 0) printf("Finish 2 big pass\n"); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass1_valid != 0) { + printf("Stage 2 - pass 2 finish processing, pass 1: %d\n", pass1_valid); + } +#endif __syncthreads(); - // printf("Stage 2 done!!!\n"); - // right now, we have got 32 chain values // a warp-reduce to merge. 
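+        // Lane-mapping example for the SIMD16 layout above (illustrative):
+        // lane 21 has sub = 21 >> 4 = 1 and li = 21 & 15 = 5, so it is a
+        // "b" lane (role = 1) holding state word v[5] of the *right* chunk
+        // and shuffling under mask 0xFFFF0000; lane 5 holds v[5] of the
+        // *left* chunk under mask 0x0000FFFF.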
// ============== STAGE 3: Block-Reduce ============== - // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // [32] - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv // we will only use warp 0 to handle this thing if (warp_id == 0) { uint32_t cv[8] = {0,0,0,0,0,0,0,0}; - const bool active_lane = (lane_id < parents_count); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("parents count: %d\n", parents_count); + } +#endif + + const bool active_lane = (lane_id < parents_count); // parents_count = CHUNKS_PER_BLOCK / 2 if (active_lane) { - #pragma unroll - for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + #pragma unroll + for (int j = 0; j < 8; ++j) + cv[j] = cv_smem[lane_id][j]; } - // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 - unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + // 2) warp reduce [32] -> 16 -> 8 -> 4 -> 2 -> 1 int cur_n = parents_count; // 当前层的有效节点数(逐层更新) - - for (int step = 1; step < WARP_SIZE; step <<= 1) { + + for (int step = 1; cur_n > 1; step <<= 1) { + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_id < cur_n); // right-neighbor uint32_t nbr[8]; - #pragma unroll + #pragma unroll for (int j = 0; j < 8; ++j) { nbr[j] = __shfl_down_sync(mask, cv[j], step); } @@ -607,22 +642,23 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } // 3) write back to global memory + // now, warp_id == 0 and lane_id == 0 will write to output ptr + // one block compute 32 chunks if (lane_id == 0 && parents_count > 0) { const int tile_id = blockIdx.x; uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B - // two different write ways - #if 0 - #pragma unroll - for (int j = 0; j < 8; ++j) - out[j] = cv[j]; - #else - // block_cvs should be cudaMalloc ed + // block_cvs should be cudaMalloced reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); - #endif } } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("Stage 3 - finish processing\n"); + } +#endif } // blake3_block_reduce_kernel __device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { diff --git a/setup.py b/setup.py index 599e2f0..630e58b 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ def ok(p): CXX_STD = 17 -arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;86;89").split(";") +arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;89").split(";") NVCC_ARCH_FLAGS = [] for a in arch_list: a = a.strip() @@ -84,18 +84,18 @@ def ok(p): pybind11.get_include(), pybind11.get_include(user=True), CUDA["include"], - CUTLASS_INCLUDE, + CUTLASS_INCLUDE, ] COMMON_LIB_DIRS = [CUDA["libdir"]] COMMON_LIBS = ["cudart"] RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] -debug = False +debug = os.environ.get("DBG_KERNEL", "0") == "1" CXX_FLAGS = [ f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" ] if not debug else [ - f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" + f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma", "-DDBG_KERNEL=1" ] LINK_FLAGS = [] @@ -119,6 +119,7 @@ def ok(p): "--expt-relaxed-constexpr", "--use_fast_math", "-G", "-lineinfo", + "-DDBG_KERNEL=1", ] + NVCC_ARCH_FLAGS if not sys.platform.startswith("win"): From 
1853bafae8b19dfa8a78594a227d5913e3dc88fe Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Tue, 7 Oct 2025 22:03:03 +0800 Subject: [PATCH 12/20] finish big kernel debug --- benchmark/test_gpu.py | 3 + csrc/blake3_sm70_sm80.cu | 257 +++++++++++++++++++++++++++------------ setup.py | 3 +- 3 files changed, 185 insertions(+), 78 deletions(-) diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 33e14a0..40cbb23 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -31,6 +31,9 @@ std_hex = blake3.blake3(cpu.numpy()).hexdigest() print("std BLAKE3 Expected: ", std_hex) + std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() + print("std BLAKE3 1KB: ", std_hex_1KB) + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" print("GPU BLAKE3 result matches CPU result!") diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu index a9f3ec9..ee5dd16 100644 --- a/csrc/blake3_sm70_sm80.cu +++ b/csrc/blake3_sm70_sm80.cu @@ -89,6 +89,23 @@ __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t ds #endif } +__device__ void print_cv(uint32_t cv[8]) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + auto get_byte = [&](int i) { + int w = i >> 2; // 第 i 个字节来自第 w 个 u32 + int off = (i & 3) * 8; // 在该 u32 中的偏移 + return (unsigned)((cv[w] >> off) & 0xFFu); + }; + + printf("block %d root CV (u32, little-endian words):", blockIdx.x); + for (int i = 0; i < 32; ++i) { + printf("%02x", get_byte(i)); + if ((i & 3) == 3) printf(" "); // 每 4 字节空格 + } + printf("\n"); + } +} + __host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B -> shared memory const uint32_t cv[8], // 8×u32 -> shared memory @@ -327,6 +344,80 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row } } +__device__ __noinline__ +void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32 + const uint32_t* __restrict__ R, // 8×u32 + uint32_t* __restrict__ out_cv, // 8×u32 + unsigned mask16) // half-warp masks for 16 lanes +{ + const int lane = threadIdx.x & 31; + const int li = lane & 15; // 0..15 half the warp + const int role = li & 3; + + // messages: the front 8 from L, and the latter 8 from R + const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8]; + + // v initialize + uint32_t v = (li < 8) ? BLAKE3_IV[li] : BLAKE3_IV[li - 8]; + + const uint32_t t0 = 0u; + const uint32_t t1 = 0u; + const uint32_t block_len = 64u; + const uint32_t flags = FLAG_PARENT; + + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len : 0u; + v ^= (li == 15) ? 
flags : 0u; + + // 与 leaf 相同的“列/对角”两步、共 7 轮 + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; + int gi_diag = (li_diag & 3); + + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // 列步 + { + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + // 对角步 + { + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + v = __shfl_sync(mask16, v_diag_new, li_undo, 16); + } + } + + // state -> CV:cv[i] = v[i] ^ v[i+8] + uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); + uint32_t cv_word = (li < 8) ? (v ^ vip8) : 0; + + // 半 warp 汇聚到 out_cv[0..7](仅 li==0 的 4×收集也可以) + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); + if (li == 0) out_cv[j] = wj; + } +} + __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ uint32_t msg[16]; #pragma unroll @@ -503,11 +594,11 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, const int right_rel = right - base; // 1..32 const bool has_left = (left_rel < pass_valid); const bool has_right = (right_rel < pass_valid); -#if defined(DBG_KERNEL) && DBG_KERNEL - if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { - printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); - } -#endif +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { +// printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); +// } +// #endif // const int lane_id = threadIdx.x & 31; const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 @@ -541,35 +632,43 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, cc, out, submask); -#if defined(DBG_KERNEL) && DBG_KERNEL - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("tile %d warp %d ready\n", blockIdx.x, threadIdx.x/32); - } -#endif __syncwarp(full); // make sure two warps written into `tmp_cv` - // now we have compute 2 chunks' cv - // merge it to a parent cv - if (lane_id == 0 && has_left) { +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (blockIdx.x == 0 && threadIdx.x == 0) printf("The simd16-lane res: \n"); +// print_cv(out); +// #endif + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + + if (has_left) { const uint32_t* lcv = &tmp_cv[left][0]; - uint32_t parent[8]; - if ((right - base) < pass_valid) { + + if (has_right) { const uint32_t* rcv = &tmp_cv[right][0]; - blake3_parent_cv(lcv, rcv, parent); - } else { // odd: up-flow directly - #pragma unroll - for (int j = 0 ; j < 8; ++j) - parent[j] = lcv[j]; - } - // now, one warp computes 2 chunks, yield one parent-cv value - const int pair_idx = 
(base >> 1) + warp_id; // 0, 16 + warp_id - #pragma unroll - for (int j = 0; j < 8; ++j) - cv_smem[pair_idx][j] = parent[j]; + // sub==0 half warp participate + const unsigned sub0_mask = 0x0000FFFFu & full; + if ((lane_id >> 4) == 0) { + blake3_parent_cv_simd16(lcv, rcv, &cv_smem[pair_idx][0], sub0_mask); + } + // sub==1 do not involve + } else { + // odd: left -> pair_idx + if ((lane_id >> 4) == 0) { + int li = lane_id & 15; + if (li < 8) cv_smem[pair_idx][li] = lcv[li]; + } + } } +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) printf("The 1st chunk merged res: \n"); + print_cv(cv_smem[pair_idx]); +#endif + __syncwarp(full); // NOTICE: this is necessary! }; // do_big_pass @@ -593,72 +692,76 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, __syncthreads(); - // right now, we have got 32 chain values + // right now, we have got CHUNKS_PER_BLOCK / 2 chain values // a warp-reduce to merge. // ============== STAGE 3: Block-Reduce ============== // [32] - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv // we will only use warp 0 to handle this thing - if (warp_id == 0) { - uint32_t cv[8] = {0,0,0,0,0,0,0,0}; - -#if defined(DBG_KERNEL) && DBG_KERNEL - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("parents count: %d\n", parents_count); - } -#endif - - const bool active_lane = (lane_id < parents_count); // parents_count = CHUNKS_PER_BLOCK / 2 - if (active_lane) { - #pragma unroll - for (int j = 0; j < 8; ++j) - cv[j] = cv_smem[lane_id][j]; - } - - // 2) warp reduce [32] -> 16 -> 8 -> 4 -> 2 -> 1 - int cur_n = parents_count; // 当前层的有效节点数(逐层更新) - - for (int step = 1; cur_n > 1; step <<= 1) { - unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_id < cur_n); - // right-neighbor - uint32_t nbr[8]; - #pragma unroll - for (int j = 0; j < 8; ++j) { - nbr[j] = __shfl_down_sync(mask, cv[j], step); + // the `parent_count` is the indicator + // for 256-8warps-32chunks, the parent count is 16 + if (parents_count > 0) { + // half-warp info + const int sub = lane_id >> 4; // 0/1 -> which sub warp it belong to + const int li = lane_id & 15; // 0..15 + const unsigned full = __activemask(); + const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + + int cur_n = parents_count; + + while (cur_n > 1) { + int pairs = cur_n >> 1; // the merge num this layer will do, for 16 parents, this start at 8 + // warp 0, 0-15: 0 + // warp 0, 16-31: 1 + // warp 1, 0-15: 2 + // warp 1, 16-31: 3 + // .. + // warp 7, 0-15: 14 + // warp 7, 16-31: 15 + int half_id = warp_id * 2 + sub; // half-warp id + + // each half-warp handle one pair: (left=2*half_id, right=left+1) + // only half warps will participate in + if (half_id < pairs) { + int left_idx = (half_id << 1); + int right_idx = left_idx + 1; + const uint32_t* L = &cv_smem[left_idx][0]; + const uint32_t* R = &cv_smem[right_idx][0]; + + // 半 warp SIMD16 合并:结果就地写回 left_idx + blake3_parent_cv_simd16(L, R, &cv_smem[left_idx][0], submask); } - // safety checking - const bool do_pair = - (lane_id % (step << 1) == 0) && // 左侧 - (lane_id + step < cur_n) && // 右侧在当前层有效范围内 - (lane_id < cur_n); // 左侧也必须有效 + __syncthreads(); // make sure all warps could see. 
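+
+            // (Illustrative host-side sketch of the fold this loop performs;
+            //  not from the original patch. `parent_cv` stands in for
+            //  blake3_parent_cv and `n` for parents_count. Note the sketch
+            //  compacts each level's results to indices 0..pairs-1, which is
+            //  the layout the next level's (2*half_id, 2*half_id+1) reads assume.)
+            //
+            //   void tree_fold(uint32_t cv[][8], int n) {
+            //       while (n > 1) {
+            //           int pairs = n >> 1;
+            //           for (int p = 0; p < pairs; ++p)        // merge neighbors
+            //               parent_cv(cv[2*p], cv[2*p+1], cv[p]);
+            //           if (n & 1)                             // odd: promote last
+            //               memcpy(cv[pairs], cv[n-1], 32);    // 8 x u32 = 32 B
+            //           n = pairs + (n & 1);
+            //       }                                          // root in cv[0]
+            //   }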
- if (do_pair) { - blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + // odd: up-flow + if (cur_n & 1) { + // 让 warp0 的 sub0 来搬运(li < 8) + if (warp_id == 0 && sub == 0 && li < 8) { + cv_smem[pairs][li] = cv_smem[cur_n - 1][li]; + } } - cur_n = (cur_n + 1) >> 1; - __syncwarp(mask); - } + __syncthreads(); // next - level - // 3) write back to global memory - // now, warp_id == 0 and lane_id == 0 will write to output ptr - // one block compute 32 chunks - if (lane_id == 0 && parents_count > 0) { - const int tile_id = blockIdx.x; - uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + cur_n = pairs + (cur_n & 1); + } - // block_cvs should be cudaMalloced - reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); - reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + // write out + if (warp_id == 0 && lane_id == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#if defined(DBG_KERNEL) && DBG_KERNEL + print_cv(cv_smem[0]); +#endif + reinterpret_cast(out)[0] = make_uint4(cv_smem[0][0], cv_smem[0][1], cv_smem[0][2], cv_smem[0][3]); + reinterpret_cast(out)[1] = make_uint4(cv_smem[0][4], cv_smem[0][5], cv_smem[0][6], cv_smem[0][7]); } } #if defined(DBG_KERNEL) && DBG_KERNEL - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Stage 3 - finish processing\n"); - } + printf("================================ Finishing all in big kernel! ================================\n"); #endif + } // blake3_block_reduce_kernel __device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { @@ -916,7 +1019,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data, constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; - const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks, 32768 for 32-size constexpr int pad_chunk = 16; constexpr int pad_cv = 0; @@ -932,7 +1035,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data, // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
pytorch.
// TODO: use thrust - cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB, 1M for 32-size // ============= launch big kernel ============= dim3 grid_big(num_blocks); diff --git a/setup.py b/setup.py index 630e58b..a45ea14 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,8 @@ def ok(p): "-g", "-O0", "-Xcompiler", "-fPIC", "--expt-relaxed-constexpr", "--use_fast_math", - "-G", "-lineinfo", + # "-G", + "-lineinfo", "-DDBG_KERNEL=1", ] + NVCC_ARCH_FLAGS From f802823de022d734d437512750a119a3916cec45 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Wed, 8 Oct 2025 17:22:35 +0800 Subject: [PATCH 13/20] debug tiny kernel --- README.md | 3 +- benchmark/perf.txt | 41 ++++++ benchmark/test_gpu.py | 9 +- csrc/blake3_sm70_sm80.cu | 290 ++++++++++++++++++++++++--------------- 4 files changed, 224 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 0a6cf25..88cb978 100644 --- a/README.md +++ b/README.md @@ -34,4 +34,5 @@ python benchmark/test_script.py + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s + 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) + 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s -+ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s \ No newline at end of file ++ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s ++ 10.8 - v6 - [commit:]: 15145.81 MiB/s \ No newline at end of file diff --git a/benchmark/perf.txt b/benchmark/perf.txt index 2920e89..d48f165 100644 --- a/benchmark/perf.txt +++ b/benchmark/perf.txt @@ -1,2 +1,43 @@ 10.7 V100: 54819.25 MiB/s RTX 4090: 150585.83 MiB/s + +(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py +Stage 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967 +Stage 2 - pass 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b +Stage 2 - pass 2 finish processing, pass 1: 16 +block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096 +================================ Finishing all in big kernel! ================================ +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552 +GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552 +std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b +std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236 +Traceback (most recent call last): + File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" + ^^^^^^^^^^^^^^^^^ +AssertionError: GPU BLAKE3 result does not match CPU result! 
+(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py +Stage 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967 +Stage 2 - pass 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b +Stage 2 - pass 2 finish processing, pass 1: 16 +block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096 +================================ Finishing all in big kernel! ================================ +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552 +GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552 +std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b +std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236 +Traceback (most recent call last): + File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" + ^^^^^^^^^^^^^^^^^ +AssertionError: GPU BLAKE3 result does not match CPU result! \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 40cbb23..f0696b2 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -4,8 +4,8 @@ import time import blake3 -check_accuracy = True -check_perf = False +check_accuracy = False +check_perf = True GiB = 1024*1024*1024 # bytes -> 1 GiB @@ -31,8 +31,8 @@ std_hex = blake3.blake3(cpu.numpy()).hexdigest() print("std BLAKE3 Expected: ", std_hex) - std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() - print("std BLAKE3 1KB: ", std_hex_1KB) + # std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() + # print("std BLAKE3 1KB: ", std_hex_1KB) assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" 
print("GPU BLAKE3 result matches CPU result!") @@ -51,7 +51,6 @@ torch.cuda.synchronize() t1 = time.perf_counter() - t1 = time.perf_counter() elapsed = t1 - t0 print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds") print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s") diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu index ee5dd16..2911af3 100644 --- a/csrc/blake3_sm70_sm80.cu +++ b/csrc/blake3_sm70_sm80.cu @@ -1,5 +1,5 @@ -#include "cute/numeric/int.hpp" +#include #include #include #include @@ -29,6 +29,17 @@ } \ } while(0) +#define G(a,b,c,d, x, y) \ + do { \ + (a) = (a) + (b) + (x); \ + (d) = rotr32((d) ^ (a),16); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c),12); \ + (a) = (a) + (b) + (y); \ + (d) = rotr32((d) ^ (a), 8); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c), 7); \ + } while (0) using namespace cute; @@ -106,42 +117,70 @@ __device__ void print_cv(uint32_t cv[8]) { } } +__constant__ __device__ int B3_PERMUTE[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8 +}; + +// the actually right compress 7r in a single lane function __host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B -> shared memory const uint32_t cv[8], // 8×u32 -> shared memory uint64_t chunk_counter, // 64-bit uint32_t block_len, // [0..64] uint32_t flags, // CHUNK_START/END/PARENT/ROOT… - uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) + uint32_t out_state[16]) // output { - // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 - // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[i] = cv[i]; -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[8+i] = BLAKE3_IV[i]; + // 1) initialize v + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; - out_state[12] ^= (uint32_t)chunk_counter; - out_state[13] ^= (uint32_t)(chunk_counter >> 32); - out_state[14] ^= block_len; - out_state[15] ^= flags; + uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3]; + uint32_t v12=BLAKE3_IV[4], v13=BLAKE3_IV[5], v14=BLAKE3_IV[6], v15=BLAKE3_IV[7]; - // so far, the block_words are still pointers. 
- // now we load it into kernel, as pointed out by ncu profile - uint32_t block_reg_1[4]; + // injection + v12 ^= (uint32_t)chunk_counter; + v13 ^= (uint32_t)(chunk_counter >> 32); + v14 ^= block_len; + v15 ^= flags; -#pragma unroll - for (int i = 0; i < 16; i += 4) { // the gap is 4 - // load_u128_u32x4(block_words + i, block_reg_1); - out_state[i] ^= block_words[i]; - // 做一点点搅动(占位) - out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + // 2) 7 轮 + int perm[16]; // 每轮的消息索引 + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = i; + + #pragma unroll + for (int r=0; r < 7; ++r) { + // col-step + G(v0, v4, v8, v12, block_words[perm[0]], block_words[perm[1]]); + G(v1, v5, v9, v13, block_words[perm[2]], block_words[perm[3]]); + G(v2, v6, v10,v14, block_words[perm[4]], block_words[perm[5]]); + G(v3, v7, v11,v15, block_words[perm[6]], block_words[perm[7]]); + + // diag-step + G(v0, v5, v10,v15, block_words[perm[8]], block_words[perm[9]]); + G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]); + G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]); + G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]); + + // perm = perm ∘ PERMUTE + int np[16]; + #pragma unroll + for (int i = 0; i < 16; ++i) + np[i] = perm[B3_PERMUTE[i]]; + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = np[i]; } + + // 3) write to out state + out_state[ 0]=v0; out_state[ 1]=v1; out_state[ 2]=v2; out_state[ 3]=v3; + out_state[ 4]=v4; out_state[ 5]=v5; out_state[ 6]=v6; out_state[ 7]=v7; + out_state[ 8]=v8; out_state[ 9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15; } -// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +// from out_state yields CV __host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ #pragma unroll for (int i = 0; i < 8; ++i) @@ -429,7 +468,7 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 msg[8+i] = R[i]; } uint32_t st[16]; - blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st); blake3_state_to_cv(st, out_cv); } @@ -759,7 +798,8 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } #if defined(DBG_KERNEL) && DBG_KERNEL - printf("================================ Finishing all in big kernel! ================================\n"); + if (blockIdx.x == 0 && threadIdx.x == 0) + printf("================================ Finishing all in big kernel! 
================================\n");
 #endif
 
 } // blake3_block_reduce_kernel
@@ -778,66 +818,72 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
 }
 
 // ============ Tiny kernel ============
-// In the big kernel, each block consumes 64 KiB
-// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 roots = 16384 roots
-// And this tiny kernel is designed to process these 16384 roots
-template <int NUM_THREADS, int TILE_CVS, int PAD>
+// In the big kernel, each block consumes 64 or 32 KiB [CHUNKS_PER_BLOCK].
+// For a 1 GiB corpus it produces 1 x 1024 x 1024 / CHUNKS_PER_BLOCK = 16384 or 32768 roots,
+// and this tiny kernel is designed to process those 16384 or 32768 roots.
+// One chain value takes u32 x 8 = 32 B, so
+// 32 KB of shared memory holds 1K chain values and
+// 16 KB of shmem holds 512 chain values.
+template <int NUM_THREADS, int TILE_CVS, int PAD>
 __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32,
                                               uint32_t* __restrict__ out_cv32,
                                               int N)
 {
-    extern __shared__ __align__(16) uint32_t smem[]; // dynamic SMEM; needs >= TILE_CVS*8*4 bytes
-    // viewed as 2D: [TILE_CVS][8+PAD]
+    extern __shared__ __align__(16) uint32_t smem[]; // dyn SMEM >= TILE_CVS*32B
+    // regarded as a 2D dyn shmem: [TILE_CVS][8+PAD]
     uint32_t* cv_tile = smem;
 
     const int tid = threadIdx.x;
-    const int warp_id = tid / WARP_SIZE; // 0..15
+    constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+    const int warp_id = tid / WARP_SIZE; // 0..NUM_WARPS-1 (8 or 16 warps)
     const int lane_id = tid % WARP_SIZE; // 0..31
 
-    // start of the slice this block is responsible for
+    // the start of this block;
+    // each block processes TILE_CVS roots.
     const int tile_start = blockIdx.x * TILE_CVS;
     if (tile_start >= N) return;
-    // when N == 8, this is just 8
-    const int tile_n = min(TILE_CVS, N - tile_start); // actual CV count of this slice (<=2048)
+    const int tile_n = min(TILE_CVS, N - tile_start); // actual CV count of this block
 
-    // ---------------- Stage 1: coalesced loading into SMEM ----------------
-    // each thread copies several CVs: i = tid, tid+blockDim, ...
+    // ---------------- Stage 1: coalesced loading to SMEM ----------------
+    // 8 KB per pass, [tile_n/NUM_THREADS] passes in total.
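+    // (Illustrative walk-through, assuming the NUM_THREADS = 256,
+    //  TILE_CVS = 1024, PAD = 0 instantiation used by the host launcher
+    //  below; not part of the original patch.)
+    // Thread `tid` copies CVs i = tid, tid+256, tid+512, tid+768 (4 x 32 B).
+    // Lane i's two uint4 reads sit at byte offsets 32*i and 32*i + 16, so a
+    // warp's g4[0] and g4[1] accesses interleave to cover one contiguous
+    // 1 KiB span of in_cv32, i.e. fully utilized 128 B sectors.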
+    for (int i = tid; i < tile_n; i += NUM_THREADS) { // note: i starts at tid, not 0
         const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8;
         uint32_t* s = cv_tile + (size_t)i * (8 + PAD);
 
-        // two 16 B loads
+        // 16 B x 2
         const uint4* g4 = reinterpret_cast<const uint4*>(g);
         uint4* s4 = reinterpret_cast<uint4*>(s);
         // s4[0] = g4[0];
         // s4[1] = g4[1];
         // in case the address is not aligned
-        uint4 v0 = g4[0];
-        uint4 v1 = g4[1];
+        uint4 v0 = g4[0]; // each thread loads 16 B, 256 x 16 = 4 KB
+        uint4 v1 = g4[1]; // each thread loads 16 B, 256 x 16 = 4 KB
         s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w;
         s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w;
     }
-    // no need to zero the tail when tile_n < TILE_CVS; later stages respect the valid range
     __syncthreads();
 
-    // ---------------- Stage 2: per-thread 4→1 (keep adjacent pairing) ----------------
-    // there are reduced_n0 = ceil(tile_n / 4) lane-roots in total
-    const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // equivalent to (tile_n+3)/4
-    uint32_t lane_cv[8]; // the lane-root this thread outputs
+    // ---------------- Stage 2: each thread merges 4 → 1 (keeping neighbor order) ----------------
+    // reduced_n0 = ceil(tile_n / 4) lane-roots
+    const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // == (tile_n + 3) / 4
+    uint32_t lane_cv[8]; // this thread's 4->1 root
     bool lane_valid = false;
 
-    // start index of this thread's 4 inputs
+    // start index
     int base4 = tid << 2; // tid*4
     if (base4 < tile_n) {
-        // read up to 4 adjacent CVs: idx = base4 + 0,1,2,3
+        // up to 4 neighboring CVs: idx = base4 + 0,1,2,3
         uint32_t a[8], b[8], c[8], d[8];
+
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
         load_cv_g2r(s0, a);
 
         int remain = tile_n - base4;
+        // Yazhu: the remain branches below add extra branch-prediction overhead
+
         if (remain >= 2) {
             const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
             load_cv_g2r(s1, b);
@@ -851,16 +897,16 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
             load_cv_g2r(s3, d);
         }
 
-        // two levels of adjacent pairing (the odd one is promoted)
+        // merge the neighbors
         if (remain == 1) {
             #pragma unroll
             for (int j = 0; j < 8; ++j)
                 lane_cv[j] = a[j];
         } else if (remain == 2) {
-            blake3_parent_cv(a, b, lane_cv);
+            blake3_parent_cv(a, b, lane_cv); // write to lane_cv directly
         } else if (remain == 3) {
             uint32_t p01[8];
-            blake3_parent_cv(a, b, p01);
+            blake3_parent_cv(a, b, p01); // one buffer
             blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv
         } else { // remain >= 4
             uint32_t p01[8], p23[8];
@@ -871,28 +917,25 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         lane_valid = true;
     }
 
-    // ---------------- Stage 3: 32→1 adjacent pairing within a warp ----------------
-    // each warp owns one contiguous segment: warp_base = warp_id*32
+    // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ----------------
     const int warp_base = warp_id * WARP_SIZE;
-    const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // valid count in this warp's segment
+    const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number
 
-    // keep lane_cv in registers for the reduction; mask out invalid lanes
-    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // existence check only
+    // this will introduce extra branch-prediction overhead
+    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4));
     int cur_n = cur_n_w;
 
-    // mark out-of-segment threads invalid (avoids out-of-bounds reads)
     bool active_lane = (lane_id < cur_n_w);
-    // zero the values of invalid lanes (they are never used)
     if (!active_lane) {
         #pragma unroll
         for (int j = 0; j < 8; ++j)
             lane_cv[j] = 0u;
     }
 
-    // pair level by level: 1,2,4,8,16 - warp-reduce
+    // step = 1,2,4,8,16 - warp-reduce
     for (int step = 1; step < WARP_SIZE; step <<= 1) {
-        // fetch the right neighbor
+        // right-neighbor
         uint32_t nbr[8];
         #pragma unroll
         for (int j = 0; j < 8; ++j)
@@ -911,43 +954,62 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         // __syncwarp();
     }
 
-    // this stage's result sits in lane0; write the 16 warp-roots into the first 16 SMEM rows
-    __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8
+    // lane0: the NUM_WARPS warp-roots are written to SMEM
+    // e.g. if this block has 8 warps, there will be 8 warp-roots
+    __shared__ uint32_t warp_roots[NUM_WARPS][8]; // NUM_WARPS × 8
     if (lane_id == 0 && cur_n_w > 0) {
         #pragma unroll
-        for (int j=0;j<8;++j)
+        for (int j = 0; j < 8; ++j)
             warp_roots[warp_id][j] = lane_cv[j];
     }
     __syncthreads();
 
-    // ---------------- Stage 4: 16→1 adjacent pairing within the CTA ----------------
-    // valid warp count: ceil(reduced_n0 / 32)
-    int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16
+    // ---------------- Stage 4: CTA's NUM_WARPS → 1 block reduce ----------------
+    int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..NUM_WARPS
     if (valid_warps == 0) return;
-    // lane 0 of each warp does the computation
-    // lane0 computes while the other lanes idle
-    for (int stride = (valid_warps >> 1); stride >= 1; stride >>= 1) {
-        if (warp_id < stride && lane_id == 0) {
-            uint32_t p[8];
-            blake3_parent_cv(&warp_roots[2*warp_id][0],
-                             &warp_roots[2*warp_id + 1][0], p);
-            #pragma unroll
-            for (int j = 0; j < 8; ++j)
-                warp_roots[warp_id][j] = p[j];
+
+    // 16 lanes compute each merge together
+    const int sub = lane_id >> 4;  // 0/1
+    const int li  = lane_id & 15;  // 0..15
+    const unsigned full    = __activemask();
+    const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full;
+    cur_n = NUM_WARPS;
+
+    while (cur_n > 1) {
+        const int pairs   = cur_n >> 1;        // pair count at this level
+        const int half_id = warp_id * 2 + sub; // half-warp index
+
+        if (half_id < pairs) {
+            const int left_idx  = (half_id << 1);
+            const int right_idx = left_idx + 1;
+
+            const uint32_t* L = &warp_roots[left_idx][0];
+            const uint32_t* R = &warp_roots[right_idx][0];
+
+            blake3_parent_cv_simd16(L, R, &warp_roots[left_idx][0], submask);
         }
+
         __syncthreads();
-        // the odd one is promoted
-        if ((valid_warps & 1) && warp_id==0 && lane_id==0) {
-            #pragma unroll
-            for (int j=0;j<8;++j)
-                warp_roots[stride][j] = warp_roots[valid_warps-1][j];
+
+        // odd count: promote the last root
+        if (cur_n & 1) {
+            if (warp_id == 0 && sub == 0 && li < 8) {
+                warp_roots[pairs][li] = warp_roots[cur_n - 1][li];
+            }
         }
-        __syncthreads();
-        valid_warps = (valid_warps + 1) >> 1;
+
+        __syncthreads(); // next level
+        cur_n = pairs + (cur_n & 1);
     }
+#if defined(DBG_KERNEL) && DBG_KERNEL
+    if (tid == 0 && blockIdx.x == 0) {
+        printf("Block %d root CV for tiny kernel:", blockIdx.x);
+        print_cv(warp_roots[0]);
    }
+#endif
 
-    // write back this block's root
+    // ---------------- Stage 5: write to output ----------------
     if (threadIdx.x == 0) {
         uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8;
         #pragma unroll
@@ -1000,17 +1062,6 @@
     int optin = 0, deflt = 0;
     cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
     cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
-
-    const int dyn_smem = 64 * 1024; // 64 KiB
-
-    // raise the kernels' dynamic-shmem caps ahead of launch
-    CUDA_CHECK(cudaFuncSetAttribute(
-        blake3_cv_block_reduce_kernel<512, 2048, 0>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
-    CUDA_CHECK(cudaFuncSetAttribute(
-        blake3_cv_block_reduce_kernel<32, 2048, 0>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
-
     constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512
     constexpr int CHUNK_SIZE = 1024;     // 1 KiB
@@ -1074,19 +1125,26 @@
     }
 
     // the first round of tiny kernel
-    // 1) 16384 outputs reduce -> 8
+    // 1) 16384 or 32768 outputs reduce -> 16 or 32
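+    // (Worked example, illustrative: with CHUNKS_PER_BLOCK = 32 on a 1 GiB
+    //  input there are 1 Mi chunks / 32 = 32768 block CVs; round 1 reduces
+    //  them by TILE = 1024 to 32768/1024 = 32 mid CVs, and round 2 folds
+    //  those 32 down to the single root CV.)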
+    uint32_t* d_mid_out = nullptr; // grid × 8 u32
     {
-        const int N = 16384; // total number
-        const int TILE = 2048;
-        const int grid = (N + TILE - 1) / TILE; // = 8
-        const int block = 512;
-        const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB
-
-        cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t));
-
-        blake3_cv_block_reduce_kernel<512, 2048, 0>
-            <<<grid, block, smem_bytes, stream>>>(d_blockCV /*in: 16384 × 8 u32*/,
+        const int dyn_smem = 32 * 1024; // 32 KiB
+        const int N = num_blocks; // total root-cv count
+        constexpr int TILE = 1024;
+        const int grid = (N + TILE - 1) / TILE; // 16384/1024 = 16, or 32768/1024 = 32
+        constexpr int NUM_THREADS = 256;
+        const size_t smem_bytes = (size_t)(TILE) * (8u + pad_cv) * sizeof(uint32_t); // 1024 x 32 B = 32 KiB
+
+        // raise the kernel's dynamic-shmem cap before launch
+        CUDA_CHECK(cudaFuncSetAttribute(
+            blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, pad_cv>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+
+        cudaMalloc(&d_mid_out, (size_t)grid * 8u * sizeof(uint32_t));
+
+        blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, pad_cv>
+            <<<grid, NUM_THREADS, smem_bytes, stream>>>(d_blockCV /*in: num_blocks × 8 u32*/,
                                                   d_mid_out /*out: grid × 8 u32*/, N);
         CUDA_CHECK(cudaGetLastError());
         CUDA_CHECK(cudaDeviceSynchronize());
@@ -1095,16 +1153,22 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
     // second round
     uint32_t* d_root_cv = nullptr;
     {
-        const int N = 8;
-        const int TILE = 2048; // anything >= N works
-        const int grid = 1;
-        const int block = 32; // 32 threads are enough
+        const int dyn_smem = 1024; // 1 KiB is enough
+        constexpr int TILE = 1024; // any value >= N works
+        const int N = (num_blocks + TILE - 1) / TILE; // 16 or 32
+        constexpr int grid = 1;
+        constexpr int NUM_THREADS = 32; // 32 threads are enough
         const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // N x 32 B
 
+        // raise the kernel's dynamic-shmem cap before launch
+        CUDA_CHECK(cudaFuncSetAttribute(
+            blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, 0>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+
         cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t));
 
-        blake3_cv_block_reduce_kernel<32, 2048, 0>
-            <<<grid, block, smem_bytes, stream>>>(d_mid_out /*in: 8×8*/,
+        blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, 0>
+            <<<grid, NUM_THREADS, smem_bytes, stream>>>(d_mid_out /*in: N × 8 u32*/,
                                                d_root_cv /*out: 1×8*/, N);
         CUDA_CHECK(cudaGetLastError());
         CUDA_CHECK(cudaDeviceSynchronize());

From cd2a2386314d4868b1deb1972912b901a873a118 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Wed, 8 Oct 2025 17:30:44 +0800
Subject: [PATCH 14/20] debug whole gpu kernel 1

---
 csrc/blake3_sm70_sm80.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 2911af3..81addad 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -481,7 +481,7 @@ template 32 KB const int PAD_CHUNK=16, const int PAD_CV=0> // pad shared memory
-__global__ void blake3_block_reduce_kernel(uint32_t* d_input,
+__global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
                                            uint32_t* block_cvs,
                                            int chunk_len_bytes,
                                            uint64_t base_chunk_counter,
@@ -596,6 +596,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
                 *reinterpret_cast<uint128_t*>(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0);
             }
         }
+        __syncwarp(); // inner-warp sync
     }
 
     __syncthreads(); // sync all warps
@@ -1080,7 +1081,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
         cudaFuncAttributePreferredSharedMemoryCarveout, 100));
 
     uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
-    uint32_t* d_words = reinterpret_cast<uint32_t*>(d_bytes); // alias
+    const uint32_t* d_words = reinterpret_cast<const uint32_t*>(d_bytes); // alias
 
     uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32
     // here we cut the largest bottleneck, do not allocate gpu memory here, do it in
pytorch. From 5d617e044dbf090e4067ad2daea79663da88a456 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Wed, 8 Oct 2025 23:07:19 +0800 Subject: [PATCH 15/20] reconstruct proj --- benchmark/test_gpu.py | 5 +- csrc/blake3_sm70_sm80.cu | 435 +-------------------------------------- csrc/utils.cuh | 433 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 440 insertions(+), 433 deletions(-) create mode 100644 csrc/utils.cuh diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index f0696b2..50a7422 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -41,13 +41,16 @@ # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 for _ in range(2): fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + torch.cuda.synchronize() + print("\n") torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) - repeat = 5 # 1GiB × 5 已经很重,按机器调整 + repeat = 2 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + print("\n") torch.cuda.synchronize() t1 = time.perf_counter() diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu index 81addad..9365f7d 100644 --- a/csrc/blake3_sm70_sm80.cu +++ b/csrc/blake3_sm70_sm80.cu @@ -17,6 +17,8 @@ #include #include +#include "utils.cuh" + #define WARP_SIZE 32 #define LDST128BITS(value) (reinterpret_cast(&(value))[0]) @@ -29,17 +31,7 @@ } \ } while(0) -#define G(a,b,c,d, x, y) \ - do { \ - (a) = (a) + (b) + (x); \ - (d) = rotr32((d) ^ (a),16); \ - (c) = (c) + (d); \ - (b) = rotr32((b) ^ (c),12); \ - (a) = (a) + (b) + (y); \ - (d) = rotr32((d) ^ (a), 8); \ - (c) = (c) + (d); \ - (b) = rotr32((b) ^ (c), 7); \ - } while (0) + using namespace cute; @@ -53,426 +45,6 @@ using vec_t = cute::uint128_t; // one time loading 16 B using Atom = cute::Copy_Atom, vec_t>; #endif -__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { - 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, - 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u -}; - -enum : uint32_t { - FLAG_CHUNK_START = 1u << 0, - FLAG_CHUNK_END = 1u << 1, - FLAG_PARENT = 1u << 2, - FLAG_ROOT = 1u << 3, - FLAG_KEYED_HASH = 1u << 4, - FLAG_DERIVE_KEY_CONTEXT = 1u << 5, - FLAG_DERIVE_KEY_MATERIAL= 1u << 6, -}; - -__device__ __noinline__ -uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { - uint32_t f = 0; - f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; - f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; - if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root - return f; -} - -__device__ __forceinline__ -uint32_t blake3_parent_flags(bool is_root_parent) { - return FLAG_PARENT | (is_root_parent ? 
FLAG_ROOT : 0); -} - -// ---- 小工具 ---- -__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { -#if defined(__CUDA_ARCH__) - return __funnelshift_r(x, x, n); -#else - return (x >> n) | (x << (32 - n)); // host 路径 -#endif -} - -__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { -#if defined(__CUDA_ARCH__) - const uint4 v = *reinterpret_cast(src); - dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; -#else - std::memcpy(dst, src, 16); -#endif -} - -__device__ void print_cv(uint32_t cv[8]) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - auto get_byte = [&](int i) { - int w = i >> 2; // 第 i 个字节来自第 w 个 u32 - int off = (i & 3) * 8; // 在该 u32 中的偏移 - return (unsigned)((cv[w] >> off) & 0xFFu); - }; - - printf("block %d root CV (u32, little-endian words):", blockIdx.x); - for (int i = 0; i < 32; ++i) { - printf("%02x", get_byte(i)); - if ((i & 3) == 3) printf(" "); // 每 4 字节空格 - } - printf("\n"); - } -} - -__constant__ __device__ int B3_PERMUTE[16] = { - 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8 -}; - -// the actually right compress 7r in a single lane function -__host__ __device__ void blake3_compress_words_7r( - const uint32_t block_words[16], // 64B -> shared memory - const uint32_t cv[8], // 8×u32 -> shared memory - uint64_t chunk_counter, // 64-bit - uint32_t block_len, // [0..64] - uint32_t flags, // CHUNK_START/END/PARENT/ROOT… - uint32_t out_state[16]) // output -{ - // 1) initialize v - uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; - uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; - - uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3]; - uint32_t v12=BLAKE3_IV[4], v13=BLAKE3_IV[5], v14=BLAKE3_IV[6], v15=BLAKE3_IV[7]; - - // injection - v12 ^= (uint32_t)chunk_counter; - v13 ^= (uint32_t)(chunk_counter >> 32); - v14 ^= block_len; - v15 ^= flags; - - // 2) 7 轮 - int perm[16]; // 每轮的消息索引 - #pragma unroll - for (int i = 0; i < 16; ++i) - perm[i] = i; - - #pragma unroll - for (int r=0; r < 7; ++r) { - // col-step - G(v0, v4, v8, v12, block_words[perm[0]], block_words[perm[1]]); - G(v1, v5, v9, v13, block_words[perm[2]], block_words[perm[3]]); - G(v2, v6, v10,v14, block_words[perm[4]], block_words[perm[5]]); - G(v3, v7, v11,v15, block_words[perm[6]], block_words[perm[7]]); - - // diag-step - G(v0, v5, v10,v15, block_words[perm[8]], block_words[perm[9]]); - G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]); - G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]); - G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]); - - // perm = perm ∘ PERMUTE - int np[16]; - #pragma unroll - for (int i = 0; i < 16; ++i) - np[i] = perm[B3_PERMUTE[i]]; - #pragma unroll - for (int i = 0; i < 16; ++i) - perm[i] = np[i]; - } - - // 3) write to out state - out_state[ 0]=v0; out_state[ 1]=v1; out_state[ 2]=v2; out_state[ 3]=v3; - out_state[ 4]=v4; out_state[ 5]=v5; out_state[ 6]=v6; out_state[ 7]=v7; - out_state[ 8]=v8; out_state[ 9]=v9; out_state[10]=v10; out_state[11]=v11; - out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15; -} - -// from out_state yields CV -__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ -#pragma unroll - for (int i = 0; i < 8; ++i) - out_cv[i] = st[i] ^ st[8+i]; -} - -// swap-table -// BLAKE3 message schedule: rows are P^r, r=0..6. -// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. 
(see §2.2) -// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf -__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { - // r = 0: identity - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - // r = 1: P - { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, - // r = 2: P∘P - { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, - // r = 3 - { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, - // r = 4 - { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, - // r = 5 - { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, - // r = 6 - { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, -}; - -// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k -__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { - int src = B3_MSG_SCHEDULE[round][k]; - return __shfl_sync(mask16, m_lane, src, 16); -} - -__device__ __noinline__ -uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, - uint32_t mx, uint32_t my, int role) -{ - // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 - uint32_t a = v_self, b = v_b, c = v_c, d = v_d; - - // a = a + b + mx; - // d ^= a; - // d >>>= 16 - a = a + b + mx; - d ^= a; - d = rotr32(d, 16); - - // c = c + d; - // b ^= c; - // b >>>= 12 - c = c + d; - b ^= c; - b = rotr32(b, 12); - - // a = a + b + my; - // d ^= a; - // d >>>= 8 - a = a + b + my; - d ^= a; - d = rotr32(d, 8); - - // c = c + d; - // b ^= c; - // b >>>= 7 - c = c + d; - b ^= c; - b = rotr32(b, 7); - - // role choice: - switch (role) { - case 0: return a; - case 1: return b; - case 2: return c; - default: return d; - } -} - -// notice that, this function will proceed 2 chunks, each time. -// - chunk_words_row: current chunk -// - out_cv: written by lane 0, or lane 16 -__device__ __noinline__ -void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory - // so the chunks_row += 2 as gap - int chunk_len_bytes, - uint64_t chunk_counter, - uint32_t out_cv[8], - unsigned mask16) { - // computing index - int lane = threadIdx.x & 31; // lane_id: 0-31 - int sub = lane >> 4; // 0/1 - int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 - int role = li & 3; // a/b/c/d role - int base = (sub << 4); // 0 or 16 the absolute base - - const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) - - int warp_id = threadIdx.x / WARP_SIZE; - - // initialize - uint32_t cv_word = 0; - if (li < 8) cv_word = BLAKE3_IV[li]; - - // process all blocks - // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks - // each block has 64B -> 16 x u32 - for (int b = 0; b < nblocks; ++b) { - // each lane holds one u32, - // 16 lane will hold 16 x 4 = 64 B -> it's block - // the another 16 lane will hold opposite 64 B - const uint32_t m_lane = chunk_words_row[b * 16 + li]; - - // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags - // 先把“自己的那个索引”的初值准备好: - uint32_t v = (li < 8) - ? cv_word // v[i](i<8) - : BLAKE3_IV[li - 8]; // v[8..15] ← IV - - // 计数器/长度/标志(按 BLAKE3 规范) - const uint32_t t0 = (uint32_t)chunk_counter; - const uint32_t t1 = (uint32_t)(chunk_counter >> 32); - const int remain = chunk_len_bytes - (b << 6); - const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; - - const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); - - // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) - v ^= (li == 12) ? t0 : 0u; - v ^= (li == 13) ? 
t1 : 0u; - v ^= (li == 14) ? block_len: 0u; - v ^= (li == 15) ? flags : 0u; - - // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) - int q = (li & 3); - int rq = (li >> 2); - int li_diag = (rq << 2) | ((q + rq) & 3); - int li_undo = (rq << 2) | ((q - rq) & 3); - int gi_col = q; // 0..3 - int gi_diag = (li_diag & 3); // 0..3 - - // ===== 7 rounds ===== - #pragma unroll 4 - for (int r = 0; r < 7; ++r) { - // inside this loop, each lane will do one job - // 16 lane will execute 16 x 2 operations - // in sequential-programming, will do 8 operation - - // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- - { - // 取同 quartet 的 b/c/d(基于当前 v) - uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); - - // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) - - uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); - - v = G_update_role(v, vb, vc, vd, mx, my, role); - } - - // ---- 对角步 ---- - { - // 在“对角置换域”取到当前 v 值 - uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); - - // 在该域内做“列步”同样的四邻取值 - uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); - - // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) - - uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); - - uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); - - // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) - - // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); - v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); - } - } // 7 rounds end - - // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) - uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); - if (li < 8) { - cv_word = v ^ vip8_all; - } - - // 下一块继续(本函数内 16 个 block 串行) - } - - // 由 lane0 / lane16 收集 8×u32 输出 - #pragma unroll - for (int j = 0; j < 8; ++j) { - uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 - if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 - } -} - -__device__ __noinline__ -void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32 - const uint32_t* __restrict__ R, // 8×u32 - uint32_t* __restrict__ out_cv, // 8×u32 - unsigned mask16) // half-warp masks for 16 lanes -{ - const int lane = threadIdx.x & 31; - const int li = lane & 15; // 0..15 half the warp - const int role = li & 3; - - // messages: the front 8 from L, and the latter 8 from R - const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8]; - - // v initialize - uint32_t v = (li < 8) ? BLAKE3_IV[li] : BLAKE3_IV[li - 8]; - - const uint32_t t0 = 0u; - const uint32_t t1 = 0u; - const uint32_t block_len = 64u; - const uint32_t flags = FLAG_PARENT; - - v ^= (li == 12) ? t0 : 0u; - v ^= (li == 13) ? t1 : 0u; - v ^= (li == 14) ? block_len : 0u; - v ^= (li == 15) ? 
flags : 0u; - - // 与 leaf 相同的“列/对角”两步、共 7 轮 - int q = (li & 3); - int rq = (li >> 2); - int li_diag = (rq << 2) | ((q + rq) & 3); - int li_undo = (rq << 2) | ((q - rq) & 3); - int gi_col = q; - int gi_diag = (li_diag & 3); - - #pragma unroll 4 - for (int r = 0; r < 7; ++r) { - // 列步 - { - uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); - - uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); - - v = G_update_role(v, vb, vc, vd, mx, my, role); - } - // 对角步 - { - uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); - uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); - - uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); - - uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); - v = __shfl_sync(mask16, v_diag_new, li_undo, 16); - } - } - - // state -> CV:cv[i] = v[i] ^ v[i+8] - uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); - uint32_t cv_word = (li < 8) ? (v ^ vip8) : 0; - - // 半 warp 汇聚到 out_cv[0..7](仅 li==0 的 4×收集也可以) - #pragma unroll - for (int j = 0; j < 8; ++j) { - uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); - if (li == 0) out_cv[j] = wj; - } -} - -__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ - uint32_t msg[16]; -#pragma unroll - for (int i = 0; i < 8; ++i) { - msg[i] = L[i]; - } -#pragma unroll - for (int i = 0; i < 8; ++i) { - msg[8+i] = R[i]; - } - uint32_t st[16]; - blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st); - blake3_state_to_cv(st, out_cv); -} - - // ============ Big kernel: 16 WARPS in total ============ // grid: (chunks / 64), thread: (512,) @@ -1023,7 +595,6 @@ inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out3 const uint32_t zero_block[16] = {0}; uint32_t st[16]; blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); - // 写出前 32 字节(state[0..7],小端) for (int i = 0; i < 8; ++i) { uint32_t w = st[i]; out32[4*i+0] = (uint8_t)( w & 0xFF); diff --git a/csrc/utils.cuh b/csrc/utils.cuh new file mode 100644 index 0000000..fabfe55 --- /dev/null +++ b/csrc/utils.cuh @@ -0,0 +1,433 @@ +#include + +#define WARP_SIZE 32 +#define G(a,b,c,d, x, y) \ + do { \ + (a) = (a) + (b) + (x); \ + (d) = rotr32((d) ^ (a),16); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c),12); \ + (a) = (a) + (b) + (y); \ + (d) = rotr32((d) ^ (a), 8); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c), 7); \ + } while (0) + + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ 
__forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__device__ void print_cv(uint32_t cv[8]) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + auto get_byte = [&](int i) { + int w = i >> 2; // 第 i 个字节来自第 w 个 u32 + int off = (i & 3) * 8; // 在该 u32 中的偏移 + return (unsigned)((cv[w] >> off) & 0xFFu); + }; + + printf("block %d root CV (u32, little-endian words):", blockIdx.x); + for (int i = 0; i < 32; ++i) { + printf("%02x", get_byte(i)); + if ((i & 3) == 3) printf(" "); // 每 4 字节空格 + } + printf("\n"); + } +} + +__constant__ __device__ int B3_PERMUTE[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8 +}; + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + + +// =============== Leaf 16-lane compressing =============== +// notice that, this function will proceed 2 chunks, each time. 
+// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + uint32_t v = + (li < 8) ? cv_word : + (li < 12) ? BLAKE3_IV[(li - 8) & 3] : + (li == 12) ? t0 : + (li == 13) ? t1 : + (li == 14) ? block_len : flags; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = 
__shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +// =============== Parent 16-lane compressing =============== +__device__ __noinline__ +void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32 + const uint32_t* __restrict__ R, // 8×u32 + uint32_t* __restrict__ out_cv, // 8×u32 + unsigned mask16) // half-warp masks for 16 lanes +{ + const int lane = threadIdx.x & 31; + const int li = lane & 15; // 0..15 half the warp + const int role = li & 3; + + // messages: the front 8 from L, and the latter 8 from R + const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8]; + + // v initialize + + const uint32_t t0 = 0u; + const uint32_t t1 = 0u; + const uint32_t block_len = 64u; + const uint32_t flags = FLAG_PARENT; + + uint32_t iv_val = BLAKE3_IV[li & 7]; + + uint32_t v = + (li < 12) ? iv_val : + (li == 12) ? t0 : + (li == 13) ? t1 : + (li == 14) ? block_len : flags; + + // 与 leaf 相同的“列/对角”两步、共 7 轮 + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; + int gi_diag = (li_diag & 3); + + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // 列步 + { + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + // 对角步 + { + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + v = __shfl_sync(mask16, v_diag_new, li_undo, 16); + } + } + + // state -> CV:cv[i] = v[i] ^ v[i+8] + uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); + uint32_t cv_word = (li < 8) ? 
+
+// =============== Parent 1-lane compressing ===============
+// the reference (known-correct) 7-round compression, executed in a single lane
+__host__ __device__ void blake3_compress_words_7r(
+    const uint32_t block_words[16], // 64 B -> shared memory
+    const uint32_t cv[8],           // 8×u32 -> shared memory
+    uint64_t chunk_counter,         // 64-bit
+    uint32_t block_len,             // [0..64]
+    uint32_t flags,                 // CHUNK_START/END/PARENT/ROOT…
+    uint32_t out_state[16])         // output
+{
+    // 1) initialize v
+    uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3];
+    uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7];
+
+    uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3];
+
+    // injection
+    uint32_t v12=(uint32_t)chunk_counter, v13=(uint32_t)(chunk_counter >> 32), v14=block_len, v15=flags;
+
+    // 2) 7 rounds
+    int perm[16]; // message indices for the current round
+    #pragma unroll
+    for (int i = 0; i < 16; ++i)
+        perm[i] = i;
+
+    #pragma unroll
+    for (int r=0; r < 7; ++r) {
+        // col-step
+        G(v0, v4, v8, v12, block_words[perm[0]],  block_words[perm[1]]);
+        G(v1, v5, v9, v13, block_words[perm[2]],  block_words[perm[3]]);
+        G(v2, v6, v10,v14, block_words[perm[4]],  block_words[perm[5]]);
+        G(v3, v7, v11,v15, block_words[perm[6]],  block_words[perm[7]]);
+
+        // diag-step
+        G(v0, v5, v10,v15, block_words[perm[8]],  block_words[perm[9]]);
+        G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]);
+        G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]);
+        G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]);
+
+        // perm = perm ∘ PERMUTE
+        int np[16];
+        #pragma unroll
+        for (int i = 0; i < 16; ++i)
+            np[i] = perm[B3_PERMUTE[i]];
+        #pragma unroll
+        for (int i = 0; i < 16; ++i)
+            perm[i] = np[i];
+    }
+
+    // 3) write the state out
+    out_state[ 0]=v0;  out_state[ 1]=v1;  out_state[ 2]=v2;  out_state[ 3]=v3;
+    out_state[ 4]=v4;  out_state[ 5]=v5;  out_state[ 6]=v6;  out_state[ 7]=v7;
+    out_state[ 8]=v8;  out_state[ 9]=v9;  out_state[10]=v10; out_state[11]=v11;
+    out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15;
+}
+
+// derive the CV from out_state
+__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_cv[i] = st[i] ^ st[8+i];
+}
+
+__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){
+    uint32_t msg[16];
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[i] = L[i];
+    }
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[8+i] = R[i];
+    }
+    uint32_t st[16];
+    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st);
+    blake3_state_to_cv(st, out_cv);
+}
\ No newline at end of file
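Before the debugging patches below, it is worth seeing how these single-lane primitives compose into one leaf CV. A host-side sketch (hypothetical helper, not from the repo; the CHUNK_START=1 / CHUNK_END=2 flag bits are the BLAKE3 spec values, and it uses the host-readable IV table that PATCH 18 later introduces as BLAKE3_IV_HOST, since reading a __constant__ symbol from host code is not valid):

    // leaf_cv_reference: scalar chunk hashing with the primitives above,
    // handy for cross-checking the 16-lane kernel chunk by chunk
    static void leaf_cv_reference(const uint32_t chunk_words[256], // 1024 B as LE words
                                  uint64_t chunk_counter,
                                  uint32_t out_cv[8]) {
        const uint32_t CHUNK_START = 1u, CHUNK_END = 2u;  // spec flag bits
        uint32_t cv[8], st[16];
        for (int i = 0; i < 8; ++i) cv[i] = BLAKE3_IV_HOST[i];
        for (int b = 0; b < 16; ++b) {                    // 16 × 64 B blocks, serial
            uint32_t flags = (b == 0 ? CHUNK_START : 0u) | (b == 15 ? CHUNK_END : 0u);
            blake3_compress_words_7r(&chunk_words[b * 16], cv,
                                     chunk_counter, 64u, flags, st);
            blake3_state_to_cv(st, cv);                   // chain into the next block
        }
        for (int i = 0; i < 8; ++i) out_cv[i] = cv[i];
    }

Note this produces a non-root chunk CV; a real digest additionally needs the ROOT flag OR'd into the final compression, which the kernels in the following patches are still chasing.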
From 8642165c516d5e4d8ab1651294a00d2d7e27ce2e Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Thu, 9 Oct 2025 14:57:07 +0800
Subject: [PATCH 16/20] debug tiny kernel

---
 benchmark/test_gpu.py    |  8 ++--
 csrc/blake3_sm70_sm80.cu | 89 +++++++++++++++++++++++++++-------------
 csrc/utils.cuh           |  2 +-
 3 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 50a7422..6e10d6d 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -42,15 +42,17 @@
     for _ in range(2):
         fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
         torch.cuda.synchronize()
-        print("\n")
+        print("1\n")
     torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 2  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    cv_hex = None
     t0 = time.perf_counter()
     for _ in range(repeat):
         cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-        print("\n")
+        torch.cuda.synchronize()
+        print("2\n")
     torch.cuda.synchronize()
     t1 = time.perf_counter()
 
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 9365f7d..0fc397e 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -231,7 +231,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
     // chunk_local identifies the left or right chunk, so do not worry.
     const uint32_t* row = valid_sub ? &chunk_smem[chunk_local][0] : &chunk_smem[0][0];
-    uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr;
+    uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr; // FIXME: something is wrong here!!!
 
 #if defined(DBG_KERNEL) && DBG_KERNEL
     if (!valid_sub) {
@@ -245,7 +245,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
                                  out,
                                  submask);
 
-    __syncwarp(full);   // make sure both half-warps have written into `tmp_cv`
+    __syncwarp();   // make sure both half-warps have written into `tmp_cv`
 
 // #if defined(DBG_KERNEL) && DBG_KERNEL
 //     if (blockIdx.x == 0 && threadIdx.x == 0) printf("The simd16-lane res: \n");
@@ -281,7 +281,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
         print_cv(cv_smem[pair_idx]);
 #endif
 
-        __syncwarp(full);   // NOTICE: this is necessary!
+        __syncwarp();   // NOTICE: this is necessary!
     };  // do_big_pass
 
     // big-pass 1: computing [CHUNK_PER_BLOCK / 2] chunks
@@ -390,6 +390,20 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
 }
 
+__device__ __forceinline__
+const void* to_smem_ptr(const void* p){
+    return (const void*)__cvta_generic_to_shared(p);
+}
+
+__device__ __forceinline__
+void load_cv_s2r_vec(const uint32_t* __restrict__ s, uint32_t dst[8]) {
+    const uint4* s4 = reinterpret_cast<const uint4*>(to_smem_ptr(s));   // compiles to ld.shared.v4.u32
+    uint4 v0 = s4[0];
+    uint4 v1 = s4[1];
+    dst[0]=v0.x; dst[1]=v0.y; dst[2]=v0.z; dst[3]=v0.w;
+    dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w;
+}
+
 // ============ Tiny kernel ============
 // In big kernel, it will consume 64 or 32 KiB each block [CHUNKS_PER_BLOCK]
 // For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / CHUNKS_PER_BLOCK root = 16384 or 32768 roots
@@ -438,6 +452,13 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     }
     __syncthreads();
 
+#if defined(DBG_KERNEL) && DBG_KERNEL
+    if (tid == 0 && blockIdx.x == 0) {
+        printf("Block %d root CV for tiny kernel entry:", blockIdx.x);
+        print_cv(smem);
+    }
+#endif
+
     // ---------------- Stage 2: each lane merge 4 → 1 (keep the neighbor order) ----------------
     // reduced_n0 = ceil(tile_n / 4) lane-root
     const int reduced_n0 = (tile_n + 3) >> 1 >> 1;
@@ -445,13 +466,15 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     bool lane_valid = false;
 
     // start index
-    int base4 = tid << 2; // tid*4
+    int base4 = tid << 2; // tid*4, 0..8
 
     if (base4 < tile_n) {
         // 4 neighboring CVs: idx = base4 + 0,1,2,3
         uint32_t a[8], b[8], c[8], d[8];
 
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
-        load_cv_g2r(s0, a);
+        printf("The base 1: %d\n", base4);
+        // TODO: what's wrong?
+ load_cv_s2r_vec(s0, a); int remain = tile_n - base4; @@ -459,15 +482,18 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv if (remain >= 2) { const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); - load_cv_g2r(s1, b); + if (base4 >= 1020) printf("The base 2: %d\n", base4); + load_cv_s2r_vec(s1, b); } if (remain >= 3) { const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); - load_cv_g2r(s2, c); + if (base4 >= 1020) printf("The base 3: %d\n", base4); + load_cv_s2r_vec(s2, c); } if (remain >= 4) { const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); - load_cv_g2r(s3, d); + if (base4 >= 1020) printf("The base 4: %d\n", base4); + load_cv_s2r_vec(s3, d); } // merge the neighbor @@ -490,15 +516,22 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv lane_valid = true; } +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("The lane-reduce-4 output:"); + print_cv(lane_cv); + } +#endif + // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ---------------- const int warp_base = warp_id * WARP_SIZE; const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number // this will introduce extra branch-prediction overhead - unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // This may not work int cur_n = cur_n_w; - bool active_lane = (lane_id < cur_n_w); + bool active_lane = (lane_id < cur_n); if (!active_lane) { #pragma unroll @@ -507,12 +540,12 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv } // step = 1,2,4,8,16 - warp-reduce - for (int step = 1; step < WARP_SIZE; step <<= 1) { + for (int step = 1; step < cur_n_w; step <<= 1) { // right-neighbor uint32_t nbr[8]; #pragma unroll for (int j = 0; j < 8; ++j) - nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + nbr[j] = __shfl_down_sync(mask, lane_cv[j], step); const bool do_pair = active_lane && @@ -537,6 +570,13 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv } __syncthreads(); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("The warp-reduce output:"); + print_cv(warp_roots[0]); + } +#endif + // ---------------- Stage 4: CTA's NUM_WARPS → 1 block reduce ---------------- int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..NUM_WARPS if (valid_warps == 0) return; @@ -547,11 +587,11 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv const int li = lane_id & 15; // 0..15 const unsigned full = __activemask(); const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + const int half_id = warp_id * 2 + sub; // half-warp index cur_n = NUM_WARPS; while (cur_n > 1) { const int pairs = cur_n >> 1; // the pair count. - const int half_id = warp_id * 2 + sub; // half-warp index if (half_id < pairs) { const int left_idx = (half_id << 1); @@ -589,6 +629,13 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv for (int j = 0; j < 8; ++j) out[j] = warp_roots[0][j]; } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("================================ Finishing all in tiny kernel! 
================================\n");
+    }
+#endif
+
 }
 
 inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
@@ -615,22 +662,6 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
         std::abort();
     }
 
-    // int dev = -1;
-    // cudaGetDevice(&dev);
-    // printf("[dbg] my runtime current device = %d\n", dev);
 
-    // cudaPointerAttributes attr{};
-    // auto st = cudaPointerGetAttributes(&attr, d_data);
-    // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n",
-    //        (int)st, (int)attr.type, attr.device);
 
-    // cudaPointerAttributes attr{};
-    // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data));
-    // if (attr.type != cudaMemoryTypeDevice) {
-    //     fprintf(stderr, "d_data is not device memory!\n");
-    //     std::abort();
-    // }
-
     int optin = 0, deflt = 0;
     cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
     cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
index fabfe55..3c1739b 100644
--- a/csrc/utils.cuh
+++ b/csrc/utils.cuh
@@ -273,7 +273,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
 #pragma unroll
     for (int j = 0; j < 8; ++j) {
         uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // all 16 lanes must participate
-        if (li == 0) out_cv[j] = wj;           // only lane 0 writes back
+        if (li == 0 && out_cv) out_cv[j] = wj; // only lane 0 writes back
     }
 }
 
From e8afb0664d66f10907f2e2f8c53a533d68493734 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Thu, 9 Oct 2025 15:06:47 +0800
Subject: [PATCH 17/20] finish part-debugging

---
 benchmark/test_gpu.py    |  6 +-----
 csrc/blake3_sm70_sm80.cu | 17 ++++-------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 6e10d6d..3052a80 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -41,18 +41,14 @@
 
     # 2) Warm up: trigger JIT/driver init so the first runs are not skewed
    for _ in range(2):
         fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-        torch.cuda.synchronize()
-        print("1\n")
     torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 5  # 1 GiB × 5 is already heavy; tune per machine
     cv_hex = None
     t0 = time.perf_counter()
     for _ in range(repeat):
         cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-        torch.cuda.synchronize()
-        print("2\n")
     torch.cuda.synchronize()
     t1 = time.perf_counter()
 
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 0fc397e..b782f90 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -390,16 +390,12 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
 }
 
-__device__ __forceinline__
-const void* to_smem_ptr(const void* p){
-    return (const void*)__cvta_generic_to_shared(p);
-}
-
 __device__ __forceinline__
 void load_cv_s2r_vec(const uint32_t* __restrict__ s, uint32_t dst[8]) {
-    const uint4* s4 = reinterpret_cast<const uint4*>(to_smem_ptr(s));   // compiles to ld.shared.v4.u32
-    uint4 v0 = s4[0];
-    uint4 v1 = s4[1];
+    // ensure 16 B alignment: cv_tile must be __align__(16), and the stride (8+PAD)*4 must also be a multiple of 16
+    const uint4* p = reinterpret_cast<const uint4*>(s);
+    uint4 v0 = p[0];
+    uint4 v1 = p[1];
     dst[0]=v0.x; dst[1]=v0.y; dst[2]=v0.z; dst[3]=v0.w;
     dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w;
 }
@@ -472,8 +468,6 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         uint32_t a[8], b[8], c[8], d[8];
 
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
-        printf("The base 1: %d\n", base4);
-        // TODO: what's wrong?
         load_cv_s2r_vec(s0, a);
 
         int remain = tile_n - base4;
 
@@ -482,17 +476,14 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
 
         if (remain >= 2) {
             const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
-            if (base4 >= 1020) printf("The base 2: %d\n", base4);
             load_cv_s2r_vec(s1, b);
         }
         if (remain >= 3) {
             const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD);
-            if (base4 >= 1020) printf("The base 3: %d\n", base4);
             load_cv_s2r_vec(s2, c);
         }
         if (remain >= 4) {
             const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD);
-            if (base4 >= 1020) printf("The base 4: %d\n", base4);
             load_cv_s2r_vec(s3, d);
         }
 
From 0d1485a5ac3478355998ac845660a55926fa2599 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Fri, 10 Oct 2025 17:07:21 +0800
Subject: [PATCH 18/20] debug big kernel tail problem

---
 benchmark/perf.txt       |  44 ++----------
 benchmark/test_gpu.py    |  20 ++++---
 csrc/blake3_sm70_sm80.cu | 110 ++++++++++++++++++++++++++++---------
 csrc/utils.cuh           |  52 ++++++++++++------
 4 files changed, 138 insertions(+), 88 deletions(-)

diff --git a/benchmark/perf.txt b/benchmark/perf.txt
index d48f165..0058b2e 100644
--- a/benchmark/perf.txt
+++ b/benchmark/perf.txt
@@ -1,43 +1,5 @@
 10.7
 V100: 54819.25 MiB/s
-    RTX 4090: 150585.83 MiB/s
+    RTX 4090: 150585.83 MiB/s (False)
 
-(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py
-Stage 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967
-Stage 2 - pass 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b
-Stage 2 - pass 2 finish processing, pass 1: 16
-block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096
-================================ Finishing all in big kernel! ================================
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552
-GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552
-std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b
-std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236
-Traceback (most recent call last):
-  File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in <module>
-    assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!"
-           ^^^^^^^^^^^^^^^^^
-AssertionError: GPU BLAKE3 result does not match CPU result!
-(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py
-Stage 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967
-Stage 2 - pass 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b
-Stage 2 - pass 2 finish processing, pass 1: 16
-block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096
-================================ Finishing all in big kernel! ================================
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552
-GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552
-std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b
-std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236
-Traceback (most recent call last):
-  File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in <module>
-    assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!"
-           ^^^^^^^^^^^^^^^^^
-AssertionError: GPU BLAKE3 result does not match CPU result!
\ No newline at end of file
+
+10.9, RTX4090: 67184.92 MiB/s
+10.9 A800: 20803.11 MiB/s
\ No newline at end of file
diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 3052a80..3e391e9 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -40,21 +40,29 @@
 if check_perf:
     # 2) Warm up: trigger JIT/driver init so the first runs are not skewed
     for _ in range(2):
+        d.copy_(cpu, non_blocking=True)
+        torch.cuda.synchronize()
         fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
         torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
     cv_hex = None
-    t0 = time.perf_counter()
+    total_time = 1e-8
     for _ in range(repeat):
+        d.copy_(cpu, non_blocking=True)
+        torch.cuda.synchronize()
+
+        t0 = time.perf_counter()
         cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+        t1 = time.perf_counter()
+
+        elapsed = t1 - t0
+        total_time += elapsed
         torch.cuda.synchronize()
-    t1 = time.perf_counter()
 
-    elapsed = t1 - t0
-    print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds")
-    print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s")
+    print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {total_time:.3f} seconds")
+    print(f"Throughput: {repeat * d.numel() / total_time / (1024**2):.2f} MiB/s")
     print("root CV (hex) =", cv_hex)
     # print(f"std BLAKE3 Expected: {std_hex}")
\ No newline at end of file
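The Python change above pulls the H2D copy out of the measured window. For reference, the same split expressed in plain CUDA C++ (a sketch; `blake3_kernel` and the buffer/launch names are placeholders, not symbols from this repo) — the events bracket only the GPU work, while a wall-clock window would also include the copy:

    void time_kernel_only(const uint8_t* h_buf, uint8_t* d_buf, size_t n_bytes,
                          uint32_t* d_out, cudaStream_t stream,
                          dim3 grid, dim3 block) {
        cudaEvent_t beg, end;
        cudaEventCreate(&beg);
        cudaEventCreate(&end);

        // H2D copy deliberately sits outside the event window
        cudaMemcpyAsync(d_buf, h_buf, n_bytes, cudaMemcpyHostToDevice, stream);

        cudaEventRecord(beg, stream);
        blake3_kernel<<<grid, block, 0, stream>>>(d_buf, n_bytes, d_out);  // hypothetical kernel
        cudaEventRecord(end, stream);
        cudaEventSynchronize(end);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, beg, end);
        printf("kernel-only: %.3f ms, %.2f MiB/s\n",
               ms, (n_bytes / double(1 << 20)) / (ms / 1e3));

        cudaEventDestroy(beg);
        cudaEventDestroy(end);
    }

This is also the split that the perf.txt numbers above conflate: the earlier 150 GiB/s-class figure was recorded with the copy outside the loop, the later 67 GiB/s figure with it inside.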
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index b782f90..f17e07f 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -1,4 +1,3 @@
-
 #include 
 #include 
 #include 
@@ -245,7 +244,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
                                  out,
                                  submask);
 
-    __syncwarp();   // make sure both half-warps have written into `tmp_cv`
+    __syncwarp(submask);   // make sure both half-warps have written into `tmp_cv`
@@ -362,9 +361,9 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
     // write out
     if (warp_id == 0 && lane_id == 0) {
         uint32_t* out = block_cvs + (size_t)blockIdx.x * 8;
-#if defined(DBG_KERNEL) && DBG_KERNEL
-        print_cv(cv_smem[0]);
-#endif
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+//         print_cv(cv_smem[0]);
+// #endif
         reinterpret_cast<uint4*>(out)[0] = make_uint4(cv_smem[0][0], cv_smem[0][1], cv_smem[0][2], cv_smem[0][3]);
         reinterpret_cast<uint4*>(out)[1] = make_uint4(cv_smem[0][4], cv_smem[0][5], cv_smem[0][6], cv_smem[0][7]);
     }
@@ -390,6 +399,27 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w;
 }
 
+__device__ __forceinline__
+void load_cv_s2r_vec_shared(const uint32_t* __restrict__ s, uint32_t dst[8]) {
+#if __CUDA_ARCH__ >= 700
+    // convert the generic pointer explicitly to a shared-memory address, then issue two ld.shared.v4.u32
+    unsigned smem_addr = static_cast<unsigned>(__cvta_generic_to_shared(s));
+    uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
+    asm volatile(
+        "ld.shared.v4.u32 {%0,%1,%2,%3}, [%8];\n\t"
+        "ld.shared.v4.u32 {%4,%5,%6,%7}, [%8+16];\n\t"
+        : "=r"(x0),"=r"(x1),"=r"(x2),"=r"(x3),
+          "=r"(x4),"=r"(x5),"=r"(x6),"=r"(x7)
+        : "r"(smem_addr));
+    dst[0]=x0; dst[1]=x1; dst[2]=x2; dst[3]=x3;
+    dst[4]=x4; dst[5]=x5; dst[6]=x6; dst[7]=x7;
+#else
+    // fall back to scalar loads on older architectures
+    #pragma unroll
+    for (int j=0;j<8;++j) dst[j] = s[j];
+#endif
+}
+
 // ============ Tiny kernel ============
 // In big kernel, it will consume 64 or 32 KiB each block [CHUNKS_PER_BLOCK]
 // For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / CHUNKS_PER_BLOCK root = 16384 or 32768 roots
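The `__syncwarp(submask)` change in the first hunk follows the CUDA rule that every lane named in the mask must call `__syncwarp` with that same mask; once the two 16-lane halves of a warp proceed independently, a full-warp mask is the wrong granularity. A standalone illustration (not from the patch; launch with a single 32-thread block):

    __global__ void half_warp_sync_demo(int* out) {
        const int lane = threadIdx.x & 31;
        const unsigned submask = (lane < 16) ? 0x0000FFFFu : 0xFFFF0000u;
        __shared__ int tmp[32];

        tmp[lane] = lane * lane;   // each 16-lane team fills its own half
        __syncwarp(submask);       // sync only within the team that wrote it
        out[lane] = tmp[lane ^ 1]; // read a teammate's value (lane^1 stays in-team)
    }

Because `lane ^ 1` never crosses the 16-lane boundary, the team-local sync is sufficient, and no lane waits on the other half of the warp.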
@@ -468,7 +488,7 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         uint32_t a[8], b[8], c[8], d[8];
 
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
-        load_cv_s2r_vec(s0, a);
+        load_cv_s2r_vec_shared(s0, a);
 
         int remain = tile_n - base4;
 
@@ -476,16 +496,31 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
 
         if (remain >= 2) {
             const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
-            load_cv_s2r_vec(s1, b);
+            load_cv_s2r_vec_shared(s1, b);
         }
         if (remain >= 3) {
             const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD);
-            load_cv_s2r_vec(s2, c);
+            load_cv_s2r_vec_shared(s2, c);
         }
         if (remain >= 4) {
             const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD);
-            load_cv_s2r_vec(s3, d);
+            load_cv_s2r_vec_shared(s3, d);
         }
 
+#if defined(DBG_KERNEL) && DBG_KERNEL
+    // confirmed: the loaded values are simply not the same
+    if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) {
+        for (int i = 0 ; i < 32; i++) {
+            if (i == lane_id) {
+                printf("The lane-merge input from lane: %d:\n", lane_id);
+                print_cv(a, lane_id);
+                print_cv(b, lane_id);
+                print_cv(c, lane_id);
+                print_cv(d, lane_id);
+            }
+        }
     }
+#endif
 
     // merge the neighbor
     if (remain == 1) {
@@ -508,53 +543,76 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     }
 
 #if defined(DBG_KERNEL) && DBG_KERNEL
-    if (blockIdx.x == 0 && threadIdx.x == 0) {
+    if (blockIdx.x == 0 && threadIdx.x == 4 && lane_valid) {
         printf("The lane-reduce-4 output:");
-        print_cv(lane_cv);
+        print_cv(lane_cv, 4);
     }
 #endif
 
     // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ----------------
     const int warp_base = warp_id * WARP_SIZE;
-    const int cur_n_w   = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number
+    const int n         = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number
 
     // this will introduce extra branch-prediction overhead
-    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // This may not work
-    int cur_n = cur_n_w;
+    const unsigned active_mask = __ballot_sync(0xFFFFFFFFu, lane_id < n);
 
-    bool active_lane = (lane_id < cur_n);
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// if (gridDim.x == 1 && lane_valid) {
+//     printf("The tid: %d\n", tid);
+// }
+// #endif
 
-    if (!active_lane) {
+    if (lane_id >= n) {
 #pragma unroll
         for (int j = 0; j < 8; ++j)
             lane_cv[j] = 0u;
     }
 
+    __syncwarp(active_mask);
+
     // step = 1,2,4,8,16 - warp-reduce
-    for (int step = 1; step < cur_n_w; step <<= 1) {
+    for (int step = 1; step < n; step <<= 1) {
         // right-neighbor
         uint32_t nbr[8];
 #pragma unroll
         for (int j = 0; j < 8; ++j)
-            nbr[j] = __shfl_down_sync(mask, lane_cv[j], step);
+            nbr[j] = __shfl_down_sync(active_mask, lane_cv[j], step);
+
+        __syncwarp(active_mask);   // let every lane finish fetching before moving on
 
+        // only the leading lane of each group participates:
+        // 0,2,4,6,8,...  step = 1
+        // 0,4,8,...      step = 2
+        // 0,8,..         step = 4
+        // 0,16           step = 8
+        // 0              step = 16
         const bool do_pair =
-            active_lane &&
-            ((lane_id % (step<<1)) == 0) &&
-            (lane_id + step < cur_n);
+            ((lane_id & ((step << 1) - 1)) == 0) &&   // only the first lane of each 2*step group merges
+            ((lane_id ^ step) < n);                   // and only if the right neighbor is in the active set
+
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x < 32) printf("lane %d do pair? %d\n", lane_id, do_pair);
+// #endif
+
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) {
+//     printf("left:");
+//     print_cv(lane_cv, 16);
+//     printf("right:");
+//     print_cv(nbr, 16);
+// }
+// #endif
 
         if (do_pair) {
             blake3_parent_cv(lane_cv, nbr, lane_cv);
         }
-
-        cur_n = (cur_n + 1) >> 1;
-        // __syncwarp();
+        __syncwarp(active_mask);
     }
 
     // lane 0: the NUM_WARPS warp-roots are written to SMEM
     // e.g. if this block has 8 warps, there will be 8 warp-roots
     __shared__ uint32_t warp_roots[NUM_WARPS][8]; // NUM_WARPS × 8
-    if (lane_id == 0 && cur_n_w > 0) {
+    if (lane_id == 0 && n > 0) {
 #pragma unroll
         for (int j = 0; j < 8; ++j)
             warp_roots[warp_id][j] = lane_cv[j];
@@ -579,7 +637,7 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     const unsigned full = __activemask();
     const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full;
     const int half_id = warp_id * 2 + sub;   // half-warp index
-    cur_n = NUM_WARPS;
+    int cur_n = NUM_WARPS;
 
     while (cur_n > 1) {
         const int pairs = cur_n >> 1;   // the pair count
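The reworked `do_pair` predicate is easiest to sanity-check by simulation. A standalone host-side sketch that prints which lanes merge at each step for a tail case of n = 5 live CVs — the situation this commit ("big kernel tail problem") is fixing:

    #include <stdio.h>

    int main(void) {
        const int n = 5;   // live CVs in this warp (tail of the tile)
        for (int step = 1; step < n; step <<= 1) {
            printf("step %2d:", step);
            for (int lane = 0; lane < 32; ++lane) {
                int do_pair = ((lane & ((step << 1) - 1)) == 0) &&
                              ((lane ^ step) < n);
                if (do_pair) printf("  %d+%d", lane, lane ^ step);
            }
            printf("\n");
        }
        // prints: step 1 -> 0+1 2+3;  step 2 -> 0+2;  step 4 -> 0+4
        return 0;
    }

Lane 4's lone CV is merged only at the final step, so the result is parent(CV(0..3), CV(4)) — for the small cases tried, this neighbor-preserving schedule matches the left-heavy (largest power-of-two on the left) tree shape that BLAKE3's parent merging expects.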
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
index 3c1739b..8c88995 100644
--- a/csrc/utils.cuh
+++ b/csrc/utils.cuh
@@ -1,4 +1,5 @@
 #include 
+#pragma once
 
 #define WARP_SIZE 32
 #define G(a,b,c,d, x, y) \
@@ -13,10 +14,23 @@
     (b) = rotr32((b) ^ (c), 7); \
 } while (0)
 
+// host-side definition
+inline constexpr uint32_t BLAKE3_IV_HOST[8] = {
+    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
+    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+};
+
+__device__ __constant__ uint32_t BLAKE3_IV_DEV[8] = {
+    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
+    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+};
 
-__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = {
-    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
-    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+inline constexpr int B3_PERMUTE_HOST[16] = {
+    2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8
+};
+
+__device__ __constant__ int B3_PERMUTE_DEV[16] = {
+    2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8
 };
 
 enum : uint32_t {
@@ -61,8 +75,8 @@ __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t ds
 #endif
 }
 
-__device__ void print_cv(uint32_t cv[8]) {
-    if (blockIdx.x == 0 && threadIdx.x == 0) {
+__device__ void print_cv(uint32_t cv[8], int tgt_tid = 0) {
+    if (blockIdx.x == 0 && threadIdx.x == tgt_tid) {
         auto get_byte = [&](int i) {
             int w   = i >> 2;        // byte i comes from the w-th u32
             int off = (i & 3) * 8;   // its bit offset within that u32
@@ -78,10 +92,6 @@ __device__ void print_cv(uint32_t cv[8], int tgt_tid = 0) {
     }
 }
 
-__constant__ __device__ int B3_PERMUTE[16] = {
-    2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8
-};
-
 // swap-table
 // BLAKE3 message schedule: rows are P^r, r=0..6.
 // Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2)
@@ -178,7 +188,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
 
     // initialize
     uint32_t cv_word = 0;
-    if (li < 8) cv_word = BLAKE3_IV[li];
+    if (li < 8) cv_word = BLAKE3_IV_DEV[li];
 
     // process all blocks
    // for a full chunk, 1024 bytes give 1024 / 64 = 16 blocks
@@ -201,7 +211,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
         const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false);
         uint32_t v =
             (li < 8)  ? cv_word :
-            (li < 12) ? BLAKE3_IV[(li - 8) & 3] :
+            (li < 12) ? BLAKE3_IV_DEV[(li - 8) & 3] :
             (li == 12) ? t0 :
             (li == 13) ? t1 :
             (li == 14) ? block_len : flags;
@@ -298,7 +308,7 @@ void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32
     const uint32_t block_len = 64u;
     const uint32_t flags = FLAG_PARENT;
 
-    uint32_t iv_val = BLAKE3_IV[li & 7];
+    uint32_t iv_val = BLAKE3_IV_DEV[li & 7];
 
     uint32_t v =
         (li < 12) ? iv_val :
@@ -368,8 +378,15 @@ __host__ __device__ void blake3_compress_words_7r(
     uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3];
     uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7];
 
-    uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3];
+#if defined(__CUDA_ARCH__)
+    // device-side call
+    const uint32_t* IV = BLAKE3_IV_DEV;
+#else
+    // host-side call
+    const uint32_t* IV = BLAKE3_IV_HOST;
+#endif
 
+    uint32_t v8 =IV[0], v9 =IV[1], v10=IV[2], v11=IV[3];
     // injection
     uint32_t v12=(uint32_t)chunk_counter, v13=(uint32_t)(chunk_counter >> 32), v14=block_len, v15=flags;
 
@@ -395,9 +412,14 @@ __host__ __device__ void blake3_compress_words_7r(
 
         // perm = perm ∘ PERMUTE
         int np[16];
+#if defined (__CUDA_ARCH__)
+        const int* PERM_T = B3_PERMUTE_DEV;
+#else
+        const int* PERM_T = B3_PERMUTE_HOST;
+#endif
 #pragma unroll
         for (int i = 0; i < 16; ++i)
-            np[i] = perm[B3_PERMUTE[i]];
+            np[i] = perm[PERM_T[i]];
 #pragma unroll
         for (int i = 0; i < 16; ++i)
             perm[i] = np[i];
@@ -428,6 +450,6 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3
         msg[8+i] = R[i];
     }
     uint32_t st[16];
-    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st);
+    blake3_compress_words_7r(msg, BLAKE3_IV_DEV, 0ull, 64u, FLAG_PARENT, st);
     blake3_state_to_cv(st, out_cv);
 }
\ No newline at end of file
From 520c60ce195df802cf13293ceb653f3137de44f2 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Sat, 11 Oct 2025 10:34:21 +0800
Subject: [PATCH 19/20] finish debug

---
 benchmark/test_gpu.py    |  5 ++--
 csrc/blake3_sm70_sm80.cu | 49 ++++++++++++++++++++--------------------
 csrc/utils.cuh           |  2 +-
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 3e391e9..902386c 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -40,9 +40,10 @@ if check_perf:
 
     # 2) Warm up: trigger JIT/driver init so the first runs are not skewed
     for _ in range(2):
-        d.copy_(cpu, non_blocking=True)
+        # d.copy_(cpu, non_blocking=True)
         torch.cuda.synchronize()
-        fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+        output = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+        print("warmup CV (hex) =", output)
     torch.cuda.synchronize()
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index f17e07f..b835c15 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -507,20 +507,20 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
             load_cv_s2r_vec_shared(s3, d);
         }
 
-#if defined(DBG_KERNEL) && DBG_KERNEL
-    // confirmed: the loaded values are simply not the same
-    if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) {
-        for (int i = 0 ; i < 32; i++) {
-            if (i == lane_id) {
-                printf("The lane-merge input from lane: %d:\n", lane_id);
-                print_cv(a, lane_id);
-                print_cv(b, lane_id);
-                print_cv(c, lane_id);
-                print_cv(d, lane_id);
-            }
-        }
-    }
-#endif
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// // confirmed: the loaded values are simply not the same
+// if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) {
+//     for (int i = 0 ; i < 32; i++) {
+//         if (i == lane_id) {
+//             printf("The lane-merge input from lane: %d:\n", lane_id);
+//             print_cv(a, lane_id);
+//             print_cv(b, lane_id);
+//             print_cv(c, lane_id);
+//             print_cv(d, lane_id);
+//         }
+//     }
+// }
+// #endif
@@ -595,16 +595,16 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
 // #endif
 
 // #if defined(DBG_KERNEL) && DBG_KERNEL
-// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) {
-//     printf("left:");
-//     print_cv(lane_cv, 16);
-//     printf("right:");
-//     print_cv(nbr, 16);
-// }
+// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) {
+//     printf("left on step %d:", step);
+//     print_cv(lane_cv, 16);
+//     printf("right on step %d:", step);
+//     print_cv(nbr, 16);
+// }
 // #endif
 
         if (do_pair) {
-            blake3_parent_cv(lane_cv, nbr, lane_cv);
+            blake3_parent_cv(lane_cv, nbr, lane_cv);    // does not require shared memory
         }
         __syncwarp(active_mask);
     }
@@ -637,7 +637,7 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     const unsigned full = __activemask();
     const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full;
     const int half_id = warp_id * 2 + sub;   // half-warp index
-    int cur_n = NUM_WARPS;
+    int cur_n = valid_warps;
 
     while (cur_n > 1) {
         const int pairs = cur_n >> 1;   // the pair count
@@ -731,8 +731,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
         blake3_block_reduce_kernel,
         cudaFuncAttributePreferredSharedMemoryCarveout,
         100));
 
-    uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
-    const uint32_t* d_words = reinterpret_cast<const uint32_t*>(d_bytes);;   // alias
+    const auto* d_words = reinterpret_cast<const uint32_t*>(d_data);  // alias
 
     uint32_t* d_blockCV = nullptr;   // num_blocks × 8 u32
     // here we cut the largest bottleneck: do not allocate GPU memory here, do it in PyTorch
@@ -772,7 +771,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
     }
 
     CUDA_CHECK(cudaFree(d_blockCV));
-    CUDA_CHECK(cudaFree(d_bytes));
+    // CUDA_CHECK(cudaFree(d_bytes));
 
     return;
 }
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
index 8c88995..c0311b8 100644
--- a/csrc/utils.cuh
+++ b/csrc/utils.cuh
@@ -288,7 +288,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
 }
 
 // =============== Parent 16-lane compressing ===============
-__device__ __noinline__
+__device__ __forceinline__
 void blake3_parent_cv_simd16(const uint32_t* __restrict__ L,  // 8×u32
                              const uint32_t* __restrict__ R,  // 8×u32
                              uint32_t* __restrict__ out_cv,   // 8×u32
 
From 2cc5b6504c393a7777b13ed4ab9ed9dd529d3b1f Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Tue, 11 Nov 2025 22:20:21 +0800
Subject: [PATCH 20/20] benchmark

---
 benchmark/bench.py    | 116 ++++++++++++++++++++++++++++++++++++++++++
 benchmark/test_gpu.py |   2 +-
 2 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/bench.py
diff --git a/benchmark/bench.py b/benchmark/bench.py
new file mode 100644
index 0000000..307e2fa
--- /dev/null
+++ b/benchmark/bench.py
@@ -0,0 +1,116 @@
+# bench_blake3_sizes.py
+import time
+import math
+import torch
+import blake3
+import flashashing as fh
+
+# ===== Tunable parameters =====
+CHECK_ACCURACY  = False  # verify against CPU blake3 (fine for small sizes; slows down large runs)
+USE_PINNED_HOST = True   # use pinned host memory (recommended True for end-to-end throughput)
+WARMUP_PER_SIZE = 2      # warmup iterations per size (excluded from the statistics)
+
+# repeats per size, chosen so the measured window is not too short
+def pick_repeats(n_bytes: int) -> int:
+    if n_bytes <= 1*1024:   # 1 KB
+        return 2000
+    if n_bytes <= 16*1024:
+        return 1000
+    if n_bytes <= 64*1024:
+        return 500
+    if n_bytes <= 256*1024:
+        return 200
+    if n_bytes <= 1*1024*1024:
+        return 100
+    if n_bytes <= 4*1024*1024:
+        return 50
+    return 20   # 10.3MB
+
+# ===== Test sizes (bytes) =====
+sizes = [
+    ("1KB (1 chunk)", 1 * 1024),
+    ("8KB", 8 * 1024),
+    ("16KB", 16 * 1024),
+    ("64KB", 64 * 1024),
+    ("256KB", 256 * 1024),
+    ("1MB", 1 * 1024 * 1024),
+    ("4MB", 4 * 1024 * 1024),
+    ("10.3MB", int(11 * 1024 * 1024)),   # NB: allocated as 11 MiB despite the label
+]
+
+device = torch.device("cuda")
+stream = torch.cuda.current_stream().cuda_stream
+
+# pre-allocate the largest host/device buffers once and reuse sub-slices,
+# which keeps the comparison fair and avoids repeated allocation
+max_n = max(n for _, n in sizes)
+cpu = torch.empty(max_n, dtype=torch.uint8, pin_memory=USE_PINNED_HOST)
+cpu[:] = ord('A')
+
+d = torch.empty_like(cpu, device=device)   # device buffer sized to max_n
+torch.cuda.synchronize()
+
+# header
+print("BLAKE3 GPU benchmark across message sizes")
+print(f"Host pinned memory: {USE_PINNED_HOST}")
+print("-" * 96)
+print("{:<14} {:>10} {:>8} {:>12} {:>12} {:>12} {:>12}".format(
+    "size", "bytes", "repeat", "kern_ms", "kern_MiB/s", "e2e_ms", "e2e_MiB/s"
+))
+print("-" * 96)
+
+for label, n in sizes:
+    # warmup
+    for _ in range(WARMUP_PER_SIZE):
+        # copy n bytes into the device sub-slice
+        d[:n].copy_(cpu[:n], non_blocking=True)
+        torch.cuda.synchronize()
+        _ = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream)
+        torch.cuda.synchronize()
+
+    repeat = pick_repeats(n)
+    total_kernel_ms = 0.0
+    total_e2e_sec = 0.0
+    last_hex = None
+
+    # CUDA events for kernel-only timing
+    start_evt = torch.cuda.Event(enable_timing=True)
+    end_evt = torch.cuda.Event(enable_timing=True)
+
+    for _ in range(repeat):
+        # -------- End-to-end: H2D + kernel --------
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        d[:n].copy_(cpu[:n], non_blocking=True)
+        # kernel-only timing: the events bracket just the kernel call
+        start_evt.record()
+        last_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream)
+        end_evt.record()
+        torch.cuda.synchronize()   # wait for both H2D and kernel to finish
+        t1 = time.perf_counter()
+
+        total_e2e_sec += (t1 - t0)
+        total_kernel_ms += start_evt.elapsed_time(end_evt)
+
+    # average times
+    avg_kernel_ms = total_kernel_ms / repeat
+    avg_e2e_ms = (total_e2e_sec / repeat) * 1e3
+
+    # throughput (MiB/s; 1 MiB = 1024^2 bytes); n is an integer byte count,
+    # so the original `if n != 10.3` guard was dead code and is dropped
+    mib = n / (1024 ** 2)
+    kernel_throughput = mib / (avg_kernel_ms / 1e3) if avg_kernel_ms > 0 else float('inf')
+    e2e_throughput = mib / (avg_e2e_ms / 1e3) if avg_e2e_ms > 0 else float('inf')
+
+    print("{:<14} {:>10} {:>8} {:>12.3f} {:>12.2f} {:>12.3f} {:>12.2f}".format(
+        label, n, repeat, avg_kernel_ms, kernel_throughput, avg_e2e_ms, e2e_throughput
+    ))
+
+    if CHECK_ACCURACY:
+        # CPU cross-check (once per size)
+        std_hex = blake3.blake3(cpu[:n].numpy()).hexdigest()
+        assert last_hex == std_hex, f"Mismatch at {label}: GPU {last_hex} vs CPU {std_hex}"
+
+print("-" * 96)
+print("Notes:")
+print("  • kernel_ms / kern_MiB/s measure GPU compute time only (CUDA events), excluding H2D;")
+print("  • e2e_ms / e2e_MiB/s measure the total H2D + kernel time;")
+print("  • '10.3MB' is actually allocated as 11 * 1024 * 1024 bytes (see the sizes list).")
diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 902386c..d634d21 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -47,7 +47,7 @@
     torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 10  # 1 GiB × 5 is already heavy; tune per machine
     cv_hex = None
    total_time = 1e-8
    for _ in range(repeat):