diff --git a/.gitignore b/.gitignore index aebc9d7..47c25b7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ run.sh +profile.sh **.__pycache__ @@ -6,4 +7,11 @@ run.sh build/ dist/ -*.egg-info/ \ No newline at end of file +*.egg-info/ + +*.ncu-rep +*.nsys-rep + +*.ptx + +*.o \ No newline at end of file diff --git a/README.md b/README.md index ef873a4..88cb978 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ Proposal (from Google Drive): https://docs.google.com/document/d/10B3_nT8xF49vLg conda create -n ceg5206 python=3.12 pip install -r requirements.txt +# clone the cutlass repository +git clone https://github.com/NVIDIA/cutlass.git ~/cutlass --depth 1 + +export FLASHASHING_CUDA_ARCH_LIST= # your arch here + +# install the cpp source file python setup.py install @@ -19,4 +24,15 @@ python benchmark/test_script.py + 10.3: Implement `SHA256-SIMD` version, still some flaws: only allow short string processing (`len < 55`) + 10.2: Consultation with doctor, invite to repo, add `BLAKE3` basic implementations -+ 10.1: Provide basic template for project, implementing basic sha256 in C++. \ No newline at end of file ++ 10.1: Provide basic template for project, implementing basic sha256 in C++. + +# GPU kernel performance on file-compression hashing + +> These logs were measured on an RTX 4090 Laptop GPU, where compute and memory bandwidth are limited; a later section compares performance across machines. + ++ 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s ++ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s ++ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (incorrect result) ++ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s ++ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s ++ 10.8 - v6 - [commit:]: 15145.81 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v1.cu b/backup_deprecated/blake3_sm70_v1.cu new file mode 100644 index 0000000..867b6af --- /dev/null +++ b/backup_deprecated/blake3_sm70_v1.cu @@ -0,0 +1,800 @@ + +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <array> + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast<uint4*>(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- small helpers ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host path +#endif +} + +#define B3_G(a,b,c,d, mx, my) \ + do { \ + a = a + b + (mx); \ + d = rotr32(d ^ a, 16); \ + c = c + d; \ + b = rotr32(b ^ c, 12); \ + a = a + b + (my); \ + d = rotr32(d ^ a, 8); \ + c = c + d; \ + b = rotr32(b ^ c, 7); \ + } while (0) + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast<const uint4*>(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: 
P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // v[0..7]=cv;v[8..11]=IV;v[12..15]=IV^t0^t1^b^flags + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; + + uint32_t v8 = BLAKE3_IV[0], v9 = BLAKE3_IV[1]; + uint32_t v10= BLAKE3_IV[2], v11 = BLAKE3_IV[3]; + uint32_t v12= BLAKE3_IV[4] ^ (uint32_t)chunk_counter; + uint32_t v13= BLAKE3_IV[5] ^ (uint32_t)(chunk_counter >> 32); + uint32_t v14= BLAKE3_IV[6] ^ block_len; + uint32_t v15= BLAKE3_IV[7] ^ flags; + + // 7 轮 +#pragma unroll 1 + for (int r = 0; r < 7; ++r) { + // 取本轮 16 个消息字(按 BLAKE3 调度表) + const uint8_t* SR = B3_MSG_SCHEDULE[r]; // 0..15 + const uint32_t m0 = block_words[SR[0]], m1 = block_words[SR[1]]; + const uint32_t m2 = block_words[SR[2]], m3 = block_words[SR[3]]; + const uint32_t m4 = block_words[SR[4]], m5 = block_words[SR[5]]; + const uint32_t m6 = block_words[SR[6]], m7 = block_words[SR[7]]; + const uint32_t m8 = block_words[SR[8]], m9 = block_words[SR[9]]; + const uint32_t m10 = block_words[SR[10]], m11 = block_words[SR[11]]; + const uint32_t m12 = block_words[SR[12]], m13 = block_words[SR[13]]; + const uint32_t m14 = block_words[SR[14]], m15 = block_words[SR[15]]; + + // 列步:四个 G + B3_G(v0, v4, v8, v12, m0, m1); + B3_G(v1, v5, v9, v13, m2, m3); + B3_G(v2, v6, v10, v14, m4, m5); + B3_G(v3, v7, v11, v15, m6, m7); + + // 对角步:四个 G + B3_G(v0, v5, v10, v15, m8, m9); + B3_G(v1, v6, v11, v12, m10, m11); + B3_G(v2, v7, v8, v13, m12, m13); + B3_G(v3, v4, v9, v14, m14, m15); + } + + // 输出 16×u32 状态(后续由调用者做 state_to_cv) + out_state[0]=v0; out_state[1]=v1; out_state[2]=v2; out_state[3]=v3; + out_state[4]=v4; out_state[5]=v5; out_state[6]=v6; out_state[7]=v7; + out_state[8]=v8; out_state[9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12;out_state[13]=v13;out_state[14]=v14; out_state[15]=v15; +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. 
the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. + uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { + const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); + }; + + uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; + bool have_pass0 = false, have_pass1 = false; + + // ------- STAGE 2 pass = 0 ------- + { + const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 + const int right = left + 1; // neighbor: 1,3,5,...,31 + const int pair_idx = left >> 1; // 0..15 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && left < pass0_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && right < pass0_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // merge two neighbor + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && left < pass0_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + // ---- STAGE 2 pass 1 ---- + { + const int left = 32 + (warp_id << 1); // 32,34,...,62 + const int right = left + 1; // 33,35,...,63 + const int pair_idx = left >> 1; // 16..31 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && (left - 32) < pass1_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && (right - 32) < pass1_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // TODO: here we may have some issue: overflow and border situation + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && (left - 32) < pass1_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = 
__shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // write to the right position + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + __syncthreads(); + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
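+    // Note on the vectorized copy below: the uint4 loads require `g` to be 16-byte
+    // aligned. Here in_cv32 comes from cudaMalloc (which returns at least 256-byte
+    // aligned pointers) and each CV is 8 u32 = 32 B, so every CV row starts on a
+    // 32 B boundary and the two 16 B loads per CV are safe; a byte-addressed input
+    // would need a scalar fallback.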
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 0; + constexpr int pad_cv = 0; + + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
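+    // A possible follow-up for the TODO below (sketch only, not wired in): use the
+    // stream-ordered allocator instead of a blocking cudaMalloc per call, e.g.
+    //   CUDA_CHECK(cudaMallocAsync(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t), stream));
+    //   ... launch the kernels on `stream` ...
+    //   CUDA_CHECK(cudaFreeAsync(d_blockCV, stream));
+    // cudaMallocAsync/cudaFreeAsync need CUDA 11.2+; alternatively the caller
+    // (e.g. PyTorch, as noted above) can pre-allocate this buffer and pass it in.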
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v2.cu b/backup_deprecated/blake3_sm70_v2.cu new file mode 100644 index 0000000..e61b85b --- /dev/null +++ b/backup_deprecated/blake3_sm70_v2.cu @@ -0,0 +1,943 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + 
FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. 
li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? 
flags : 0u; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
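+    // With the template arguments used by the host wrapper in this file
+    // (CHUNKS_PER_BLOCK = 32, PAD_CHUNK = 16), the array below is actually
+    // 32 x (256 + 16) x 4 B = 34 KiB; the 64 KiB figure above is carried over
+    // from the 64-chunk, unpadded v1 layout.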
+ __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. + uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... 
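+        // Pairing scheme for this pass: each warp owns the chunk pair (left, right).
+        // Lanes 0-15 (sub-warp 0) run the 16-lane SIMD leaf routine on `left`,
+        // lanes 16-31 (sub-warp 1) run it on `right`, and lane 0 then folds the two
+        // leaf CVs into one parent CV stored in cv_smem.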
+ const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); + } + + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + uint32_t parent[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly + #pragma unroll + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; + } + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass + + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); + + // if (bx == 0) printf("Finish 1 big pass\n"); + + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); + + // if (bx == 0) printf("Finish 2 big pass\n"); + + __syncthreads(); + + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. 
+ + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr int pad_cv = 0; + + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest 
bottleneck, do not allocate gpu memory here, do it in pytorch. + + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/backup_deprecated/blake3_sm80_v1.cu b/backup_deprecated/blake3_sm80_v1.cu new file mode 100644 index 0000000..5f08a97 --- /dev/null +++ b/backup_deprecated/blake3_sm80_v1.cu @@ -0,0 +1,454 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 
0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> 16 x 4 bytes = 64 B = 4 x 16 bytes = 4 x 128 bits + const uint32_t cv[8], // 8×u32 + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============ +// Each block has 512 threads +// 1 warp process 32 chunk -> 32 KiB +// NUM_WARPS = 512 / 32 = 16 +// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 16 = 512 B shared memory in sum + __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflict + + // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1 + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id; + const uint64_t chunk_counter = base_chunk_counter + global_warp_id; + + // index + const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the start of this warp + // each thread process one chunk + const uint64_t chunk_idx = warp_chunk_base + lane_id; + + // edge processing + int valid = total_chunks - warp_chunk_base; + if (valid <= 0) return; // TODO: will this affect warp shfl? 
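+            // Note on the TODO above: `valid` is derived from `warp_chunk_base`, which is the
+            // same for every lane of the warp, so this early return is warp-uniform: either the
+            // whole warp exits or no lane does. The later __ballot_sync / __shfl_down_sync calls
+            // therefore still execute with all 32 lanes present.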
+ if (valid > WARP_SIZE) valid = WARP_SIZE; + + // compute idx for this thread + const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + + uint32_t cv[8] = {0}; // 8 x u32 + bool active = lane_id < valid; + if (active) { + const uint64_t chunk_counter = base_chunk_counter + chunk_idx; + blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + } + + // Stage 2: merging to parent, in O(logN) depth + + // take care: we cannot use general reduce + // 0-1-2-3-4-...-31, keep this sequential + unsigned mask = __ballot_sync(0xFFFFFFFFu, active); + // step = 1,2,4,8,16 + for (int step = 1; step < WARP_SIZE; step <<= 1) { + int partner_lane = lane_id + step; + + // neighbor cv + uint32_t neighbor_cv[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + } + + // the left be parent, and make sure `the right` is valid + if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { + blake3_parent_cv(cv, neighbor_cv, cv); + } + __syncwarp(mask); + + // in the next level, reduce half of active threads + if (lane_id >= (valid & ~(step))) + active = false; + } + + // now, lane 0 holds the root + if (lane_id == 0) { +#pragma unroll + for (int j = 0 ; j < 8; ++j) + cv_smem[warp_id][j] = cv[j]; + } + __syncthreads(); + + // after all these things, we have finished + // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge + // and now, we are going to implement higher-level merge + // we have 16 warps, each warp has a root cv + // so we are going to execute another logN steps + + // 16 -> 8 -> 4 -> 2 -> 1 + for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); +#pragma unroll + for (int j=0;j<8;++j) + cv_smem[warp_id][j] = p[j]; // write back to shared memory + } + __syncthreads(); + } + + // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep + if (tid == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv_smem[0][j]; + } + +} // blake3_block_reduce_kernel + + +// ============ Tiny kernel ============ +// In big kernel, it will consume 512 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root +// And this tiny kernel is designed to process these 2048 root +template +__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int n) { + extern __shared__ uint32_t smem[]; // dynamic shared memory + uint32_t* tile = smem; // -> [tile_n][8] + + const int tid = threadIdx.x; + const int b = blockIdx.x; + const int B = gridDim.x; + + const int start = (int)((1ll * n * b) / B); + const int end = (int)((1ll * n * (b+1)) / B); + int tile_n = end - start; + + if (tile_n <= 0) return; // border + + const int words = tile_n * 8; + for (int w = tid; w < words; w += NUM_THREADS) { + tile[w] = in_cv32[start * 8 + w]; + } + __syncthreads(); + + int cur = tile_n; + while (cur > 1) { + const int pairs = cur >> 1; // floor(cur/2) + // process pairs + for (int i = tid; i < pairs; i += NUM_THREADS) { + const uint32_t* L = &tile[(2*i) * 8]; + const uint32_t* R = &tile[(2*i+1) * 8]; + uint32_t p[8]; + blake3_parent_cv(L, R, p); +#pragma unroll + for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + } + __syncthreads(); + + // even situation: + if ((cur & 1) && tid == 0) { + uint32_t* dst = &tile[pairs * 8]; + uint32_t* src = &tile[(cur - 1) * 8]; +#pragma unroll + for (int j=0;j<8;++j) + dst[j] = src[j]; + } + __syncthreads(); + + cur = pairs + (cur & 1); + } + + // write output + if (tid == 0) { + uint32_t* out = &out_cv32[b * 8]; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = tile[j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + + uint8_t* d_bytes = nullptr; + uint32_t* d_words = nullptr; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // TODO: use thrust + cudaMalloc(&d_bytes, bytes_len); + cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); + d_words = 
reinterpret_cast(d_bytes); + + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + + // launch big kernel + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + const int B = (num_blocks >= 8) ? 8 : num_blocks; + uint32_t* d_midCV = nullptr; + cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); + + { + dim3 grid(B); + dim3 block(512); // 你指定 “每个 block 512 线程” + // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 + const int tile = (num_blocks + B - 1) / B; + const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); + + blake3_pair_reduce_kernel<512> + <<>>(d_blockCV, d_midCV, num_blocks); + CUDA_CHECK(cudaGetLastError()); + } + + // second round + uint32_t* d_root = nullptr; + cudaMalloc(&d_root, 8u * sizeof(uint32_t)); + + { + dim3 grid(1); + dim3 block(B); + const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); + + // generate kernel during compile time + switch (B) { + case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; + case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; + case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; + case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; + case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; + case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; + case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; + default: { + blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); + } break; + } + CUDA_CHECK(cudaGetLastError()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_root)); + CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/backup_deprecated/blake3_sm80_v3.cu b/backup_deprecated/blake3_sm80_v3.cu new file mode 100644 index 0000000..fdf1716 --- /dev/null +++ b/backup_deprecated/blake3_sm80_v3.cu @@ -0,0 +1,994 @@ + +#include "cute/numeric/int.hpp" +#include +#include +#include +#include + +#include +#include +#include + +#if __CUDA_ARCH__ >= 800 + #include +#endif + +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + 
fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + + +using namespace cute; + +using vec_t = cute::uint128_t; // one time loading 16 B + +#if __CUDA_ARCH__ >= 800 + // SM80 branch + using Atom = cute::Copy_Atom, vec_t>; +#else + // Volta/Turing(SM70/75)general copy + using Atom = cute::Copy_Atom, vec_t>; + // 也可用更保守的:using Atom = cute::Copy_Atom, vec_t>; +#endif + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. 
(see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? 
t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? flags : 0u; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + + + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template 32 KB + const int PAD_CHUNK=16, + const int PAD_CV=0> // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + constexpr int WORDS_PER_CHUNK = 
CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4, 16B + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // 4 bytes x 256 x 64 = 64 KiB shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK]; // [64][256+PAD] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + // use CuTe for loading + + auto make_smem_tensor = [&](int chunk_local /* The chunk id on shared memory */) { + // smem index:chunk_smem[chunk_local][0] + // length: (WORDS_PER_CHUNK + PAD_CHUNK) + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + vec_t* s_vec = reinterpret_cast(s_u32); // each element is 16B + return make_tensor(make_smem_ptr(s_vec), + make_shape(Int{}), // 64 x 4 = 256, one row of shared memory + make_stride(Int<1>{})); // continuous + }; + + // shared memory 256 items = 64 x vec_t(4) + // thread layout: 32 threads + // thread 0: 0, 32 + // thread 1: 1, 33 + // thread 32: 31,63 + // so we load data in coalsced mode + using Atom = cute::Copy_Atom, vec_t>; + + // thread layout. (lane,0)->idx=2*lane,(lane,1)->idx=2*lane+1 + auto thr_layout = make_layout( + make_shape(Int<2>{}), + make_stride(Int<1>{}) + ); + + auto tile_layout = make_layout( + make_shape(Int{}, Int<2>{}), // (32,2) + make_stride(Int<1>{}, Int{}) // idx = lane + k2*32 + ); + + // build the real copy + TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout); + + // this sentence will load + // auto thr_copy = local_partition(tcopy, lane_id); + + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + auto smem_vec1d = make_smem_tensor(chunk_local); // 64 x uint128_t + + // only read from global, when it's valid + // or, we fill it with 0 + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + const vec_t* g_vec_base = reinterpret_cast(g_u32); // 16 B each time + + // the src is 64 x 4 = 256, so is dest + auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base), + make_shape(Int{}), // 64 x uint128_t + make_stride(Int<1>{})); + bool g_valid = (chunk_local < valid_chunks); + + auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout); + auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout); + + // now we will load this + auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id); + auto tCs = local_partition(smem_vec2d, tile_layout, lane_id); + + // launch the copy inst. 
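+        // Predicated copy: in-range chunks go through the 16-byte tiled copy (gmem -> smem),
+        // while chunks past the end of the input are zero-filled so the compression stage
+        // never reads uninitialized shared memory.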
+ if (g_valid) { + // gmem load → smem store:twice 16B + copy(tcopy, tCg, tCs); + } else { + for (int i = 0; i < size(tCs); ++i) { + *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); + } + } + } + cp_async_fence(); + cp_async_wait<0>(); + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); + } + + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + uint32_t parent[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly + #pragma unroll + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; + } + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass + + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); + + // if (bx == 0) printf("Finish 1 big pass\n"); + + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); + + // if (bx == 0) printf("Finish 2 big pass\n"); + + __syncthreads(); + + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. 
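+    // Stage 3 below is a shuffle-based tree reduction over warp 0: lane i starts with parent
+    // CV i, and at every step the lanes whose index is a multiple of 2*step fetch the CV held
+    // `step` lanes to the right with __shfl_down_sync and fold it in with blake3_parent_cv.
+    // After ceil(log2(parents_count)) steps lane 0 holds this block's root CV.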
+ + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
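+    // Each thread copies chaining values in a block-stride loop (i = tid, tid + NUM_THREADS, ...),
+    // so consecutive threads touch consecutive 32-byte CVs and the global loads coalesce.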
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr int pad_cv = 0; + + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest 
bottleneck, do not allocate gpu memory here, do it in pytorch. + + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/benchmark/bench.py b/benchmark/bench.py new file mode 100644 index 0000000..307e2fa --- /dev/null +++ b/benchmark/bench.py @@ -0,0 +1,116 @@ +# bench_blake3_sizes.py +import time +import math +import torch +import blake3 +import flashashing as fh + +# ===== 可调参数 ===== +CHECK_ACCURACY = False # 是否校验与 CPU blake3 一致(小规模可开,大规模会慢一点) +USE_PINNED_HOST = True # 是否使用 pinned host memory(建议 True,用于端到端吞吐) +WARMUP_PER_SIZE = 2 # 每个尺寸的预热次数(不计入统计) + +# 每个尺寸的重复次数,尽量让统计时间不至于太短 +def pick_repeats(n_bytes: int) -> int: + if n_bytes <= 1*1024: # 1 KB + return 2000 + if n_bytes <= 16*1024: + return 1000 + if n_bytes <= 64*1024: + return 500 + if n_bytes <= 256*1024: + return 200 + if n_bytes <= 1*1024*1024: + return 100 + 
if n_bytes <= 4*1024*1024: + return 50 + return 20 # 10.3MB + +# ===== 测试尺寸(字节)===== +sizes = [ + ("1KB (1 chunk)", 1 * 1024), + ("8KB", 8 * 1024), + ("16KB", 16 * 1024), + ("64KB", 64 * 1024), + ("256KB", 256 * 1024), + ("1MB", 1 * 1024 * 1024), + ("4MB", 4 * 1024 * 1024), + ("10.3MB", int(11 * 1024 * 1024)), # 以 MiB 解释 10.3MB +] + +device = torch.device("cuda") +stream = torch.cuda.current_stream().cuda_stream + +# 预分配最大的 host/device buffer,子切片复用,保证公平且减少反复分配 +max_n = max(n for _, n in sizes) +cpu = torch.empty(max_n, dtype=torch.uint8, pin_memory=USE_PINNED_HOST) +cpu[:] = ord('A') + +d = torch.empty_like(cpu, device=device) # device 大小与 max_n 一致 +torch.cuda.synchronize() + +# 标题 +print("BLAKE3 GPU benchmark across message sizes") +print(f"Host pinned memory: {USE_PINNED_HOST}") +print("-" * 96) +print("{:<14} {:>10} {:>8} {:>12} {:>12} {:>12} {:>12}".format( + "size", "bytes", "repeat", "kern_ms", "kern_MiB/s", "e2e_ms", "e2e_MiB/s" +)) +print("-" * 96) + +for label, n in sizes: + # 预热 + for _ in range(WARMUP_PER_SIZE): + # 给 device 子切片复制 n 字节 + d[:n].copy_(cpu[:n], non_blocking=True) + torch.cuda.synchronize() + _ = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream) + torch.cuda.synchronize() + + repeat = pick_repeats(n) + total_kernel_ms = 0.0 + total_e2e_sec = 0.0 + last_hex = None + + # CUDA event 用于“纯 kernel”计时 + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + + for _ in range(repeat): + # -------- End-to-end: H2D + kernel -------- + torch.cuda.synchronize() + t0 = time.perf_counter() + d[:n].copy_(cpu[:n], non_blocking=True) + # kernel-only 计时:events 只包住内核调用 + start_evt.record() + last_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream) + end_evt.record() + torch.cuda.synchronize() # 等 H2D+kernel 都完成 + t1 = time.perf_counter() + + total_e2e_sec += (t1 - t0) + total_kernel_ms += start_evt.elapsed_time(end_evt) + + # 平均时间 + avg_kernel_ms = total_kernel_ms / repeat + avg_e2e_ms = (total_e2e_sec / repeat) * 1e3 + + # 吞吐(MiB/s;1 MiB = 1024^2) + mib = n / (1024 ** 2) if n != 10.3 else 11 / (1024 ** 2) + kernel_throughput = mib / (avg_kernel_ms / 1e3) if avg_kernel_ms > 0 else float('inf') + e2e_throughput = mib / (avg_e2e_ms / 1e3) if avg_e2e_ms > 0 else float('inf') + + print("{:<14} {:>10} {:>8} {:>12.3f} {:>12.2f} {:>12.3f} {:>12.2f}".format( + label, n, repeat, avg_kernel_ms, kernel_throughput, avg_e2e_ms, e2e_throughput + )) + + if CHECK_ACCURACY: + # CPU 校验(只对当前尺寸做一次) + std_hex = blake3.blake3(cpu[:n].numpy()).hexdigest() + assert last_hex == std_hex, f"Mismatch at {label}: GPU {last_hex} vs CPU {std_hex}" + +print("-" * 96) +print("Notes:") +print(" • kernel_ms / kern_MiB/s 只统计 GPU 计算时间(CUDA events),不含 H2D;") +print(" • e2e_ms / e2e_MiB/s 统计 H2D + kernel 的总时间;") +print(" • '10.3MB' 按 10.3 * 1024 * 1024 字节计算。") diff --git a/benchmark/perf.txt b/benchmark/perf.txt new file mode 100644 index 0000000..0058b2e --- /dev/null +++ b/benchmark/perf.txt @@ -0,0 +1,5 @@ +10.7 V100: 54819.25 MiB/s + RTX 4090: 150585.83 MiB/s (False) + +10.9, RTX4090: 67184.92 MiB/s +10.9 A800: 20803.11 MiB/s \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py new file mode 100644 index 0000000..d634d21 --- /dev/null +++ b/benchmark/test_gpu.py @@ -0,0 +1,69 @@ +import torch +import flashashing as fh +import hashlib +import time +import blake3 + +check_accuracy = False +check_perf = True + +GiB = 1024*1024*1024 # bytes -> 1 GiB + +cpu = torch.empty(GiB * 1, dtype=torch.uint8) +cpu[:] = ord('A') + +# 一次性 H2D(可重用) +d 
= torch.empty_like(cpu, device='cuda') +d.copy_(cpu, non_blocking=True) +torch.cuda.synchronize() + +stream = torch.cuda.current_stream().cuda_stream + +# std_hex = blake3.blake3(data).hexdigest() + +if check_accuracy: + # 2) GPU 版本 + cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + torch.cuda.synchronize() + print("GPU BLAKE3 Result: ", cv_hex) + + # 1) CPU 版本 + std_hex = blake3.blake3(cpu.numpy()).hexdigest() + print("std BLAKE3 Expected: ", std_hex) + + # std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() + # print("std BLAKE3 1KB: ", std_hex_1KB) + + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" + print("GPU BLAKE3 result matches CPU result!") + +if check_perf: + # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 + for _ in range(2): + # d.copy_(cpu, non_blocking=True) + torch.cuda.synchronize() + output = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + print("warmup CV (hex) =", output) + torch.cuda.synchronize() + + # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) + repeat = 10 # 1GiB × 5 已经很重,按机器调整 + cv_hex = None + total_time = 1e-8 + for _ in range(repeat): + d.copy_(cpu, non_blocking=True) + torch.cuda.synchronize() + + t0 = time.perf_counter() + cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + t1 = time.perf_counter() + + elapsed = t1 - t0 + total_time += elapsed + torch.cuda.synchronize() + + print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {total_time:.3f} seconds") + print(f"Throughput: {repeat * d.numel() / total_time / (1024**2):.2f} MiB/s") + print("root CV (hex) =", cv_hex) + +# print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 2da163f..43290c4 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -76,6 +76,115 @@ static std::string blake3_hash_naive(py::object obj) { } } + +struct PyBytesView { + const uint8_t* ptr = nullptr; + size_t len = 0; + std::string storage; + py::object keep_alive; +}; + +static PyBytesView get_bytes_view(py::object obj) { + PyBytesView v; + + // bytes -> 复制到 storage + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // str -> 按 UTF-8 编码复制;如果你想按“原始字节”处理,建议在 Python 侧先 .encode() + if (py::isinstance(obj)) { + v.storage = obj.cast(); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // bytearray -> 正确姿势:转成 py::bytes 再拿 std::string(不要用迭代器) + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // 任意 buffer(memoryview / numpy 等) + if (PyObject_CheckBuffer(obj.ptr())) { + py::buffer buf = py::reinterpret_borrow(obj); + py::buffer_info info = buf.request(/*writable=*/false); + + // 一维 C 连续:零拷贝,注意保活 + if (info.ndim == 1 && + info.strides.size() == 1 && + info.strides[0] == static_cast(info.itemsize)) { + v.ptr = reinterpret_cast(info.ptr); + v.len = static_cast(info.size) * static_cast(info.itemsize); + v.keep_alive = obj; // 保活底层内存 + return v; + } + + // 非连续:用 tobytes() 拷贝成线性字节 + py::object tobytes = obj.attr("tobytes")(); + v.storage = static_cast(py::bytes(tobytes)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + throw std::invalid_argument( + "blake3_gpu_root_* expects bytes/str/bytearray/memoryview/numpy buffer."); +} + +static std::string cv_words_to_bytes_le(const std::array& cv) { + std::string out; + 
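+    // Serialize the eight 32-bit chaining-value words into 32 bytes, little-endian word by word.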
out.resize(32); + uint8_t* p = reinterpret_cast(&out[0]); + for (int i = 0; i < 8; ++i) { + uint32_t w = cv[i]; + p[4*i + 0] = static_cast( w & 0xFF); + p[4*i + 1] = static_cast((w >> 8) & 0xFF); + p[4*i + 2] = static_cast((w >> 16) & 0xFF); + p[4*i + 3] = static_cast((w >> 24) & 0xFF); + } + return out; +} + +struct GilRelease { + py::gil_scoped_release rel; +}; + +// static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { +// auto v = get_bytes_view(obj); +// std::array root{}; +// { +// GilRelease _g; +// blake3_block_reduce_sm70(v.ptr, static_cast(v.len), &root, /*stream=*/0); +// } +// std::string b = cv_words_to_bytes_le(root); +// return py::bytes(b); +// } + +static std::string blake3_gpu_root_hex(uint64_t device_ptr, + uint64_t nbytes, + uint64_t stream_int = 0) { + auto d_data = reinterpret_cast(device_ptr); + auto stream = reinterpret_cast(stream_int); + cudaSetDevice(0); + cudaFree(0); // attach + + std::array root{}; + { + GilRelease _g; + blake3_block_reduce_sm70_sm80(d_data, nbytes, &root, stream); + } + std::string b = cv_words_to_bytes_le(root); + return bytes_to_hex(reinterpret_cast(b.data()), b.size()); +} + + PYBIND11_MODULE(flashashing, m) { m.doc() = "SHA-256 and Blake3 bindings (pybind11)"; @@ -84,9 +193,23 @@ PYBIND11_MODULE(flashashing, m) { "Compute SHA-256 of a str/bytes and return hex string."); m.def("hash_simd", &hash_hex_simd4, py::arg("data"), - "SIMD enhanced sha256."), + "SIMD enhanced sha256."); m.def("blake3_hash_naive", &blake3_hash_naive, py::arg("data"), "Compute BLAKE3 hash (single-threaded)."); + +// m.def("blake3_gpu_sm70", +// &blake3_gpu_root_cv_bytes, +// py::arg("data"), +// R"pbdoc( +// Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. +// NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. +// )pbdoc"); + m.def("blake3_gpu_sm70_sm80_hex", + &blake3_gpu_root_hex, + py::arg("d_data"), py::arg("nbytes"), py::arg("stream")=0, + R"pbdoc( +Return the hex string of the *root chaining value* (CV) computed on GPU. 
+)pbdoc"); } \ No newline at end of file diff --git a/csrc/blake3.h b/csrc/blake3.h index 55aaa43..27509c9 100644 --- a/csrc/blake3.h +++ b/csrc/blake3.h @@ -2,6 +2,9 @@ #include #include +#include +#include +#include namespace flashashing { @@ -23,4 +26,11 @@ class Blake3 { std::string bytes_to_hex(const uint8_t *data, size_t len); -} // namespace flashashing \ No newline at end of file +} // namespace flashashing + + +// ============== GPU implementations ================ +void blake3_block_reduce_sm70_sm80(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0); \ No newline at end of file diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu new file mode 100644 index 0000000..b835c15 --- /dev/null +++ b/csrc/blake3_sm70_sm80.cu @@ -0,0 +1,850 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#include +#endif + +#include +#include +#include + +#include "utils.cuh" + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + + + +using namespace cute; + +using vec_t = cute::uint128_t; // one time loading 16 B + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + // SM80 branch + using Atom = cute::Copy_Atom, vec_t>; +#else + // Volta/Turing(SM70/75)general copy + using Atom = cute::Copy_Atom, vec_t>; +#endif + + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template 32 KB + const int PAD_CHUNK=16, + const int PAD_CV=0> // pad shared memory +__global__ void blake3_block_reduce_kernel(const uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[CHUNKS_PER_BLOCK / 2][8 + PAD_CV]; // avoid bank conflict + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4, 16B + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
+ __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK]; // [64][256+PAD] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + // use CuTe for loading + + auto make_smem_tensor = [&](int chunk_local /* The chunk id on shared memory */) { + // smem index:chunk_smem[chunk_local][0] + // length: (WORDS_PER_CHUNK + PAD_CHUNK) + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + vec_t* s_vec = reinterpret_cast(s_u32); // each element is 16B + return make_tensor(make_smem_ptr(s_vec), + make_shape(Int{}), // 64 x 4 = 256, one row of shared memory + make_stride(Int<1>{})); // continuous + }; + + // shared memory 256 items = 64 x vec_t(4) + // thread layout: 32 threads + // thread 0: 0, 32 + // thread 1: 1, 33 + // thread 32: 31,63 + // so we load data in coalsced mode + // using Atom = cute::Copy_Atom, vec_t>; + + // thread layout. (lane,0)->idx=2*lane,(lane,1)->idx=2*lane+1 + auto thr_layout = make_layout( + make_shape(Int<2>{}), + make_stride(Int<1>{}) + ); + + auto tile_layout = make_layout( + make_shape(Int{}, Int<2>{}), // (32,2) + make_stride(Int<1>{}, Int{}) // idx = lane + k2*32 + ); + + // build the real copy + TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout); + + // this sentence will load + // auto thr_copy = local_partition(tcopy, lane_id); + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + auto smem_vec1d = make_smem_tensor(chunk_local); // 64 x uint128_t + + // only read from global, when it's valid + // or, we fill it with 0 + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + const vec_t* g_vec_base = reinterpret_cast(g_u32); // 16 B each time + + // the src is 64 x 4 = 256, so is dest + auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base), + make_shape(Int{}), // 64 x uint128_t + make_stride(Int<1>{})); + bool g_valid = (chunk_local < valid_chunks); + + auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout); + auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout); + + // now we will load this + auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id); + auto tCs = local_partition(smem_vec2d, tile_layout, lane_id); + + // launch the copy inst. 
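+    // On SM80 the Copy_Atom lowers to cp.async, so the fence / wait<0> pair below is what makes
+    // the asynchronous gmem -> smem copies visible before they are consumed; on SM70/75 the
+    // UniversalCopy fallback is an ordinary synchronous load/store and the guarded fence
+    // compiles away.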
+ if (g_valid) { + // gmem load → smem store: 16B x 2 + copy(tcopy, tCg, tCs); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + cp_async_fence(); + cp_async_wait<0>(); +#endif + } else { + for (int i = 0; i < size(tCs); ++i) { + *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); + } + } + __syncwarp(); // inner-warp sync + } + + __syncthreads(); // sync all warps + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("Stage 1 finish processing\n"); + } +#endif + + // ============== STAGE 2: Compress Leaf to `CHUNKS_PER_BLOCK` chain value ============== + const int pass0_valid = min(CHUNKS_PER_BLOCK / 2, valid_chunks); // pass0 cover [0, CHUNKS_PER_BLOCK / 2] chunks + const int pass1_valid = max(0, valid_chunks - (CHUNKS_PER_BLOCK / 2)); // pass1 cover [CHUNKS_PER_BLOCK / 2, CHUNKS_PER_BLOCK] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; // 8 or 16 + const int parents1 = (pass1_valid + 1) >> 1; // 8 or 16 + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + // 64 chunks - 16 warps, 1 warp -> 4 chunks -> 2pass + // 32 chunks - 8 warps, 1 warp -> 4 chunks -> 2pass + auto do_big_pass = [&](int base /*0 or chunks/2 */, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { +// printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); +// } +// #endif + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + + const unsigned full = __activemask(); + const unsigned submask = (sub==0 ? 0x0000FFFFu : 0xFFFF0000u) & full; + const unsigned mask16 = (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool valid_sub = (sub==0 ? (left_rel < pass_valid) : (right_rel < pass_valid)); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = valid_sub ? &chunk_smem[chunk_local][0] : &chunk_smem[0][0]; + uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr; // FIXME: here has some problem!!! 
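+        // On the FIXME above: when valid_sub is false this call still receives out == nullptr;
+        // if the leaf compressor's writer lane stores the final CV unconditionally it will write
+        // through a null pointer. Either predicate the call on valid_sub or point `out` at a
+        // scratch row instead of nullptr.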
+ +#if defined(DBG_KERNEL) && DBG_KERNEL + if (!valid_sub) { + printf("tile %d warp %d not valid in base %d, pass_valid %d\n", blockIdx.x, threadIdx.x/32, base, pass_valid); + } +#endif + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + out, + submask); + + __syncwarp(submask); // make sure two warps written into `tmp_cv` + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (blockIdx.x == 0 && threadIdx.x == 0) printf("The simd16-lane res: \n"); +// print_cv(out); +// #endif + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + + if (has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + + if (has_right) { + const uint32_t* rcv = &tmp_cv[right][0]; + + // sub==0 half warp participate + const unsigned sub0_mask = 0x0000FFFFu & full; + if ((lane_id >> 4) == 0) { + blake3_parent_cv_simd16(lcv, rcv, &cv_smem[pair_idx][0], sub0_mask); + } + // sub==1 do not involve + } else { + // odd: left -> pair_idx + if ((lane_id >> 4) == 0) { + int li = lane_id & 15; + if (li < 8) cv_smem[pair_idx][li] = lcv[li]; + } + } + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) printf("The 1st chunk merged res: \n"); + print_cv(cv_smem[pair_idx]); +#endif + + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass + + // big-pass 1: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(0, pass0_valid); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass0_valid != 0) { + printf("Stage 2 - pass 1 finish processing\n"); + } +#endif + + // big-pass 2: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(CHUNKS_PER_BLOCK / 2, pass1_valid); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass1_valid != 0) { + printf("Stage 2 - pass 2 finish processing, pass 1: %d\n", pass1_valid); + } +#endif + + __syncthreads(); + + // right now, we have got CHUNKS_PER_BLOCK / 2 chain values + // a warp-reduce to merge. + + // ============== STAGE 3: Block-Reduce ============== + // [32] - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + // the `parent_count` is the indicator + // for 256-8warps-32chunks, the parent count is 16 + if (parents_count > 0) { + // half-warp info + const int sub = lane_id >> 4; // 0/1 -> which sub warp it belong to + const int li = lane_id & 15; // 0..15 + const unsigned full = __activemask(); + const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + + int cur_n = parents_count; + + while (cur_n > 1) { + int pairs = cur_n >> 1; // the merge num this layer will do, for 16 parents, this start at 8 + // warp 0, 0-15: 0 + // warp 0, 16-31: 1 + // warp 1, 0-15: 2 + // warp 1, 16-31: 3 + // .. + // warp 7, 0-15: 14 + // warp 7, 16-31: 15 + int half_id = warp_id * 2 + sub; // half-warp id + + // each half-warp handle one pair: (left=2*half_id, right=left+1) + // only half warps will participate in + if (half_id < pairs) { + int left_idx = (half_id << 1); + int right_idx = left_idx + 1; + const uint32_t* L = &cv_smem[left_idx][0]; + const uint32_t* R = &cv_smem[right_idx][0]; + + // 半 warp SIMD16 合并:结果就地写回 left_idx + blake3_parent_cv_simd16(L, R, &cv_smem[left_idx][0], submask); + } + + __syncthreads(); // make sure all warps could see. 
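+            // Example trace of this reduction loop (comment only): for parents_count = 16
+            // the per-iteration values of cur_n are 16 -> 8 -> 4 -> 2 -> 1.  With an odd
+            // count such as cur_n = 5, pairs = 2 and the leftover cv_smem[4] is copied down
+            // to cv_smem[2] just below, so the next iteration continues with cur_n = 3.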
+ + // odd: up-flow + if (cur_n & 1) { + // 让 warp0 的 sub0 来搬运(li < 8) + if (warp_id == 0 && sub == 0 && li < 8) { + cv_smem[pairs][li] = cv_smem[cur_n - 1][li]; + } + } + + __syncthreads(); // next - level + + cur_n = pairs + (cur_n & 1); + } + + // write out + if (warp_id == 0 && lane_id == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +// #if defined(DBG_KERNEL) && DBG_KERNEL +// print_cv(cv_smem[0]); +// #endif + reinterpret_cast(out)[0] = make_uint4(cv_smem[0][0], cv_smem[0][1], cv_smem[0][2], cv_smem[0][3]); + reinterpret_cast(out)[1] = make_uint4(cv_smem[0][4], cv_smem[0][5], cv_smem[0][6], cv_smem[0][7]); + } + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) + printf("================================ Finishing all in big kernel! ================================\n"); +#endif + +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +__device__ __forceinline__ +void load_cv_s2r_vec(const uint32_t* __restrict__ s, uint32_t dst[8]) { + // 确保 16B 对齐:cv_tile 要 __align__(16),stride=(8+PAD)*4 也要是16的倍数 + const uint4* p = reinterpret_cast(s); + uint4 v0 = p[0]; + uint4 v1 = p[1]; + dst[0]=v0.x; dst[1]=v0.y; dst[2]=v0.z; dst[3]=v0.w; + dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w; +} + +__device__ __forceinline__ +void load_cv_s2r_vec_shared(const uint32_t* __restrict__ s, uint32_t dst[8]) { +#if __CUDA_ARCH__ >= 700 + // 把泛型指针显式转成 shared 地址,然后 ld.shared.v4.u32 两次 + unsigned smem_addr = static_cast(__cvta_generic_to_shared(s)); + uint32_t x0,x1,x2,x3,x4,x5,x6,x7; + asm volatile( + "ld.shared.v4.u32 {%0,%1,%2,%3}, [%8];\n\t" + "ld.shared.v4.u32 {%4,%5,%6,%7}, [%8+16];\n\t" + : "=r"(x0),"=r"(x1),"=r"(x2),"=r"(x3), + "=r"(x4),"=r"(x5),"=r"(x6),"=r"(x7) + : "r"(smem_addr)); + dst[0]=x0; dst[1]=x1; dst[2]=x2; dst[3]=x3; + dst[4]=x4; dst[5]=x5; dst[6]=x6; dst[7]=x7; +#else + // 旧架构退化到标量 + #pragma unroll + for (int j=0;j<8;++j) dst[j] = s[j]; +#endif +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 or 32 KiB each block [CHUNKS_PER_BLOCK] +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / CHUNKS_PER_BLOCK root = 16384 or 32768 roots +// And this tiny kernel is designed to process these 16384 or 32768 root +// For one chain value, takes u32 x 8 = 32B, +// 32KB shared memory could contain 1K chain value +// 16KB shmem could contain 512 chain value +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // dyn SMEM >= TILE_CVS*32B + // regarad as 2D dyn shmem: [TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warp_id = tid / WARP_SIZE; // 0..NUM_WARPS (8 or 16) + const int lane_id = tid % WARP_SIZE; // 0..31 + + // the start of this block. + // each block will process TILE_CVS roots. 
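+    // Worked sizing example (comment only, using the launch parameters chosen in the
+    // host-side wrapper later in this file): a 1 GiB input has 2^20 chunks of 1 KiB;
+    // with CHUNKS_PER_BLOCK = 32 the big kernel emits 32768 block CVs, the first pass
+    // of this kernel (TILE_CVS = 1024) reduces them to 32 CVs, and a second single-block
+    // pass reduces those 32 CVs to the final root CV.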
+ const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + const int tile_n = min(TILE_CVS, N - tile_start); // actual cv count of this block + + // ---------------- Stage 1: coalsced loading to SMEM ---------------- + // each time load 8 KB, [tile_n/NUM_THREADS] times in total. + for (int i = tid; i < tile_n; i += NUM_THREADS) { // notice: i = tid, not start from 0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 16B x 2 + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; // each thread load 16B, 256 x 16=4KB + uint4 v1 = g4[1]; // each thread load 16B, 256 x 16=4KB + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + __syncthreads(); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (tid == 0 && blockIdx.x == 0) { + printf("Block %d root CV for tiny kernel entry:", blockIdx.x); + print_cv(smem); + } +#endif + + // ---------------- Stage 2: each lane merge 4 → 1 (keep the neighbor order) ---------------- + // reduced_n0 = ceil(tile_n / 4) lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; + uint32_t lane_cv[8]; // output of this lane 4-1 root + bool lane_valid = false; + + // start index + int base4 = tid << 2; // tid*4, 0..8 + if (base4 < tile_n) { + // 4-neighbor CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_s2r_vec_shared(s0, a); + + int remain = tile_n - base4; + + // Yazhu: branch-prediction extra-overhead + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_s2r_vec_shared(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_s2r_vec_shared(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_s2r_vec_shared(s3, d); + } + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// // 石锤了:load进来的东西压根不一样 +// if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) { +// for (int i = 0 ; i < 32; i++) { +// if (i == lane_id) { +// printf("The lane-merge input from lane: %d:\n", lane_id); +// print_cv(a, lane_id); +// print_cv(b, lane_id); +// print_cv(c, lane_id); +// print_cv(d, lane_id); +// } +// } +// } +// #endif + + // merge the neighbor + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); // write to lane_cv directly + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); // one buffer + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 4 && lane_valid) { + printf("The lane-reduce-4 output:"); + print_cv(lane_cv, 4); + } +#endif + + // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ---------------- + const int warp_base = warp_id * WARP_SIZE; + const int n = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number + + // this will introduce extra branch-prediction overhead + const unsigned active_mask = __ballot_sync(0xFFFFFFFFu, lane_id < n); + +// #if 
defined(DBG_KERNEL) && DBG_KERNEL +// if (gridDim.x == 1 && lane_valid) { +// printf("The tid: %d\n", tid); +// } +// #endif + + if (lane_id >= n) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + __syncwarp(active_mask); + + // step = 1,2,4,8,16 - warp-reduce + for (int step = 1; step < n; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(active_mask, lane_cv[j], step); + + __syncwarp(active_mask); // 所有 lane 都抓完再继续 + + // only the left-half lane will participant + // 0,2,4,6,8,... step = 1 + // 0,4,8,... step = 2 + // 0,8,.. step = 4 + // 0,16 step = 8 + // 0 step = 16 + const bool do_pair = + ((lane_id & ((step << 1) - 1)) == 0) && // 只允许每 2*step 组的第一个 lane 合并 + ((lane_id ^ step) < n); // 右邻确实在活跃集合内 + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x < 32) printf("lane %d do pair? %d\n", lane_id, do_pair); +// #endif + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) { +// printf("left on step %d:", step); +// print_cv(lane_cv, 16); +// printf("right on step %d:", step); +// print_cv(nbr, 16); +// } +// #endif + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); // does not require shared memory + } + __syncwarp(active_mask); + } + + // lane0;NUM_WARPS warp-root write to SMEM + // e.g. if this block has 8 warp, there will be 8 warp-root + __shared__ uint32_t warp_roots[NUM_WARPS][8]; // NUM_WARPS × 8 + if (lane_id == 0 && n > 0) { + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("The warp-reduce output:"); + print_cv(warp_roots[0]); + } +#endif + + // ---------------- Stage 4: CTA's NUM_WARPS → 1 block reduce ---------------- + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..NUM_WARPS + if (valid_warps == 0) return; + + + // 16 lane compute the merge together + const int sub = lane_id >> 4; // 0/1 + const int li = lane_id & 15; // 0..15 + const unsigned full = __activemask(); + const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + const int half_id = warp_id * 2 + sub; // half-warp index + int cur_n = valid_warps; + + while (cur_n > 1) { + const int pairs = cur_n >> 1; // the pair count. + + if (half_id < pairs) { + const int left_idx = (half_id << 1); + const int right_idx = left_idx + 1; + + const uint32_t* L = &warp_roots[left_idx][0]; + const uint32_t* R = &warp_roots[right_idx][0]; + + blake3_parent_cv_simd16(L, R, &warp_roots[left_idx][0], submask); + } + + __syncthreads(); + + // if odd + if (cur_n & 1) { + if (warp_id == 0 && sub == 0 && li < 8) { + warp_roots[pairs][li] = warp_roots[cur_n - 1][li]; + } + } + + __syncthreads(); // next-level + cur_n = pairs + (cur_n & 1); // + } +#if defined(DBG_KERNEL) && DBG_KERNEL + if (tid == 0 && blockIdx.x == 0) { + printf("Block %d root CV for tiny kernel:", blockIdx.x); + print_cv(warp_roots[0]); + } +#endif + + // ---------------- Stage 5: write to output ---------------- + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("================================ Finishing all in tiny kernel! 
================================\n");
+    }
+#endif
+
+}
+
+inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
+    const uint32_t zero_block[16] = {0};
+    uint32_t st[16];
+    blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st);
+    for (int i = 0; i < 8; ++i) {
+        uint32_t w = st[i];
+        out32[4*i+0] = (uint8_t)( w        & 0xFF);
+        out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF);
+        out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF);
+        out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF);
+    }
+}
+
+// host-side wrapper: launches the big kernel, then reduces the per-block CVs to the root CV
+void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
+                                   uint64_t bytes_len,
+                                   std::array<uint32_t, 8>* root_out = nullptr,
+                                   cudaStream_t stream = 0) {
+    if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) {
+        fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n",
+                (unsigned long long)bytes_len);
+        std::abort();
+    }
+
+    int optin = 0, deflt = 0;
+    cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
+    cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
+
+    constexpr int CHUNKS_PER_BLOCK = 32;                        // chunks hashed per thread block (64 also supported)
+    constexpr int CHUNK_SIZE = 1024;                            // 1 KiB
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;             // 256
+    constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64;    // big-kernel block size: 256 threads for 32 chunks (512 for 64)
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 8 warps here (16 when CHUNKS_PER_BLOCK = 64)
+    const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk
+    const uint64_t total_chunks = bytes_len / CHUNK_SIZE;
+    const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // e.g. 32768 blocks for 1 GiB at 32 chunks/block (16384 at 64)
+
+    constexpr int pad_chunk = 16;
+    constexpr int pad_cv = 0;
+
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_block_reduce_kernel,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 100));
+
+    const auto* d_words = reinterpret_cast<const uint32_t*>(d_data); // alias
+    uint32_t* d_blockCV = nullptr;                                   // num_blocks × 8 u32
+
+    // largest remaining bottleneck: the scratch allocation below; ideally the caller
+    // (e.g. the PyTorch side) would own this buffer so the per-call cudaMalloc/cudaFree goes away.
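+    // One possible way to do that (comment-only sketch; it assumes this file is built as a
+    // torch extension and the tensor name is illustrative): let the caching allocator own the
+    // scratch buffer instead of calling cudaMalloc here, e.g.
+    //   auto block_cvs = torch::empty({num_blocks, 8},
+    //       torch::dtype(torch::kInt32).device(torch::kCUDA));
+    //   d_blockCV = reinterpret_cast<uint32_t*>(block_cvs.data_ptr<int32_t>());
+    // which removes the per-call cudaMalloc/cudaFree from the timed path.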
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB, 1M for 32-size + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 or 32768 output reduce -> 8 + + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int dyn_smem = 32 * 1024; // 32KiB + const int N = num_blocks; // total root-cv number + constexpr int TILE = 1024; + const int grid = (N + TILE - 1) / TILE; // 32 KB / 1024 = 32 + constexpr int NUM_THREADS = 256; + const size_t smem_bytes = (size_t)(TILE) * (8u + pad_cv) * sizeof(uint32_t); // 1024 x 32 = 32 KiB + + // decide upon compiling time + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + cudaMalloc(&d_mid_out, (size_t)grid * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int dyn_smem = 1024; // 1 KiB will enough + constexpr int TILE = 1024; // any >= N + const int N = (num_blocks + TILE - 1) / TILE; // 32 + constexpr int grid = 1; + constexpr int NUM_THREADS = 32; // 32 threads will be enough + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + // decide upon compiling time + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); // this memory is managed by torch, we do not free it. 
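+    // Illustrative host-side usage of this wrapper (comment only; the buffer and size
+    // names below are hypothetical):
+    //   uint8_t* d_buf = nullptr;
+    //   CUDA_CHECK(cudaMalloc(&d_buf, bytes));                    // bytes % 1024 == 0
+    //   CUDA_CHECK(cudaMemcpy(d_buf, h_data, bytes, cudaMemcpyHostToDevice));
+    //   std::array<uint32_t, 8> root{};
+    //   blake3_block_reduce_sm70_sm80(d_buf, bytes, &root);
+    //   CUDA_CHECK(cudaFree(d_buf));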
+} \ No newline at end of file diff --git a/csrc/utils.cuh b/csrc/utils.cuh new file mode 100644 index 0000000..c0311b8 --- /dev/null +++ b/csrc/utils.cuh @@ -0,0 +1,455 @@ +#include +#pragma once + +#define WARP_SIZE 32 +#define G(a,b,c,d, x, y) \ + do { \ + (a) = (a) + (b) + (x); \ + (d) = rotr32((d) ^ (a),16); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c),12); \ + (a) = (a) + (b) + (y); \ + (d) = rotr32((d) ^ (a), 8); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c), 7); \ + } while (0) + +// host - side definition +inline constexpr uint32_t BLAKE3_IV_HOST[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +__device__ __constant__ uint32_t BLAKE3_IV_DEV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +inline constexpr int B3_PERMUTE_HOST[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8 +}; + +__device__ __constant__ int B3_PERMUTE_DEV[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8 +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__device__ void print_cv(uint32_t cv[8], int tgt_tid = 0) { + if (blockIdx.x == 0 && threadIdx.x == tgt_tid) { + auto get_byte = [&](int i) { + int w = i >> 2; // 第 i 个字节来自第 w 个 u32 + int off = (i & 3) * 8; // 在该 u32 中的偏移 + return (unsigned)((cv[w] >> off) & 0xFFu); + }; + + printf("block %d root CV (u32, little-endian words):", blockIdx.x); + for (int i = 0; i < 32; ++i) { + printf("%02x", get_byte(i)); + if ((i & 3) == 3) printf(" "); // 每 4 字节空格 + } + printf("\n"); + } +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. 
(see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + + +// =============== Leaf 16-lane compressing =============== +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV_DEV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + uint32_t v = + (li < 8) ? cv_word : + (li < 12) ? BLAKE3_IV_DEV[(li - 8) & 3] : + (li == 12) ? t0 : + (li == 13) ? 
t1 :
+            (li == 14) ? block_len : flags;
+
+        // Write the index as li = q + 4*rq; the diagonal step first applies the
+        // permutation li' = (rq<<2) | ((q+rq)&3)
+        int q  = (li & 3);
+        int rq = (li >> 2);
+        int li_diag = (rq << 2) | ((q + rq) & 3);
+        int li_undo = (rq << 2) | ((q - rq) & 3);
+        int gi_col  = q;              // 0..3
+        int gi_diag = (li_diag & 3);  // 0..3
+
+        // ===== 7 rounds =====
+        #pragma unroll 4
+        for (int r = 0; r < 7; ++r) {
+            // inside this loop each lane does the work of one a/b/c/d role;
+            // the 16 lanes together evaluate the 4 column G's and 4 diagonal G's
+            // that a sequential implementation would run as 8 G calls per round
+
+            // ---- column step (quartets: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}) ----
+            {
+                // fetch b/c/d of the same quartet (based on the current v)
+                uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16);
+
+                // this quartet's index is in {0,1,2,3}; the column step consumes
+                // message indices 0..7, two per quartet
+
+                uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16);
+
+                v = G_update_role(v, vb, vc, vd, mx, my, role);
+            }
+
+            // ---- diagonal step ----
+            {
+                // read the current v value in the diagonally permuted domain
+                uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16);
+
+                // within that domain, fetch the same quartet neighbours as in the column step
+                uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16);
+
+                // the 4 diagonal G's use the second half of this round's message pairs (indices 8..15)
+
+                uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16);
+
+                uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role);
+
+                // undo the permutation back to the original position: li_undo = (rq<<2) | ((q - rq) & 3)
+
+                // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16);
+                v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16);
+            }
+        } // 7 rounds end
+
+        // derive the new CV: cv[i] = v[i] ^ v[i+8] (only lanes li = 0..7 keep the result)
+        uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16);
+        if (li < 8) {
+            cv_word = v ^ vip8_all;
+        }
+
+        // continue with the next block (the 16 blocks of a chunk are processed serially here)
+    }
+
+    // lane 0 / lane 16 collect the 8 x u32 output
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // all 16 lanes must take part in the shuffle
+        if (li == 0 && out_cv) out_cv[j] = wj;             // only lane 0 of the half-warp writes out
+    }
+}
+
+// =============== Parent 16-lane compressing ===============
+__device__ __forceinline__
+void blake3_parent_cv_simd16(const uint32_t* __restrict__ L,  // 8 x u32
+                             const uint32_t* __restrict__ R,  // 8 x u32
+                             uint32_t* __restrict__ out_cv,   // 8 x u32
+                             unsigned mask16)                 // half-warp mask for the 16 lanes
+{
+    const int lane = threadIdx.x & 31;
+    const int li   = lane & 15;   // 0..15, position within the half-warp
+    const int role = li & 3;
+
+    // message: the first 8 words come from L, the last 8 from R
+    const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8];
+
+    // initialize v
+
+    const uint32_t t0 = 0u;
+    const uint32_t t1 = 0u;
+    const uint32_t block_len = 64u;
+    const uint32_t flags = FLAG_PARENT;
+
+    uint32_t iv_val = BLAKE3_IV_DEV[li & 7];
+
+    uint32_t v =
+        (li < 12) ? iv_val :
+        (li == 12) ? t0 :
+        (li == 13) ? t1 :
+        (li == 14) ?
block_len : flags; + + // 与 leaf 相同的“列/对角”两步、共 7 轮 + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; + int gi_diag = (li_diag & 3); + + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // 列步 + { + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + // 对角步 + { + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + v = __shfl_sync(mask16, v_diag_new, li_undo, 16); + } + } + + // state -> CV:cv[i] = v[i] ^ v[i+8] + uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); + uint32_t cv_word = (li < 8) ? (v ^ vip8) : 0; + + // 半 warp 汇聚到 out_cv[0..7](仅 li==0 的 4×收集也可以) + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); + if (li == 0) out_cv[j] = wj; + } +} + +// =============== Parent 1-lane compressing =============== +// the actually right compress 7r in a single lane function +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // output +{ + // 1) initialize v + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; + +#if defined(__CUDA_ARCH__) + // device-side call + const uint32_t* IV = BLAKE3_IV_DEV; +#else + // host-side call + const uint32_t* IV = BLAKE3_IV_HOST; +#endif + + uint32_t v8 =IV[0], v9 =IV[1], v10=IV[2], v11=IV[3]; + // injection + uint32_t v12=(uint32_t)chunk_counter, v13=(uint32_t)(chunk_counter >> 32), v14=block_len, v15=flags; + + // 2) 7 轮 + int perm[16]; // 每轮的消息索引 + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = i; + + #pragma unroll + for (int r=0; r < 7; ++r) { + // col-step + G(v0, v4, v8, v12, block_words[perm[0]], block_words[perm[1]]); + G(v1, v5, v9, v13, block_words[perm[2]], block_words[perm[3]]); + G(v2, v6, v10,v14, block_words[perm[4]], block_words[perm[5]]); + G(v3, v7, v11,v15, block_words[perm[6]], block_words[perm[7]]); + + // diag-step + G(v0, v5, v10,v15, block_words[perm[8]], block_words[perm[9]]); + G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]); + G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]); + G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]); + + // perm = perm ∘ PERMUTE + int np[16]; +#if defined (__CUDA_ARCH__) + const int* PERM_T = B3_PERMUTE_DEV; +#else + const int* PERM_T = B3_PERMUTE_HOST; +#endif + #pragma unroll + for (int i = 0; i < 16; ++i) + np[i] = perm[PERM_T[i]]; + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = np[i]; + } + + // 3) write to out state + out_state[ 0]=v0; out_state[ 1]=v1; out_state[ 2]=v2; out_state[ 3]=v3; + out_state[ 4]=v4; out_state[ 5]=v5; out_state[ 6]=v6; 
out_state[ 7]=v7; + out_state[ 8]=v8; out_state[ 9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15; +} + +// from out_state yields CV +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV_DEV, 0ull, 64u, FLAG_PARENT, st); + blake3_state_to_cv(st, out_cv); +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b62068c..68e8d6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ pandas==2.3.3 pip-chill==1.0.3 pybind11==3.0.1 tomli==2.0.1 +torch +blake3 \ No newline at end of file diff --git a/setup.py b/setup.py index eb3a2e4..a45ea14 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,158 @@ +# setup.py from setuptools import setup -from pybind11.setup_helpers import Pybind11Extension, build_ext -import sys -import sysconfig - -import numpy as np - -cxx_std = 17 -extra_compile_args = [] -extra_link_args = [] - -# linux environment, default -extra_compile_args += ["-O2", "-ffast-math", "-march=native", "-fopenmp", "-Wall", "-Wextra", "-Wpedantic", "-mavx2", "-mfma"] -extra_link_args += ["-fopenmp"] - -ext_modules = [ - Pybind11Extension( - "flashashing", - sources=[ - "csrc/sha256_base.cpp", - "csrc/sha256_simd.cpp", - "csrc/blake3_base.cpp", - "csrc/binding.cpp" - ], - include_dirs=[np.get_include()], - cxx_std=cxx_std, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, +from setuptools.command.build_ext import build_ext +import pybind11, numpy as np +import sys, os, shutil +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +def find_in_path(name, path): + for d in path.split(os.pathsep): + p = os.path.join(d, name) + if os.path.exists(p): + return os.path.abspath(p) + return None + +def locate_cuda(): + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + nvcc = None + if cuda_home: + nvcc = os.path.join(cuda_home, "bin", "nvcc") + else: + nvcc = find_in_path("nvcc", os.environ.get("PATH", "")) + if nvcc: + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + if not nvcc or not os.path.exists(nvcc): + raise RuntimeError("nvcc not found. 
Set CUDA_HOME/CUDA_PATH or add nvcc to PATH.") + cuda_include = os.path.join(cuda_home, "include") + if sys.platform.startswith("win"): + cuda_libdir = os.path.join(cuda_home, "lib", "x64") + else: + cuda_libdir = os.path.join(cuda_home, "lib64") + return {"home": cuda_home, "nvcc": nvcc, "include": cuda_include, "libdir": cuda_libdir} + +def locate_cutlass_cute(): + """ + 返回 CUTLASS/CuTe 的 include 根目录(要求该目录下有 cutlass/ 与 cute/)。 + 优先使用环境变量 CUTLASS_HOME 或 CUTE_HOME。 + """ + cand = [] + if "CUTE_HOME" in os.environ: + cand.append(os.path.join(os.environ["CUTE_HOME"], "include")) + cand.append(os.environ["CUTE_HOME"]) + if "CUTLASS_HOME" in os.environ: + cand.append(os.path.join(os.environ["CUTLASS_HOME"], "include")) + cand.append(os.environ["CUTLASS_HOME"]) + + cand += [ + "/usr/local/include", # linux 常见 + os.path.expanduser("~/cutlass/include"), # clone 到家目录 + os.path.expanduser("~/CUTLASS/include"), + os.path.expanduser("~/third_party/cutlass/include"), + os.path.abspath("third_party/cutlass/include"), + ] + + def ok(p): + return p and os.path.isdir(p) and \ + os.path.isdir(os.path.join(p, "cute")) # CuTe 在 cutlass/include/cute + + for p in cand: + if ok(p): + return os.path.abspath(p) + + raise RuntimeError( + "Cannot find CUTLASS/CuTe include path.\n" + "Set CUTLASS_HOME or CUTE_HOME to the repository root (the path that contains 'include/cute')." ) + +CUDA = locate_cuda() +CUTLASS_INCLUDE = locate_cutlass_cute() + +print(f"Found cutlass include: {CUTLASS_INCLUDE}") + +CXX_STD = 17 + +arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;89").split(";") +NVCC_ARCH_FLAGS = [] +for a in arch_list: + a = a.strip() + if a: + NVCC_ARCH_FLAGS += ["-gencode", f"arch=compute_{a},code=sm_{a}"] + +COMMON_DEFINES = [] +COMMON_INCLUDES = [ + np.get_include(), + pybind11.get_include(), + pybind11.get_include(user=True), + CUDA["include"], + CUTLASS_INCLUDE, +] +COMMON_LIB_DIRS = [CUDA["libdir"]] +COMMON_LIBS = ["cudart"] +RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] + +debug = os.environ.get("DBG_KERNEL", "0") == "1" + +CXX_FLAGS = [ + f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" +] if not debug else [ + f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma", "-DDBG_KERNEL=1" +] +LINK_FLAGS = [] + +# OpenMP +if sys.platform.startswith("win"): + # MSVC + CXX_FLAGS += ["/openmp"] +else: + CXX_FLAGS += ["-fopenmp"] + LINK_FLAGS += ["-fopenmp"] + +NVCC_FLAGS = [ + f"-std=c++{CXX_STD}", + "-O3", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + "-lineinfo", +] + NVCC_ARCH_FLAGS if not debug else [ + f"-std=c++{CXX_STD}", + "-g", "-O0", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + # "-G", + "-lineinfo", + "-DDBG_KERNEL=1", +] + NVCC_ARCH_FLAGS + +if not sys.platform.startswith("win"): + NVCC_FLAGS += ["-Xcompiler", "-fopenmp"] +else: + # MSVC 的 NVCC 透传 + NVCC_FLAGS += ["-Xcompiler", "/openmp", "-Xcompiler", "/MD", "-Xcompiler", "/O2"] + +# ---------- 扩展模块 ---------- +sources = [ + "csrc/sha256_base.cpp", + "csrc/sha256_simd.cpp", + "csrc/blake3_base.cpp", + "csrc/blake3_sm70_sm80.cu", + "csrc/binding.cpp", ] setup( name="flashashing", - version="0.1.0", - description="High performance hashing (SHA-256, BLAKE3) implementation with pybind11", - ext_modules=ext_modules, - cmdclass={"build_ext": build_ext}, - zip_safe=False, + ext_modules=[ + CUDAExtension( + 
"flashashing", + sources=sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + include_dirs=COMMON_INCLUDES, + library_dirs=COMMON_LIB_DIRS, + libraries=COMMON_LIBS, + extra_link_args=LINK_FLAGS + (["-Wl,-rpath," + RPATH[0]] if RPATH else []), + ) + ], + cmdclass={"build_ext": BuildExtension}, )