diff --git a/.gitignore b/.gitignore index aebc9d7..47c25b7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ run.sh +profile.sh **.__pycache__ @@ -6,4 +7,11 @@ run.sh build/ dist/ -*.egg-info/ \ No newline at end of file +*.egg-info/ + +*.ncu-rep +*.nsys-rep + +*.ptx + +*.o \ No newline at end of file diff --git a/README.md b/README.md index ef873a4..88cb978 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ Proposal (from Google Drive): https://docs.google.com/document/d/10B3_nT8xF49vLg conda create -n ceg5206 python=3.12 pip install -r requirements.txt +# clone the cutlass repository +git clone https://github.com/NVIDIA/cutlass.git ~/cutlass --depth 1 + +export FLASHASHING_CUDA_ARCH_LIST= # your arch here + +# install the cpp source file python setup.py install @@ -19,4 +24,15 @@ python benchmark/test_script.py + 10.3: Implement `SHA256-SIMD` version, still some flaws: only allow short string processing (`len < 55`) + 10.2: Consultation with doctor, invite to repo, add `BLAKE3` basic implementations -+ 10.1: Provide basic template for project, implementing basic sha256 in C++. \ No newline at end of file ++ 10.1: Provide basic template for project, implementing basic sha256 in C++. + +# GPU kernel performance on file-compression hashing + +> These logs were measured on an RTX 4090 Laptop GPU, where compute and memory bandwidth are limited; a later section compares performance across machines. + ++ 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s ++ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s ++ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (incorrect result) ++ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s ++ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s ++ 10.8 - v6 - [commit:]: 15145.81 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v1.cu b/backup_deprecated/blake3_sm70_v1.cu new file mode 100644 index 0000000..867b6af --- /dev/null +++ b/backup_deprecated/blake3_sm70_v1.cu @@ -0,0 +1,800 @@ + +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <array> + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast<uint4*>(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- small helpers ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host path +#endif +} + +#define B3_G(a,b,c,d, mx, my) \ + do { \ + a = a + b + (mx); \ + d = rotr32(d ^ a, 16); \ + c = c + d; \ + b = rotr32(b ^ c, 12); \ + a = a + b + (my); \ + d = rotr32(d ^ a, 8); \ + c = c + d; \ + b = rotr32(b ^ c, 7); \ + } while (0) + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast<const uint4*>(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: 
P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // v[0..7]=cv;v[8..11]=IV;v[12..15]=IV^t0^t1^b^flags + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; + + uint32_t v8 = BLAKE3_IV[0], v9 = BLAKE3_IV[1]; + uint32_t v10= BLAKE3_IV[2], v11 = BLAKE3_IV[3]; + uint32_t v12= BLAKE3_IV[4] ^ (uint32_t)chunk_counter; + uint32_t v13= BLAKE3_IV[5] ^ (uint32_t)(chunk_counter >> 32); + uint32_t v14= BLAKE3_IV[6] ^ block_len; + uint32_t v15= BLAKE3_IV[7] ^ flags; + + // 7 轮 +#pragma unroll 1 + for (int r = 0; r < 7; ++r) { + // 取本轮 16 个消息字(按 BLAKE3 调度表) + const uint8_t* SR = B3_MSG_SCHEDULE[r]; // 0..15 + const uint32_t m0 = block_words[SR[0]], m1 = block_words[SR[1]]; + const uint32_t m2 = block_words[SR[2]], m3 = block_words[SR[3]]; + const uint32_t m4 = block_words[SR[4]], m5 = block_words[SR[5]]; + const uint32_t m6 = block_words[SR[6]], m7 = block_words[SR[7]]; + const uint32_t m8 = block_words[SR[8]], m9 = block_words[SR[9]]; + const uint32_t m10 = block_words[SR[10]], m11 = block_words[SR[11]]; + const uint32_t m12 = block_words[SR[12]], m13 = block_words[SR[13]]; + const uint32_t m14 = block_words[SR[14]], m15 = block_words[SR[15]]; + + // 列步:四个 G + B3_G(v0, v4, v8, v12, m0, m1); + B3_G(v1, v5, v9, v13, m2, m3); + B3_G(v2, v6, v10, v14, m4, m5); + B3_G(v3, v7, v11, v15, m6, m7); + + // 对角步:四个 G + B3_G(v0, v5, v10, v15, m8, m9); + B3_G(v1, v6, v11, v12, m10, m11); + B3_G(v2, v7, v8, v13, m12, m13); + B3_G(v3, v4, v9, v14, m14, m15); + } + + // 输出 16×u32 状态(后续由调用者做 state_to_cv) + out_state[0]=v0; out_state[1]=v1; out_state[2]=v2; out_state[3]=v3; + out_state[4]=v4; out_state[5]=v5; out_state[6]=v6; out_state[7]=v7; + out_state[8]=v8; out_state[9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12;out_state[13]=v13;out_state[14]=v14; out_state[15]=v15; +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. 
the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. + uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { + const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); + }; + + uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; + bool have_pass0 = false, have_pass1 = false; + + // ------- STAGE 2 pass = 0 ------- + { + const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 + const int right = left + 1; // neighbor: 1,3,5,...,31 + const int pair_idx = left >> 1; // 0..15 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && left < pass0_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && right < pass0_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // merge two neighbor + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && left < pass0_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + // ---- STAGE 2 pass 1 ---- + { + const int left = 32 + (warp_id << 1); // 32,34,...,62 + const int right = left + 1; // 33,35,...,63 + const int pair_idx = left >> 1; // 16..31 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && (left - 32) < pass1_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && (right - 32) < pass1_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // TODO: here we may have some issue: overflow and border situation + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && (left - 32) < pass1_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = 
__shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // write to the right position + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + __syncthreads(); + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
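+    // Note on the vectorized copy below: the uint4 loads require `g` to be 16-byte
+    // aligned. Here in_cv32 comes from cudaMalloc (which returns at least 256-byte
+    // aligned pointers) and each CV is 8 u32 = 32 B, so every CV row starts on a
+    // 32 B boundary and the two 16 B loads per CV are safe; a byte-addressed input
+    // would need a scalar fallback.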
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 0; + constexpr int pad_cv = 0; + + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
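+    // A possible follow-up for the TODO below (sketch only, not wired in): use the
+    // stream-ordered allocator instead of a blocking cudaMalloc per call, e.g.
+    //   CUDA_CHECK(cudaMallocAsync(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t), stream));
+    //   ... launch the kernels on `stream` ...
+    //   CUDA_CHECK(cudaFreeAsync(d_blockCV, stream));
+    // cudaMallocAsync/cudaFreeAsync need CUDA 11.2+; alternatively the caller
+    // (e.g. PyTorch, as noted above) can pre-allocate this buffer and pass it in.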
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v2.cu b/backup_deprecated/blake3_sm70_v2.cu new file mode 100644 index 0000000..e61b85b --- /dev/null +++ b/backup_deprecated/blake3_sm70_v2.cu @@ -0,0 +1,943 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + 
FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. 
li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? 
flags : 0u; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
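+    // With the template arguments used by the host wrapper in this file
+    // (CHUNKS_PER_BLOCK = 32, PAD_CHUNK = 16), the array below is actually
+    // 32 x (256 + 16) x 4 B = 34 KiB; the 64 KiB figure above is carried over
+    // from the 64-chunk, unpadded v1 layout.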
+ __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. + uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... 
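+        // Pairing scheme for this pass: each warp owns the chunk pair (left, right).
+        // Lanes 0-15 (sub-warp 0) run the 16-lane SIMD leaf routine on `left`,
+        // lanes 16-31 (sub-warp 1) run it on `right`, and lane 0 then folds the two
+        // leaf CVs into one parent CV stored in cv_smem.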
+ const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); + } + + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + uint32_t parent[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly + #pragma unroll + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; + } + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass + + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); + + // if (bx == 0) printf("Finish 1 big pass\n"); + + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); + + // if (bx == 0) printf("Finish 2 big pass\n"); + + __syncthreads(); + + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. 
+ + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr int pad_cv = 0; + + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest 
bottleneck, do not allocate gpu memory here, do it in pytorch. + + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/backup_deprecated/blake3_sm80_v1.cu b/backup_deprecated/blake3_sm80_v1.cu new file mode 100644 index 0000000..5f08a97 --- /dev/null +++ b/backup_deprecated/blake3_sm80_v1.cu @@ -0,0 +1,454 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 
0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> 16 x 4 bytes = 64 B = 4 x 16 bytes = 4 x 128 bits + const uint32_t cv[8], // 8×u32 + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============ +// Each block has 512 threads +// 1 warp process 32 chunk -> 32 KiB +// NUM_WARPS = 512 / 32 = 16 +// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 16 = 512 B shared memory in sum + __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflict + + // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1 + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id; + const uint64_t chunk_counter = base_chunk_counter + global_warp_id; + + // index + const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the start of this warp + // each thread process one chunk + const uint64_t chunk_idx = warp_chunk_base + lane_id; + + // edge processing + int valid = total_chunks - warp_chunk_base; + if (valid <= 0) return; // TODO: will this affect warp shfl? 
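+            // Note on the TODO above: `valid` is derived from `warp_chunk_base`, which is the
+            // same for every lane of the warp, so this early return is warp-uniform: either the
+            // whole warp exits or no lane does. The later __ballot_sync / __shfl_down_sync calls
+            // therefore still execute with all 32 lanes present.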
+ if (valid > WARP_SIZE) valid = WARP_SIZE; + + // compute idx for this thread + const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + + uint32_t cv[8] = {0}; // 8 x u32 + bool active = lane_id < valid; + if (active) { + const uint64_t chunk_counter = base_chunk_counter + chunk_idx; + blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + } + + // Stage 2: merging to parent, in O(logN) depth + + // take care: we cannot use general reduce + // 0-1-2-3-4-...-31, keep this sequential + unsigned mask = __ballot_sync(0xFFFFFFFFu, active); + // step = 1,2,4,8,16 + for (int step = 1; step < WARP_SIZE; step <<= 1) { + int partner_lane = lane_id + step; + + // neighbor cv + uint32_t neighbor_cv[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + } + + // the left be parent, and make sure `the right` is valid + if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { + blake3_parent_cv(cv, neighbor_cv, cv); + } + __syncwarp(mask); + + // in the next level, reduce half of active threads + if (lane_id >= (valid & ~(step))) + active = false; + } + + // now, lane 0 holds the root + if (lane_id == 0) { +#pragma unroll + for (int j = 0 ; j < 8; ++j) + cv_smem[warp_id][j] = cv[j]; + } + __syncthreads(); + + // after all these things, we have finished + // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge + // and now, we are going to implement higher-level merge + // we have 16 warps, each warp has a root cv + // so we are going to execute another logN steps + + // 16 -> 8 -> 4 -> 2 -> 1 + for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); +#pragma unroll + for (int j=0;j<8;++j) + cv_smem[warp_id][j] = p[j]; // write back to shared memory + } + __syncthreads(); + } + + // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep + if (tid == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv_smem[0][j]; + } + +} // blake3_block_reduce_kernel + + +// ============ Tiny kernel ============ +// In big kernel, it will consume 512 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root +// And this tiny kernel is designed to process these 2048 root +template +__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int n) { + extern __shared__ uint32_t smem[]; // dynamic shared memory + uint32_t* tile = smem; // -> [tile_n][8] + + const int tid = threadIdx.x; + const int b = blockIdx.x; + const int B = gridDim.x; + + const int start = (int)((1ll * n * b) / B); + const int end = (int)((1ll * n * (b+1)) / B); + int tile_n = end - start; + + if (tile_n <= 0) return; // border + + const int words = tile_n * 8; + for (int w = tid; w < words; w += NUM_THREADS) { + tile[w] = in_cv32[start * 8 + w]; + } + __syncthreads(); + + int cur = tile_n; + while (cur > 1) { + const int pairs = cur >> 1; // floor(cur/2) + // process pairs + for (int i = tid; i < pairs; i += NUM_THREADS) { + const uint32_t* L = &tile[(2*i) * 8]; + const uint32_t* R = &tile[(2*i+1) * 8]; + uint32_t p[8]; + blake3_parent_cv(L, R, p); +#pragma unroll + for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + } + __syncthreads(); + + // even situation: + if ((cur & 1) && tid == 0) { + uint32_t* dst = &tile[pairs * 8]; + uint32_t* src = &tile[(cur - 1) * 8]; +#pragma unroll + for (int j=0;j<8;++j) + dst[j] = src[j]; + } + __syncthreads(); + + cur = pairs + (cur & 1); + } + + // write output + if (tid == 0) { + uint32_t* out = &out_cv32[b * 8]; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = tile[j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + + uint8_t* d_bytes = nullptr; + uint32_t* d_words = nullptr; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // TODO: use thrust + cudaMalloc(&d_bytes, bytes_len); + cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); + d_words = 
reinterpret_cast(d_bytes); + + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + + // launch big kernel + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + const int B = (num_blocks >= 8) ? 8 : num_blocks; + uint32_t* d_midCV = nullptr; + cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); + + { + dim3 grid(B); + dim3 block(512); // 你指定 “每个 block 512 线程” + // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 + const int tile = (num_blocks + B - 1) / B; + const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); + + blake3_pair_reduce_kernel<512> + <<>>(d_blockCV, d_midCV, num_blocks); + CUDA_CHECK(cudaGetLastError()); + } + + // second round + uint32_t* d_root = nullptr; + cudaMalloc(&d_root, 8u * sizeof(uint32_t)); + + { + dim3 grid(1); + dim3 block(B); + const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); + + // generate kernel during compile time + switch (B) { + case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; + case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; + case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; + case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; + case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; + case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; + case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; + default: { + blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); + } break; + } + CUDA_CHECK(cudaGetLastError()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_root)); + CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/backup_deprecated/blake3_sm80_v3.cu b/backup_deprecated/blake3_sm80_v3.cu new file mode 100644 index 0000000..fdf1716 --- /dev/null +++ b/backup_deprecated/blake3_sm80_v3.cu @@ -0,0 +1,994 @@ + +#include "cute/numeric/int.hpp" +#include +#include +#include +#include + +#include +#include +#include + +#if __CUDA_ARCH__ >= 800 + #include +#endif + +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + 
fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + + +using namespace cute; + +using vec_t = cute::uint128_t; // one time loading 16 B + +#if __CUDA_ARCH__ >= 800 + // SM80 branch + using Atom = cute::Copy_Atom, vec_t>; +#else + // Volta/Turing(SM70/75)general copy + using Atom = cute::Copy_Atom, vec_t>; + // 也可用更保守的:using Atom = cute::Copy_Atom, vec_t>; +#endif + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. 
(see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? 
t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? flags : 0u; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + + + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template 32 KB + const int PAD_CHUNK=16, + const int PAD_CV=0> // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + constexpr int WORDS_PER_CHUNK = 
CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4, 16B + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // 4 bytes x 256 x 64 = 64 KiB shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK]; // [64][256+PAD] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + // use CuTe for loading + + auto make_smem_tensor = [&](int chunk_local /* The chunk id on shared memory */) { + // smem index:chunk_smem[chunk_local][0] + // length: (WORDS_PER_CHUNK + PAD_CHUNK) + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + vec_t* s_vec = reinterpret_cast(s_u32); // each element is 16B + return make_tensor(make_smem_ptr(s_vec), + make_shape(Int{}), // 64 x 4 = 256, one row of shared memory + make_stride(Int<1>{})); // continuous + }; + + // shared memory 256 items = 64 x vec_t(4) + // thread layout: 32 threads + // thread 0: 0, 32 + // thread 1: 1, 33 + // thread 32: 31,63 + // so we load data in coalsced mode + using Atom = cute::Copy_Atom, vec_t>; + + // thread layout. (lane,0)->idx=2*lane,(lane,1)->idx=2*lane+1 + auto thr_layout = make_layout( + make_shape(Int<2>{}), + make_stride(Int<1>{}) + ); + + auto tile_layout = make_layout( + make_shape(Int{}, Int<2>{}), // (32,2) + make_stride(Int<1>{}, Int{}) // idx = lane + k2*32 + ); + + // build the real copy + TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout); + + // this sentence will load + // auto thr_copy = local_partition(tcopy, lane_id); + + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + auto smem_vec1d = make_smem_tensor(chunk_local); // 64 x uint128_t + + // only read from global, when it's valid + // or, we fill it with 0 + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + const vec_t* g_vec_base = reinterpret_cast(g_u32); // 16 B each time + + // the src is 64 x 4 = 256, so is dest + auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base), + make_shape(Int{}), // 64 x uint128_t + make_stride(Int<1>{})); + bool g_valid = (chunk_local < valid_chunks); + + auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout); + auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout); + + // now we will load this + auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id); + auto tCs = local_partition(smem_vec2d, tile_layout, lane_id); + + // launch the copy inst. 
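+        // Predicated copy: in-range chunks go through the 16-byte tiled copy (gmem -> smem),
+        // while chunks past the end of the input are zero-filled so the compression stage
+        // never reads uninitialized shared memory.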
+ if (g_valid) { + // gmem load → smem store:twice 16B + copy(tcopy, tCg, tCs); + } else { + for (int i = 0; i < size(tCs); ++i) { + *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); + } + } + } + cp_async_fence(); + cp_async_wait<0>(); + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); + } + + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + uint32_t parent[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly + #pragma unroll + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; + } + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass + + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); + + // if (bx == 0) printf("Finish 1 big pass\n"); + + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); + + // if (bx == 0) printf("Finish 2 big pass\n"); + + __syncthreads(); + + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. 
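+    // Stage 3 below is a shuffle-based tree reduction over warp 0: lane i starts with parent
+    // CV i, and at every step the lanes whose index is a multiple of 2*step fetch the CV held
+    // `step` lanes to the right with __shfl_down_sync and fold it in with blake3_parent_cv.
+    // After ceil(log2(parents_count)) steps lane 0 holds this block's root CV.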
+ + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
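+    // Each thread copies chaining values in a block-stride loop (i = tid, tid + NUM_THREADS, ...),
+    // so consecutive threads touch consecutive 32-byte CVs and the global loads coalesce.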
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr int pad_cv = 0; + + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest 
bottleneck, do not allocate gpu memory here, do it in pytorch. + + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/benchmark/bench.py b/benchmark/bench.py new file mode 100644 index 0000000..307e2fa --- /dev/null +++ b/benchmark/bench.py @@ -0,0 +1,116 @@ +# bench_blake3_sizes.py +import time +import math +import torch +import blake3 +import flashashing as fh + +# ===== 可调参数 ===== +CHECK_ACCURACY = False # 是否校验与 CPU blake3 一致(小规模可开,大规模会慢一点) +USE_PINNED_HOST = True # 是否使用 pinned host memory(建议 True,用于端到端吞吐) +WARMUP_PER_SIZE = 2 # 每个尺寸的预热次数(不计入统计) + +# 每个尺寸的重复次数,尽量让统计时间不至于太短 +def pick_repeats(n_bytes: int) -> int: + if n_bytes <= 1*1024: # 1 KB + return 2000 + if n_bytes <= 16*1024: + return 1000 + if n_bytes <= 64*1024: + return 500 + if n_bytes <= 256*1024: + return 200 + if n_bytes <= 1*1024*1024: + return 100 + 
if n_bytes <= 4*1024*1024: + return 50 + return 20 # 10.3MB + +# ===== 测试尺寸(字节)===== +sizes = [ + ("1KB (1 chunk)", 1 * 1024), + ("8KB", 8 * 1024), + ("16KB", 16 * 1024), + ("64KB", 64 * 1024), + ("256KB", 256 * 1024), + ("1MB", 1 * 1024 * 1024), + ("4MB", 4 * 1024 * 1024), + ("10.3MB", int(11 * 1024 * 1024)), # 以 MiB 解释 10.3MB +] + +device = torch.device("cuda") +stream = torch.cuda.current_stream().cuda_stream + +# 预分配最大的 host/device buffer,子切片复用,保证公平且减少反复分配 +max_n = max(n for _, n in sizes) +cpu = torch.empty(max_n, dtype=torch.uint8, pin_memory=USE_PINNED_HOST) +cpu[:] = ord('A') + +d = torch.empty_like(cpu, device=device) # device 大小与 max_n 一致 +torch.cuda.synchronize() + +# 标题 +print("BLAKE3 GPU benchmark across message sizes") +print(f"Host pinned memory: {USE_PINNED_HOST}") +print("-" * 96) +print("{:<14} {:>10} {:>8} {:>12} {:>12} {:>12} {:>12}".format( + "size", "bytes", "repeat", "kern_ms", "kern_MiB/s", "e2e_ms", "e2e_MiB/s" +)) +print("-" * 96) + +for label, n in sizes: + # 预热 + for _ in range(WARMUP_PER_SIZE): + # 给 device 子切片复制 n 字节 + d[:n].copy_(cpu[:n], non_blocking=True) + torch.cuda.synchronize() + _ = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream) + torch.cuda.synchronize() + + repeat = pick_repeats(n) + total_kernel_ms = 0.0 + total_e2e_sec = 0.0 + last_hex = None + + # CUDA event 用于“纯 kernel”计时 + start_evt = torch.cuda.Event(enable_timing=True) + end_evt = torch.cuda.Event(enable_timing=True) + + for _ in range(repeat): + # -------- End-to-end: H2D + kernel -------- + torch.cuda.synchronize() + t0 = time.perf_counter() + d[:n].copy_(cpu[:n], non_blocking=True) + # kernel-only 计时:events 只包住内核调用 + start_evt.record() + last_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream) + end_evt.record() + torch.cuda.synchronize() # 等 H2D+kernel 都完成 + t1 = time.perf_counter() + + total_e2e_sec += (t1 - t0) + total_kernel_ms += start_evt.elapsed_time(end_evt) + + # 平均时间 + avg_kernel_ms = total_kernel_ms / repeat + avg_e2e_ms = (total_e2e_sec / repeat) * 1e3 + + # 吞吐(MiB/s;1 MiB = 1024^2) + mib = n / (1024 ** 2) if n != 10.3 else 11 / (1024 ** 2) + kernel_throughput = mib / (avg_kernel_ms / 1e3) if avg_kernel_ms > 0 else float('inf') + e2e_throughput = mib / (avg_e2e_ms / 1e3) if avg_e2e_ms > 0 else float('inf') + + print("{:<14} {:>10} {:>8} {:>12.3f} {:>12.2f} {:>12.3f} {:>12.2f}".format( + label, n, repeat, avg_kernel_ms, kernel_throughput, avg_e2e_ms, e2e_throughput + )) + + if CHECK_ACCURACY: + # CPU 校验(只对当前尺寸做一次) + std_hex = blake3.blake3(cpu[:n].numpy()).hexdigest() + assert last_hex == std_hex, f"Mismatch at {label}: GPU {last_hex} vs CPU {std_hex}" + +print("-" * 96) +print("Notes:") +print(" • kernel_ms / kern_MiB/s 只统计 GPU 计算时间(CUDA events),不含 H2D;") +print(" • e2e_ms / e2e_MiB/s 统计 H2D + kernel 的总时间;") +print(" • '10.3MB' 按 10.3 * 1024 * 1024 字节计算。") diff --git a/benchmark/perf.txt b/benchmark/perf.txt new file mode 100644 index 0000000..0058b2e --- /dev/null +++ b/benchmark/perf.txt @@ -0,0 +1,5 @@ +10.7 V100: 54819.25 MiB/s + RTX 4090: 150585.83 MiB/s (False) + +10.9, RTX4090: 67184.92 MiB/s +10.9 A800: 20803.11 MiB/s \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py new file mode 100644 index 0000000..d634d21 --- /dev/null +++ b/benchmark/test_gpu.py @@ -0,0 +1,69 @@ +import torch +import flashashing as fh +import hashlib +import time +import blake3 + +check_accuracy = False +check_perf = True + +GiB = 1024*1024*1024 # bytes -> 1 GiB + +cpu = torch.empty(GiB * 1, dtype=torch.uint8) +cpu[:] = ord('A') + +# 一次性 H2D(可重用) +d 
= torch.empty_like(cpu, device='cuda') +d.copy_(cpu, non_blocking=True) +torch.cuda.synchronize() + +stream = torch.cuda.current_stream().cuda_stream + +# std_hex = blake3.blake3(data).hexdigest() + +if check_accuracy: + # 2) GPU 版本 + cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + torch.cuda.synchronize() + print("GPU BLAKE3 Result: ", cv_hex) + + # 1) CPU 版本 + std_hex = blake3.blake3(cpu.numpy()).hexdigest() + print("std BLAKE3 Expected: ", std_hex) + + # std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() + # print("std BLAKE3 1KB: ", std_hex_1KB) + + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" + print("GPU BLAKE3 result matches CPU result!") + +if check_perf: + # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 + for _ in range(2): + # d.copy_(cpu, non_blocking=True) + torch.cuda.synchronize() + output = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + print("warmup CV (hex) =", output) + torch.cuda.synchronize() + + # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) + repeat = 10 # 1GiB × 5 已经很重,按机器调整 + cv_hex = None + total_time = 1e-8 + for _ in range(repeat): + d.copy_(cpu, non_blocking=True) + torch.cuda.synchronize() + + t0 = time.perf_counter() + cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + t1 = time.perf_counter() + + elapsed = t1 - t0 + total_time += elapsed + torch.cuda.synchronize() + + print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {total_time:.3f} seconds") + print(f"Throughput: {repeat * d.numel() / total_time / (1024**2):.2f} MiB/s") + print("root CV (hex) =", cv_hex) + +# print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 2da163f..43290c4 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -76,6 +76,115 @@ static std::string blake3_hash_naive(py::object obj) { } } + +struct PyBytesView { + const uint8_t* ptr = nullptr; + size_t len = 0; + std::string storage; + py::object keep_alive; +}; + +static PyBytesView get_bytes_view(py::object obj) { + PyBytesView v; + + // bytes -> 复制到 storage + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // str -> 按 UTF-8 编码复制;如果你想按“原始字节”处理,建议在 Python 侧先 .encode() + if (py::isinstance(obj)) { + v.storage = obj.cast(); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // bytearray -> 正确姿势:转成 py::bytes 再拿 std::string(不要用迭代器) + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // 任意 buffer(memoryview / numpy 等) + if (PyObject_CheckBuffer(obj.ptr())) { + py::buffer buf = py::reinterpret_borrow(obj); + py::buffer_info info = buf.request(/*writable=*/false); + + // 一维 C 连续:零拷贝,注意保活 + if (info.ndim == 1 && + info.strides.size() == 1 && + info.strides[0] == static_cast(info.itemsize)) { + v.ptr = reinterpret_cast(info.ptr); + v.len = static_cast(info.size) * static_cast(info.itemsize); + v.keep_alive = obj; // 保活底层内存 + return v; + } + + // 非连续:用 tobytes() 拷贝成线性字节 + py::object tobytes = obj.attr("tobytes")(); + v.storage = static_cast(py::bytes(tobytes)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + throw std::invalid_argument( + "blake3_gpu_root_* expects bytes/str/bytearray/memoryview/numpy buffer."); +} + +static std::string cv_words_to_bytes_le(const std::array& cv) { + std::string out; + 
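+    // Serialize the eight 32-bit chaining-value words into 32 bytes, little-endian word by word.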
out.resize(32); + uint8_t* p = reinterpret_cast(&out[0]); + for (int i = 0; i < 8; ++i) { + uint32_t w = cv[i]; + p[4*i + 0] = static_cast( w & 0xFF); + p[4*i + 1] = static_cast((w >> 8) & 0xFF); + p[4*i + 2] = static_cast((w >> 16) & 0xFF); + p[4*i + 3] = static_cast((w >> 24) & 0xFF); + } + return out; +} + +struct GilRelease { + py::gil_scoped_release rel; +}; + +// static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { +// auto v = get_bytes_view(obj); +// std::array root{}; +// { +// GilRelease _g; +// blake3_block_reduce_sm70(v.ptr, static_cast(v.len), &root, /*stream=*/0); +// } +// std::string b = cv_words_to_bytes_le(root); +// return py::bytes(b); +// } + +static std::string blake3_gpu_root_hex(uint64_t device_ptr, + uint64_t nbytes, + uint64_t stream_int = 0) { + auto d_data = reinterpret_cast(device_ptr); + auto stream = reinterpret_cast(stream_int); + cudaSetDevice(0); + cudaFree(0); // attach + + std::array root{}; + { + GilRelease _g; + blake3_block_reduce_sm70_sm80(d_data, nbytes, &root, stream); + } + std::string b = cv_words_to_bytes_le(root); + return bytes_to_hex(reinterpret_cast(b.data()), b.size()); +} + + PYBIND11_MODULE(flashashing, m) { m.doc() = "SHA-256 and Blake3 bindings (pybind11)"; @@ -84,9 +193,23 @@ PYBIND11_MODULE(flashashing, m) { "Compute SHA-256 of a str/bytes and return hex string."); m.def("hash_simd", &hash_hex_simd4, py::arg("data"), - "SIMD enhanced sha256."), + "SIMD enhanced sha256."); m.def("blake3_hash_naive", &blake3_hash_naive, py::arg("data"), "Compute BLAKE3 hash (single-threaded)."); + +// m.def("blake3_gpu_sm70", +// &blake3_gpu_root_cv_bytes, +// py::arg("data"), +// R"pbdoc( +// Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. +// NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. +// )pbdoc"); + m.def("blake3_gpu_sm70_sm80_hex", + &blake3_gpu_root_hex, + py::arg("d_data"), py::arg("nbytes"), py::arg("stream")=0, + R"pbdoc( +Return the hex string of the *root chaining value* (CV) computed on GPU. 
+)pbdoc"); } \ No newline at end of file diff --git a/csrc/blake3.h b/csrc/blake3.h index 55aaa43..27509c9 100644 --- a/csrc/blake3.h +++ b/csrc/blake3.h @@ -2,6 +2,9 @@ #include #include +#include +#include +#include namespace flashashing { @@ -23,4 +26,11 @@ class Blake3 { std::string bytes_to_hex(const uint8_t *data, size_t len); -} // namespace flashashing \ No newline at end of file +} // namespace flashashing + + +// ============== GPU implementations ================ +void blake3_block_reduce_sm70_sm80(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0); \ No newline at end of file diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu new file mode 100644 index 0000000..b835c15 --- /dev/null +++ b/csrc/blake3_sm70_sm80.cu @@ -0,0 +1,850 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#include +#endif + +#include +#include +#include + +#include "utils.cuh" + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + + + +using namespace cute; + +using vec_t = cute::uint128_t; // one time loading 16 B + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + // SM80 branch + using Atom = cute::Copy_Atom, vec_t>; +#else + // Volta/Turing(SM70/75)general copy + using Atom = cute::Copy_Atom, vec_t>; +#endif + + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template 32 KB + const int PAD_CHUNK=16, + const int PAD_CV=0> // pad shared memory +__global__ void blake3_block_reduce_kernel(const uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[CHUNKS_PER_BLOCK / 2][8 + PAD_CV]; // avoid bank conflict + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4, 16B + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
+ __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK]; // [64][256+PAD] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + // use CuTe for loading + + auto make_smem_tensor = [&](int chunk_local /* The chunk id on shared memory */) { + // smem index:chunk_smem[chunk_local][0] + // length: (WORDS_PER_CHUNK + PAD_CHUNK) + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + vec_t* s_vec = reinterpret_cast(s_u32); // each element is 16B + return make_tensor(make_smem_ptr(s_vec), + make_shape(Int{}), // 64 x 4 = 256, one row of shared memory + make_stride(Int<1>{})); // continuous + }; + + // shared memory 256 items = 64 x vec_t(4) + // thread layout: 32 threads + // thread 0: 0, 32 + // thread 1: 1, 33 + // thread 32: 31,63 + // so we load data in coalsced mode + // using Atom = cute::Copy_Atom, vec_t>; + + // thread layout. (lane,0)->idx=2*lane,(lane,1)->idx=2*lane+1 + auto thr_layout = make_layout( + make_shape(Int<2>{}), + make_stride(Int<1>{}) + ); + + auto tile_layout = make_layout( + make_shape(Int{}, Int<2>{}), // (32,2) + make_stride(Int<1>{}, Int{}) // idx = lane + k2*32 + ); + + // build the real copy + TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout); + + // this sentence will load + // auto thr_copy = local_partition(tcopy, lane_id); + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + auto smem_vec1d = make_smem_tensor(chunk_local); // 64 x uint128_t + + // only read from global, when it's valid + // or, we fill it with 0 + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + const vec_t* g_vec_base = reinterpret_cast(g_u32); // 16 B each time + + // the src is 64 x 4 = 256, so is dest + auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base), + make_shape(Int{}), // 64 x uint128_t + make_stride(Int<1>{})); + bool g_valid = (chunk_local < valid_chunks); + + auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout); + auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout); + + // now we will load this + auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id); + auto tCs = local_partition(smem_vec2d, tile_layout, lane_id); + + // launch the copy inst. 
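+    // On SM80 the Copy_Atom lowers to cp.async, so the fence / wait<0> pair below is what makes
+    // the asynchronous gmem -> smem copies visible before they are consumed; on SM70/75 the
+    // UniversalCopy fallback is an ordinary synchronous load/store and the guarded fence
+    // compiles away.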
+ if (g_valid) { + // gmem load → smem store: 16B x 2 + copy(tcopy, tCg, tCs); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + cp_async_fence(); + cp_async_wait<0>(); +#endif + } else { + for (int i = 0; i < size(tCs); ++i) { + *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); + } + } + __syncwarp(); // inner-warp sync + } + + __syncthreads(); // sync all warps + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("Stage 1 finish processing\n"); + } +#endif + + // ============== STAGE 2: Compress Leaf to `CHUNKS_PER_BLOCK` chain value ============== + const int pass0_valid = min(CHUNKS_PER_BLOCK / 2, valid_chunks); // pass0 cover [0, CHUNKS_PER_BLOCK / 2] chunks + const int pass1_valid = max(0, valid_chunks - (CHUNKS_PER_BLOCK / 2)); // pass1 cover [CHUNKS_PER_BLOCK / 2, CHUNKS_PER_BLOCK] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; // 8 or 16 + const int parents1 = (pass1_valid + 1) >> 1; // 8 or 16 + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + // 64 chunks - 16 warps, 1 warp -> 4 chunks -> 2pass + // 32 chunks - 8 warps, 1 warp -> 4 chunks -> 2pass + auto do_big_pass = [&](int base /*0 or chunks/2 */, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { +// printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); +// } +// #endif + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + + const unsigned full = __activemask(); + const unsigned submask = (sub==0 ? 0x0000FFFFu : 0xFFFF0000u) & full; + const unsigned mask16 = (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool valid_sub = (sub==0 ? (left_rel < pass_valid) : (right_rel < pass_valid)); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = valid_sub ? &chunk_smem[chunk_local][0] : &chunk_smem[0][0]; + uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr; // FIXME: here has some problem!!! 
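+        // On the FIXME above: when valid_sub is false this call still receives out == nullptr;
+        // if the leaf compressor's writer lane stores the final CV unconditionally it will write
+        // through a null pointer. Either predicate the call on valid_sub or point `out` at a
+        // scratch row instead of nullptr.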
+ +#if defined(DBG_KERNEL) && DBG_KERNEL + if (!valid_sub) { + printf("tile %d warp %d not valid in base %d, pass_valid %d\n", blockIdx.x, threadIdx.x/32, base, pass_valid); + } +#endif + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + out, + submask); + + __syncwarp(submask); // make sure two warps written into `tmp_cv` + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (blockIdx.x == 0 && threadIdx.x == 0) printf("The simd16-lane res: \n"); +// print_cv(out); +// #endif + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + + if (has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + + if (has_right) { + const uint32_t* rcv = &tmp_cv[right][0]; + + // sub==0 half warp participate + const unsigned sub0_mask = 0x0000FFFFu & full; + if ((lane_id >> 4) == 0) { + blake3_parent_cv_simd16(lcv, rcv, &cv_smem[pair_idx][0], sub0_mask); + } + // sub==1 do not involve + } else { + // odd: left -> pair_idx + if ((lane_id >> 4) == 0) { + int li = lane_id & 15; + if (li < 8) cv_smem[pair_idx][li] = lcv[li]; + } + } + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) printf("The 1st chunk merged res: \n"); + print_cv(cv_smem[pair_idx]); +#endif + + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass + + // big-pass 1: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(0, pass0_valid); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass0_valid != 0) { + printf("Stage 2 - pass 1 finish processing\n"); + } +#endif + + // big-pass 2: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(CHUNKS_PER_BLOCK / 2, pass1_valid); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass1_valid != 0) { + printf("Stage 2 - pass 2 finish processing, pass 1: %d\n", pass1_valid); + } +#endif + + __syncthreads(); + + // right now, we have got CHUNKS_PER_BLOCK / 2 chain values + // a warp-reduce to merge. + + // ============== STAGE 3: Block-Reduce ============== + // [32] - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + // the `parent_count` is the indicator + // for 256-8warps-32chunks, the parent count is 16 + if (parents_count > 0) { + // half-warp info + const int sub = lane_id >> 4; // 0/1 -> which sub warp it belong to + const int li = lane_id & 15; // 0..15 + const unsigned full = __activemask(); + const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + + int cur_n = parents_count; + + while (cur_n > 1) { + int pairs = cur_n >> 1; // the merge num this layer will do, for 16 parents, this start at 8 + // warp 0, 0-15: 0 + // warp 0, 16-31: 1 + // warp 1, 0-15: 2 + // warp 1, 16-31: 3 + // .. + // warp 7, 0-15: 14 + // warp 7, 16-31: 15 + int half_id = warp_id * 2 + sub; // half-warp id + + // each half-warp handle one pair: (left=2*half_id, right=left+1) + // only half warps will participate in + if (half_id < pairs) { + int left_idx = (half_id << 1); + int right_idx = left_idx + 1; + const uint32_t* L = &cv_smem[left_idx][0]; + const uint32_t* R = &cv_smem[right_idx][0]; + + // 半 warp SIMD16 合并:结果就地写回 left_idx + blake3_parent_cv_simd16(L, R, &cv_smem[left_idx][0], submask); + } + + __syncthreads(); // make sure all warps could see. 
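+            // Example trace of this reduction loop (comment only): for parents_count = 16
+            // the per-iteration values of cur_n are 16 -> 8 -> 4 -> 2 -> 1.  With an odd
+            // count such as cur_n = 5, pairs = 2 and the leftover cv_smem[4] is copied down
+            // to cv_smem[2] just below, so the next iteration continues with cur_n = 3.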
+ + // odd: up-flow + if (cur_n & 1) { + // 让 warp0 的 sub0 来搬运(li < 8) + if (warp_id == 0 && sub == 0 && li < 8) { + cv_smem[pairs][li] = cv_smem[cur_n - 1][li]; + } + } + + __syncthreads(); // next - level + + cur_n = pairs + (cur_n & 1); + } + + // write out + if (warp_id == 0 && lane_id == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +// #if defined(DBG_KERNEL) && DBG_KERNEL +// print_cv(cv_smem[0]); +// #endif + reinterpret_cast(out)[0] = make_uint4(cv_smem[0][0], cv_smem[0][1], cv_smem[0][2], cv_smem[0][3]); + reinterpret_cast(out)[1] = make_uint4(cv_smem[0][4], cv_smem[0][5], cv_smem[0][6], cv_smem[0][7]); + } + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) + printf("================================ Finishing all in big kernel! ================================\n"); +#endif + +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +__device__ __forceinline__ +void load_cv_s2r_vec(const uint32_t* __restrict__ s, uint32_t dst[8]) { + // 确保 16B 对齐:cv_tile 要 __align__(16),stride=(8+PAD)*4 也要是16的倍数 + const uint4* p = reinterpret_cast(s); + uint4 v0 = p[0]; + uint4 v1 = p[1]; + dst[0]=v0.x; dst[1]=v0.y; dst[2]=v0.z; dst[3]=v0.w; + dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w; +} + +__device__ __forceinline__ +void load_cv_s2r_vec_shared(const uint32_t* __restrict__ s, uint32_t dst[8]) { +#if __CUDA_ARCH__ >= 700 + // 把泛型指针显式转成 shared 地址,然后 ld.shared.v4.u32 两次 + unsigned smem_addr = static_cast(__cvta_generic_to_shared(s)); + uint32_t x0,x1,x2,x3,x4,x5,x6,x7; + asm volatile( + "ld.shared.v4.u32 {%0,%1,%2,%3}, [%8];\n\t" + "ld.shared.v4.u32 {%4,%5,%6,%7}, [%8+16];\n\t" + : "=r"(x0),"=r"(x1),"=r"(x2),"=r"(x3), + "=r"(x4),"=r"(x5),"=r"(x6),"=r"(x7) + : "r"(smem_addr)); + dst[0]=x0; dst[1]=x1; dst[2]=x2; dst[3]=x3; + dst[4]=x4; dst[5]=x5; dst[6]=x6; dst[7]=x7; +#else + // 旧架构退化到标量 + #pragma unroll + for (int j=0;j<8;++j) dst[j] = s[j]; +#endif +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 or 32 KiB each block [CHUNKS_PER_BLOCK] +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / CHUNKS_PER_BLOCK root = 16384 or 32768 roots +// And this tiny kernel is designed to process these 16384 or 32768 root +// For one chain value, takes u32 x 8 = 32B, +// 32KB shared memory could contain 1K chain value +// 16KB shmem could contain 512 chain value +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // dyn SMEM >= TILE_CVS*32B + // regarad as 2D dyn shmem: [TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warp_id = tid / WARP_SIZE; // 0..NUM_WARPS (8 or 16) + const int lane_id = tid % WARP_SIZE; // 0..31 + + // the start of this block. + // each block will process TILE_CVS roots. 
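+    // Worked sizing example (comment only, using the launch parameters chosen in the
+    // host-side wrapper later in this file): a 1 GiB input has 2^20 chunks of 1 KiB;
+    // with CHUNKS_PER_BLOCK = 32 the big kernel emits 32768 block CVs, the first pass
+    // of this kernel (TILE_CVS = 1024) reduces them to 32 CVs, and a second single-block
+    // pass reduces those 32 CVs to the final root CV.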
+ const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + const int tile_n = min(TILE_CVS, N - tile_start); // actual cv count of this block + + // ---------------- Stage 1: coalsced loading to SMEM ---------------- + // each time load 8 KB, [tile_n/NUM_THREADS] times in total. + for (int i = tid; i < tile_n; i += NUM_THREADS) { // notice: i = tid, not start from 0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 16B x 2 + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; // each thread load 16B, 256 x 16=4KB + uint4 v1 = g4[1]; // each thread load 16B, 256 x 16=4KB + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + __syncthreads(); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (tid == 0 && blockIdx.x == 0) { + printf("Block %d root CV for tiny kernel entry:", blockIdx.x); + print_cv(smem); + } +#endif + + // ---------------- Stage 2: each lane merge 4 → 1 (keep the neighbor order) ---------------- + // reduced_n0 = ceil(tile_n / 4) lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; + uint32_t lane_cv[8]; // output of this lane 4-1 root + bool lane_valid = false; + + // start index + int base4 = tid << 2; // tid*4, 0..8 + if (base4 < tile_n) { + // 4-neighbor CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_s2r_vec_shared(s0, a); + + int remain = tile_n - base4; + + // Yazhu: branch-prediction extra-overhead + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_s2r_vec_shared(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_s2r_vec_shared(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_s2r_vec_shared(s3, d); + } + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// // 石锤了:load进来的东西压根不一样 +// if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) { +// for (int i = 0 ; i < 32; i++) { +// if (i == lane_id) { +// printf("The lane-merge input from lane: %d:\n", lane_id); +// print_cv(a, lane_id); +// print_cv(b, lane_id); +// print_cv(c, lane_id); +// print_cv(d, lane_id); +// } +// } +// } +// #endif + + // merge the neighbor + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); // write to lane_cv directly + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); // one buffer + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 4 && lane_valid) { + printf("The lane-reduce-4 output:"); + print_cv(lane_cv, 4); + } +#endif + + // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ---------------- + const int warp_base = warp_id * WARP_SIZE; + const int n = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number + + // this will introduce extra branch-prediction overhead + const unsigned active_mask = __ballot_sync(0xFFFFFFFFu, lane_id < n); + +// #if 
defined(DBG_KERNEL) && DBG_KERNEL +// if (gridDim.x == 1 && lane_valid) { +// printf("The tid: %d\n", tid); +// } +// #endif + + if (lane_id >= n) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + __syncwarp(active_mask); + + // step = 1,2,4,8,16 - warp-reduce + for (int step = 1; step < n; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(active_mask, lane_cv[j], step); + + __syncwarp(active_mask); // 所有 lane 都抓完再继续 + + // only the left-half lane will participant + // 0,2,4,6,8,... step = 1 + // 0,4,8,... step = 2 + // 0,8,.. step = 4 + // 0,16 step = 8 + // 0 step = 16 + const bool do_pair = + ((lane_id & ((step << 1) - 1)) == 0) && // 只允许每 2*step 组的第一个 lane 合并 + ((lane_id ^ step) < n); // 右邻确实在活跃集合内 + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x < 32) printf("lane %d do pair? %d\n", lane_id, do_pair); +// #endif + +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) { +// printf("left on step %d:", step); +// print_cv(lane_cv, 16); +// printf("right on step %d:", step); +// print_cv(nbr, 16); +// } +// #endif + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); // does not require shared memory + } + __syncwarp(active_mask); + } + + // lane0;NUM_WARPS warp-root write to SMEM + // e.g. if this block has 8 warp, there will be 8 warp-root + __shared__ uint32_t warp_roots[NUM_WARPS][8]; // NUM_WARPS × 8 + if (lane_id == 0 && n > 0) { + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("The warp-reduce output:"); + print_cv(warp_roots[0]); + } +#endif + + // ---------------- Stage 4: CTA's NUM_WARPS → 1 block reduce ---------------- + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..NUM_WARPS + if (valid_warps == 0) return; + + + // 16 lane compute the merge together + const int sub = lane_id >> 4; // 0/1 + const int li = lane_id & 15; // 0..15 + const unsigned full = __activemask(); + const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + const int half_id = warp_id * 2 + sub; // half-warp index + int cur_n = valid_warps; + + while (cur_n > 1) { + const int pairs = cur_n >> 1; // the pair count. + + if (half_id < pairs) { + const int left_idx = (half_id << 1); + const int right_idx = left_idx + 1; + + const uint32_t* L = &warp_roots[left_idx][0]; + const uint32_t* R = &warp_roots[right_idx][0]; + + blake3_parent_cv_simd16(L, R, &warp_roots[left_idx][0], submask); + } + + __syncthreads(); + + // if odd + if (cur_n & 1) { + if (warp_id == 0 && sub == 0 && li < 8) { + warp_roots[pairs][li] = warp_roots[cur_n - 1][li]; + } + } + + __syncthreads(); // next-level + cur_n = pairs + (cur_n & 1); // + } +#if defined(DBG_KERNEL) && DBG_KERNEL + if (tid == 0 && blockIdx.x == 0) { + printf("Block %d root CV for tiny kernel:", blockIdx.x); + print_cv(warp_roots[0]); + } +#endif + + // ---------------- Stage 5: write to output ---------------- + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("================================ Finishing all in tiny kernel! 
================================\n");
+    }
+#endif
+
+}
+
+inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
+    const uint32_t zero_block[16] = {0};
+    uint32_t st[16];
+    blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st);
+    for (int i = 0; i < 8; ++i) {
+        uint32_t w = st[i];
+        out32[4*i+0] = (uint8_t)( w        & 0xFF);
+        out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF);
+        out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF);
+        out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF);
+    }
+}
+
+// host-side wrapper: launches the big kernel, then reduces the per-block CVs to the root CV
+void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
+                                   uint64_t bytes_len,
+                                   std::array<uint32_t, 8>* root_out = nullptr,
+                                   cudaStream_t stream = 0) {
+    if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) {
+        fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n",
+                (unsigned long long)bytes_len);
+        std::abort();
+    }
+
+    int optin = 0, deflt = 0;
+    cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
+    cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
+
+    constexpr int CHUNKS_PER_BLOCK = 32;                        // chunks hashed per thread block (64 also supported)
+    constexpr int CHUNK_SIZE = 1024;                            // 1 KiB
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;             // 256
+    constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64;    // big-kernel block size: 256 threads for 32 chunks (512 for 64)
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 8 warps here (16 when CHUNKS_PER_BLOCK = 64)
+    const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk
+    const uint64_t total_chunks = bytes_len / CHUNK_SIZE;
+    const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // e.g. 32768 blocks for 1 GiB at 32 chunks/block (16384 at 64)
+
+    constexpr int pad_chunk = 16;
+    constexpr int pad_cv = 0;
+
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_block_reduce_kernel,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 100));
+
+    const auto* d_words = reinterpret_cast<const uint32_t*>(d_data); // alias
+    uint32_t* d_blockCV = nullptr;                                   // num_blocks × 8 u32
+
+    // largest remaining bottleneck: the scratch allocation below; ideally the caller
+    // (e.g. the PyTorch side) would own this buffer so the per-call cudaMalloc/cudaFree goes away.
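+    // One possible way to do that (comment-only sketch; it assumes this file is built as a
+    // torch extension and the tensor name is illustrative): let the caching allocator own the
+    // scratch buffer instead of calling cudaMalloc here, e.g.
+    //   auto block_cvs = torch::empty({num_blocks, 8},
+    //       torch::dtype(torch::kInt32).device(torch::kCUDA));
+    //   d_blockCV = reinterpret_cast<uint32_t*>(block_cvs.data_ptr<int32_t>());
+    // which removes the per-call cudaMalloc/cudaFree from the timed path.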
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB, 1M for 32-size + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 or 32768 output reduce -> 8 + + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int dyn_smem = 32 * 1024; // 32KiB + const int N = num_blocks; // total root-cv number + constexpr int TILE = 1024; + const int grid = (N + TILE - 1) / TILE; // 32 KB / 1024 = 32 + constexpr int NUM_THREADS = 256; + const size_t smem_bytes = (size_t)(TILE) * (8u + pad_cv) * sizeof(uint32_t); // 1024 x 32 = 32 KiB + + // decide upon compiling time + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + cudaMalloc(&d_mid_out, (size_t)grid * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int dyn_smem = 1024; // 1 KiB will enough + constexpr int TILE = 1024; // any >= N + const int N = (num_blocks + TILE - 1) / TILE; // 32 + constexpr int grid = 1; + constexpr int NUM_THREADS = 32; // 32 threads will be enough + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + // decide upon compiling time + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); // this memory is managed by torch, we do not free it. 
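+    // Illustrative host-side usage of this wrapper (comment only; the buffer and size
+    // names below are hypothetical):
+    //   uint8_t* d_buf = nullptr;
+    //   CUDA_CHECK(cudaMalloc(&d_buf, bytes));                    // bytes % 1024 == 0
+    //   CUDA_CHECK(cudaMemcpy(d_buf, h_data, bytes, cudaMemcpyHostToDevice));
+    //   std::array<uint32_t, 8> root{};
+    //   blake3_block_reduce_sm70_sm80(d_buf, bytes, &root);
+    //   CUDA_CHECK(cudaFree(d_buf));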
+} \ No newline at end of file diff --git a/csrc/utils.cuh b/csrc/utils.cuh new file mode 100644 index 0000000..c0311b8 --- /dev/null +++ b/csrc/utils.cuh @@ -0,0 +1,455 @@ +#include +#pragma once + +#define WARP_SIZE 32 +#define G(a,b,c,d, x, y) \ + do { \ + (a) = (a) + (b) + (x); \ + (d) = rotr32((d) ^ (a),16); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c),12); \ + (a) = (a) + (b) + (y); \ + (d) = rotr32((d) ^ (a), 8); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c), 7); \ + } while (0) + +// host - side definition +inline constexpr uint32_t BLAKE3_IV_HOST[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +__device__ __constant__ uint32_t BLAKE3_IV_DEV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +inline constexpr int B3_PERMUTE_HOST[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8 +}; + +__device__ __constant__ int B3_PERMUTE_DEV[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8 +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__device__ void print_cv(uint32_t cv[8], int tgt_tid = 0) { + if (blockIdx.x == 0 && threadIdx.x == tgt_tid) { + auto get_byte = [&](int i) { + int w = i >> 2; // 第 i 个字节来自第 w 个 u32 + int off = (i & 3) * 8; // 在该 u32 中的偏移 + return (unsigned)((cv[w] >> off) & 0xFFu); + }; + + printf("block %d root CV (u32, little-endian words):", blockIdx.x); + for (int i = 0; i < 32; ++i) { + printf("%02x", get_byte(i)); + if ((i & 3) == 3) printf(" "); // 每 4 字节空格 + } + printf("\n"); + } +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. 
(see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + + +// =============== Leaf 16-lane compressing =============== +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV_DEV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + uint32_t v = + (li < 8) ? cv_word : + (li < 12) ? BLAKE3_IV_DEV[(li - 8) & 3] : + (li == 12) ? t0 : + (li == 13) ? 
t1 :
+            (li == 14) ? block_len : flags;
+
+        // Write the index as li = q + 4*rq; the diagonal step first applies the
+        // permutation li' = (rq<<2) | ((q+rq)&3)
+        int q  = (li & 3);
+        int rq = (li >> 2);
+        int li_diag = (rq << 2) | ((q + rq) & 3);
+        int li_undo = (rq << 2) | ((q - rq) & 3);
+        int gi_col  = q;              // 0..3
+        int gi_diag = (li_diag & 3);  // 0..3
+
+        // ===== 7 rounds =====
+        #pragma unroll 4
+        for (int r = 0; r < 7; ++r) {
+            // inside this loop each lane does the work of one a/b/c/d role;
+            // the 16 lanes together evaluate the 4 column G's and 4 diagonal G's
+            // that a sequential implementation would run as 8 G calls per round
+
+            // ---- column step (quartets: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}) ----
+            {
+                // fetch b/c/d of the same quartet (based on the current v)
+                uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16);
+
+                // this quartet's index is in {0,1,2,3}; the column step consumes
+                // message indices 0..7, two per quartet
+
+                uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16);
+
+                v = G_update_role(v, vb, vc, vd, mx, my, role);
+            }
+
+            // ---- diagonal step ----
+            {
+                // read the current v value in the diagonally permuted domain
+                uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16);
+
+                // within that domain, fetch the same quartet neighbours as in the column step
+                uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16);
+
+                // the 4 diagonal G's use the second half of this round's message pairs (indices 8..15)
+
+                uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16);
+
+                uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role);
+
+                // undo the permutation back to the original position: li_undo = (rq<<2) | ((q - rq) & 3)
+
+                // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16);
+                v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16);
+            }
+        } // 7 rounds end
+
+        // derive the new CV: cv[i] = v[i] ^ v[i+8] (only lanes li = 0..7 keep the result)
+        uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16);
+        if (li < 8) {
+            cv_word = v ^ vip8_all;
+        }
+
+        // continue with the next block (the 16 blocks of a chunk are processed serially here)
+    }
+
+    // lane 0 / lane 16 collect the 8 x u32 output
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // all 16 lanes must take part in the shuffle
+        if (li == 0 && out_cv) out_cv[j] = wj;             // only lane 0 of the half-warp writes out
+    }
+}
+
+// =============== Parent 16-lane compressing ===============
+__device__ __forceinline__
+void blake3_parent_cv_simd16(const uint32_t* __restrict__ L,  // 8 x u32
+                             const uint32_t* __restrict__ R,  // 8 x u32
+                             uint32_t* __restrict__ out_cv,   // 8 x u32
+                             unsigned mask16)                 // half-warp mask for the 16 lanes
+{
+    const int lane = threadIdx.x & 31;
+    const int li   = lane & 15;   // 0..15, position within the half-warp
+    const int role = li & 3;
+
+    // message: the first 8 words come from L, the last 8 from R
+    const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8];
+
+    // initialize v
+
+    const uint32_t t0 = 0u;
+    const uint32_t t1 = 0u;
+    const uint32_t block_len = 64u;
+    const uint32_t flags = FLAG_PARENT;
+
+    uint32_t iv_val = BLAKE3_IV_DEV[li & 7];
+
+    uint32_t v =
+        (li < 12) ? iv_val :
+        (li == 12) ? t0 :
+        (li == 13) ? t1 :
+        (li == 14) ?
block_len : flags; + + // 与 leaf 相同的“列/对角”两步、共 7 轮 + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; + int gi_diag = (li_diag & 3); + + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // 列步 + { + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + // 对角步 + { + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + v = __shfl_sync(mask16, v_diag_new, li_undo, 16); + } + } + + // state -> CV:cv[i] = v[i] ^ v[i+8] + uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); + uint32_t cv_word = (li < 8) ? (v ^ vip8) : 0; + + // 半 warp 汇聚到 out_cv[0..7](仅 li==0 的 4×收集也可以) + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); + if (li == 0) out_cv[j] = wj; + } +} + +// =============== Parent 1-lane compressing =============== +// the actually right compress 7r in a single lane function +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // output +{ + // 1) initialize v + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; + +#if defined(__CUDA_ARCH__) + // device-side call + const uint32_t* IV = BLAKE3_IV_DEV; +#else + // host-side call + const uint32_t* IV = BLAKE3_IV_HOST; +#endif + + uint32_t v8 =IV[0], v9 =IV[1], v10=IV[2], v11=IV[3]; + // injection + uint32_t v12=(uint32_t)chunk_counter, v13=(uint32_t)(chunk_counter >> 32), v14=block_len, v15=flags; + + // 2) 7 轮 + int perm[16]; // 每轮的消息索引 + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = i; + + #pragma unroll + for (int r=0; r < 7; ++r) { + // col-step + G(v0, v4, v8, v12, block_words[perm[0]], block_words[perm[1]]); + G(v1, v5, v9, v13, block_words[perm[2]], block_words[perm[3]]); + G(v2, v6, v10,v14, block_words[perm[4]], block_words[perm[5]]); + G(v3, v7, v11,v15, block_words[perm[6]], block_words[perm[7]]); + + // diag-step + G(v0, v5, v10,v15, block_words[perm[8]], block_words[perm[9]]); + G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]); + G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]); + G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]); + + // perm = perm ∘ PERMUTE + int np[16]; +#if defined (__CUDA_ARCH__) + const int* PERM_T = B3_PERMUTE_DEV; +#else + const int* PERM_T = B3_PERMUTE_HOST; +#endif + #pragma unroll + for (int i = 0; i < 16; ++i) + np[i] = perm[PERM_T[i]]; + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = np[i]; + } + + // 3) write to out state + out_state[ 0]=v0; out_state[ 1]=v1; out_state[ 2]=v2; out_state[ 3]=v3; + out_state[ 4]=v4; out_state[ 5]=v5; out_state[ 6]=v6; 
out_state[ 7]=v7; + out_state[ 8]=v8; out_state[ 9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15; +} + +// from out_state yields CV +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV_DEV, 0ull, 64u, FLAG_PARENT, st); + blake3_state_to_cv(st, out_cv); +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b62068c..68e8d6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ pandas==2.3.3 pip-chill==1.0.3 pybind11==3.0.1 tomli==2.0.1 +torch +blake3 \ No newline at end of file diff --git a/setup.py b/setup.py index eb3a2e4..a45ea14 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,158 @@ +# setup.py from setuptools import setup -from pybind11.setup_helpers import Pybind11Extension, build_ext -import sys -import sysconfig - -import numpy as np - -cxx_std = 17 -extra_compile_args = [] -extra_link_args = [] - -# linux environment, default -extra_compile_args += ["-O2", "-ffast-math", "-march=native", "-fopenmp", "-Wall", "-Wextra", "-Wpedantic", "-mavx2", "-mfma"] -extra_link_args += ["-fopenmp"] - -ext_modules = [ - Pybind11Extension( - "flashashing", - sources=[ - "csrc/sha256_base.cpp", - "csrc/sha256_simd.cpp", - "csrc/blake3_base.cpp", - "csrc/binding.cpp" - ], - include_dirs=[np.get_include()], - cxx_std=cxx_std, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, +from setuptools.command.build_ext import build_ext +import pybind11, numpy as np +import sys, os, shutil +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +def find_in_path(name, path): + for d in path.split(os.pathsep): + p = os.path.join(d, name) + if os.path.exists(p): + return os.path.abspath(p) + return None + +def locate_cuda(): + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + nvcc = None + if cuda_home: + nvcc = os.path.join(cuda_home, "bin", "nvcc") + else: + nvcc = find_in_path("nvcc", os.environ.get("PATH", "")) + if nvcc: + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + if not nvcc or not os.path.exists(nvcc): + raise RuntimeError("nvcc not found. 
Set CUDA_HOME/CUDA_PATH or add nvcc to PATH.") + cuda_include = os.path.join(cuda_home, "include") + if sys.platform.startswith("win"): + cuda_libdir = os.path.join(cuda_home, "lib", "x64") + else: + cuda_libdir = os.path.join(cuda_home, "lib64") + return {"home": cuda_home, "nvcc": nvcc, "include": cuda_include, "libdir": cuda_libdir} + +def locate_cutlass_cute(): + """ + 返回 CUTLASS/CuTe 的 include 根目录(要求该目录下有 cutlass/ 与 cute/)。 + 优先使用环境变量 CUTLASS_HOME 或 CUTE_HOME。 + """ + cand = [] + if "CUTE_HOME" in os.environ: + cand.append(os.path.join(os.environ["CUTE_HOME"], "include")) + cand.append(os.environ["CUTE_HOME"]) + if "CUTLASS_HOME" in os.environ: + cand.append(os.path.join(os.environ["CUTLASS_HOME"], "include")) + cand.append(os.environ["CUTLASS_HOME"]) + + cand += [ + "/usr/local/include", # linux 常见 + os.path.expanduser("~/cutlass/include"), # clone 到家目录 + os.path.expanduser("~/CUTLASS/include"), + os.path.expanduser("~/third_party/cutlass/include"), + os.path.abspath("third_party/cutlass/include"), + ] + + def ok(p): + return p and os.path.isdir(p) and \ + os.path.isdir(os.path.join(p, "cute")) # CuTe 在 cutlass/include/cute + + for p in cand: + if ok(p): + return os.path.abspath(p) + + raise RuntimeError( + "Cannot find CUTLASS/CuTe include path.\n" + "Set CUTLASS_HOME or CUTE_HOME to the repository root (the path that contains 'include/cute')." ) + +CUDA = locate_cuda() +CUTLASS_INCLUDE = locate_cutlass_cute() + +print(f"Found cutlass include: {CUTLASS_INCLUDE}") + +CXX_STD = 17 + +arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;89").split(";") +NVCC_ARCH_FLAGS = [] +for a in arch_list: + a = a.strip() + if a: + NVCC_ARCH_FLAGS += ["-gencode", f"arch=compute_{a},code=sm_{a}"] + +COMMON_DEFINES = [] +COMMON_INCLUDES = [ + np.get_include(), + pybind11.get_include(), + pybind11.get_include(user=True), + CUDA["include"], + CUTLASS_INCLUDE, +] +COMMON_LIB_DIRS = [CUDA["libdir"]] +COMMON_LIBS = ["cudart"] +RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] + +debug = os.environ.get("DBG_KERNEL", "0") == "1" + +CXX_FLAGS = [ + f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" +] if not debug else [ + f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma", "-DDBG_KERNEL=1" +] +LINK_FLAGS = [] + +# OpenMP +if sys.platform.startswith("win"): + # MSVC + CXX_FLAGS += ["/openmp"] +else: + CXX_FLAGS += ["-fopenmp"] + LINK_FLAGS += ["-fopenmp"] + +NVCC_FLAGS = [ + f"-std=c++{CXX_STD}", + "-O3", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + "-lineinfo", +] + NVCC_ARCH_FLAGS if not debug else [ + f"-std=c++{CXX_STD}", + "-g", "-O0", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + # "-G", + "-lineinfo", + "-DDBG_KERNEL=1", +] + NVCC_ARCH_FLAGS + +if not sys.platform.startswith("win"): + NVCC_FLAGS += ["-Xcompiler", "-fopenmp"] +else: + # MSVC 的 NVCC 透传 + NVCC_FLAGS += ["-Xcompiler", "/openmp", "-Xcompiler", "/MD", "-Xcompiler", "/O2"] + +# ---------- 扩展模块 ---------- +sources = [ + "csrc/sha256_base.cpp", + "csrc/sha256_simd.cpp", + "csrc/blake3_base.cpp", + "csrc/blake3_sm70_sm80.cu", + "csrc/binding.cpp", ] setup( name="flashashing", - version="0.1.0", - description="High performance hashing (SHA-256, BLAKE3) implementation with pybind11", - ext_modules=ext_modules, - cmdclass={"build_ext": build_ext}, - zip_safe=False, + ext_modules=[ + CUDAExtension( + 
"flashashing", + sources=sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + include_dirs=COMMON_INCLUDES, + library_dirs=COMMON_LIB_DIRS, + libraries=COMMON_LIBS, + extra_link_args=LINK_FLAGS + (["-Wl,-rpath," + RPATH[0]] if RPATH else []), + ) + ], + cmdclass={"build_ext": BuildExtension}, )