From 6765dc54bd4a0afd95c02d9559fe2aebcd4f39a1 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 01:20:38 +0800 Subject: [PATCH 01/20] gpu --- .gitignore | 4 +- benchmark/test_gpu.py | 25 +++ csrc/binding.cpp | 119 +++++++++++- csrc/blake3.h | 9 +- csrc/blake3_sm80.cu | 410 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 189 +++++++++++++++---- 6 files changed, 722 insertions(+), 34 deletions(-) create mode 100644 benchmark/test_gpu.py create mode 100644 csrc/blake3_sm80.cu diff --git a/.gitignore b/.gitignore index aebc9d7..dfa07fd 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ run.sh build/ dist/ -*.egg-info/ \ No newline at end of file +*.egg-info/ + +*.ncu-rep \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py new file mode 100644 index 0000000..c867025 --- /dev/null +++ b/benchmark/test_gpu.py @@ -0,0 +1,25 @@ +import flashashing as fh +import hashlib +import torch +import time + +GiB = 1024**3 + +s = "A" * GiB # 1 GiB +data = s.encode("utf-8") +print(len(data)) # 1073741824 + +t0 = time.perf_counter() +repeat = 20 +for i in range(repeat): + cv_hex = fh.blake3_gpu_sm80_hex(data) +torch.cuda.synchronize() + +t1 = time.perf_counter() +elapsed = t1 - t0 +print(f"Elapsed time for f{repeat}x BLAKE3 (GPU SM80): {elapsed:.3f} seconds") +print(f"Throughput: {repeat * len(data) / elapsed / (1024**2):.2f} MiB/s") +print("root CV (hex) =", cv_hex) + +expected_sha256_result = hashlib.sha256(data).hexdigest() +print(f"SHA256 Expected: {expected_sha256_result}") \ No newline at end of file diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 2da163f..54bb3d6 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -76,6 +76,109 @@ static std::string blake3_hash_naive(py::object obj) { } } + +struct PyBytesView { + const uint8_t* ptr = nullptr; + size_t len = 0; + std::string storage; + py::object keep_alive; +}; + +static PyBytesView get_bytes_view(py::object obj) { + PyBytesView v; + + // bytes -> 复制到 storage + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // str -> 按 UTF-8 编码复制;如果你想按“原始字节”处理,建议在 Python 侧先 .encode() + if (py::isinstance(obj)) { + v.storage = obj.cast(); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // bytearray -> 正确姿势:转成 py::bytes 再拿 std::string(不要用迭代器) + if (py::isinstance(obj)) { + v.storage = static_cast(py::bytes(obj)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + // 任意 buffer(memoryview / numpy 等) + if (PyObject_CheckBuffer(obj.ptr())) { + py::buffer buf = py::reinterpret_borrow(obj); + py::buffer_info info = buf.request(/*writable=*/false); + + // 一维 C 连续:零拷贝,注意保活 + if (info.ndim == 1 && + info.strides.size() == 1 && + info.strides[0] == static_cast(info.itemsize)) { + v.ptr = reinterpret_cast(info.ptr); + v.len = static_cast(info.size) * static_cast(info.itemsize); + v.keep_alive = obj; // 保活底层内存 + return v; + } + + // 非连续:用 tobytes() 拷贝成线性字节 + py::object tobytes = obj.attr("tobytes")(); + v.storage = static_cast(py::bytes(tobytes)); + v.ptr = reinterpret_cast(v.storage.data()); + v.len = v.storage.size(); + return v; + } + + throw std::invalid_argument( + "blake3_gpu_root_* expects bytes/str/bytearray/memoryview/numpy buffer."); +} + +static std::string cv_words_to_bytes_le(const std::array& cv) { + std::string out; + out.resize(32); + uint8_t* p = reinterpret_cast(&out[0]); + for (int i = 0; i 
< 8; ++i) { + uint32_t w = cv[i]; + p[4*i + 0] = static_cast( w & 0xFF); + p[4*i + 1] = static_cast((w >> 8) & 0xFF); + p[4*i + 2] = static_cast((w >> 16) & 0xFF); + p[4*i + 3] = static_cast((w >> 24) & 0xFF); + } + return out; +} + +struct GilRelease { + py::gil_scoped_release rel; +}; + +static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { + auto v = get_bytes_view(obj); + std::array root{}; + { + GilRelease _g; + blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); + } + std::string b = cv_words_to_bytes_le(root); + return py::bytes(b); +} + +static std::string blake3_gpu_root_hex(py::object obj) { + auto v = get_bytes_view(obj); + std::array root{}; + { + GilRelease _g; + blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); + } + std::string b = cv_words_to_bytes_le(root); + return bytes_to_hex(reinterpret_cast(b.data()), b.size()); +} + + PYBIND11_MODULE(flashashing, m) { m.doc() = "SHA-256 and Blake3 bindings (pybind11)"; @@ -84,9 +187,23 @@ PYBIND11_MODULE(flashashing, m) { "Compute SHA-256 of a str/bytes and return hex string."); m.def("hash_simd", &hash_hex_simd4, py::arg("data"), - "SIMD enhanced sha256."), + "SIMD enhanced sha256."); m.def("blake3_hash_naive", &blake3_hash_naive, py::arg("data"), "Compute BLAKE3 hash (single-threaded)."); + + m.def("blake3_gpu_sm80", + &blake3_gpu_root_cv_bytes, + py::arg("data"), + R"pbdoc( +Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. +NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. +)pbdoc"); + m.def("blake3_gpu_sm80_hex", + &blake3_gpu_root_hex, + py::arg("data"), + R"pbdoc( +Return the hex string of the *root chaining value* (CV) computed on GPU. +)pbdoc"); } \ No newline at end of file diff --git a/csrc/blake3.h b/csrc/blake3.h index 55aaa43..c1e27de 100644 --- a/csrc/blake3.h +++ b/csrc/blake3.h @@ -2,6 +2,9 @@ #include #include +#include +#include +#include namespace flashashing { @@ -23,4 +26,8 @@ class Blake3 { std::string bytes_to_hex(const uint8_t *data, size_t len); -} // namespace flashashing \ No newline at end of file +} // namespace flashashing + +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0); \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm80.cu new file mode 100644 index 0000000..53c3575 --- /dev/null +++ b/csrc/blake3_sm80.cu @@ -0,0 +1,410 @@ + +#include +#include +#include +#include + +#define WARP_SIZE 32 + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; +// 轮常量/消息置换表…(略) +// TODO: 按 BLAKE3/BLAKE2s 规范补齐 + +// ---- 小工具 ---- +__device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { + // 使用 funnel shift 更快(算力>=Volta都有) + return __funnelshift_r(x, x, n); +} + +__device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B + const uint32_t cv[8], // 8×u32 + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; 
i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + // 混入计数、block_len、flags,和 block_words(简化占位) + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + +#pragma unroll + for (int i = 0; i < 16; ++i) { + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? (1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============ +// Each block has 512 threads +// 1 warp process 32 chunk -> 32 KiB +// NUM_WARPS = 512 / 32 = 16 +// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 16 = 512 B shared memory in sum + __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflict + + // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1 + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id; + const uint64_t chunk_counter = base_chunk_counter + global_warp_id; + + // index + const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the start of this warp + // each thread process one chunk + const uint64_t chunk_idx = warp_chunk_base + lane_id; + + // edge processing + int valid = total_chunks - warp_chunk_base; + if (valid <= 0) return; // TODO: will this affect warp shfl? 
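        // NOTE (answering the TODO above): `valid` depends only on warp_chunk_base,
        // which is identical for all 32 lanes of a warp, so this early return is
        // warp-uniform -- whole warps exit together, and the __ballot_sync /
        // __shfl_down_sync calls below never mix exited and live lanes of one warp.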
+ if (valid > WARP_SIZE) valid = WARP_SIZE; + + // compute idx for this thread + const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + + uint32_t cv[8] = {0}; // 8 x u32 + bool active = lane_id < valid; + if (active) { + const uint64_t chunk_counter = base_chunk_counter + chunk_idx; + blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + } + + // take care: we cannot use general reduce + // 0-1-2-3-4-...-31, keep this sequential + unsigned mask = __ballot_sync(0xFFFFFFFFu, active); + // step = 1,2,4,8,16 + for (int step = 1; step < WARP_SIZE; step <<= 1) { + int partner_lane = lane_id + step; + + // neighbor cv + uint32_t neighbor_cv[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + } + + // the left be parent, and make sure `the right` is valid + if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { + blake3_parent_cv(cv, neighbor_cv, cv); + } + __syncwarp(mask); + + // in the next level, reduce half of active threads + if (lane_id >= (valid & ~(step))) + active = false; + } + + // now, lane 0 holds the root + if (lane_id == 0) { +#pragma unroll + for (int j = 0 ; j < 8; ++j) + cv_smem[warp_id][j] = cv[j]; + } + __syncthreads(); + + // after all these things, we have finished + // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge + // and now, we are going to implement higher-level merge + // we have 16 warps, each warp has a root cv + // so we are going to execute another logN steps + + // 16 -> 8 -> 4 -> 2 -> 1 + for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); +#pragma unroll + for (int j=0;j<8;++j) + cv_smem[warp_id][j] = p[j]; // write back to shared memory + } + __syncthreads(); + } + + // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep + if (tid == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv_smem[0][j]; + } + +} // blake3_block_reduce_kernel + + +// ============ Tiny kernel ============ +// In big kernel, it will consume 512 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root +// And this tiny kernel is designed to process these 2048 root +template +__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int n) { + extern __shared__ uint32_t smem[]; // dynamic shared memory + uint32_t* tile = smem; // -> [tile_n][8] + + const int tid = threadIdx.x; + const int b = blockIdx.x; + const int B = gridDim.x; + + const int start = (int)((1ll * n * b) / B); + const int end = (int)((1ll * n * (b+1)) / B); + int tile_n = end - start; + + if (tile_n <= 0) return; // border + + const int words = tile_n * 8; + for (int w = tid; w < words; w += NUM_THREADS) { + tile[w] = in_cv32[start * 8 + w]; + } + __syncthreads(); + + int cur = tile_n; + while (cur > 1) { + const int pairs = cur >> 1; // floor(cur/2) + // process pairs + for (int i = tid; i < pairs; i += NUM_THREADS) { + const uint32_t* L = &tile[(2*i) * 8]; + const uint32_t* R = &tile[(2*i+1) * 8]; + uint32_t p[8]; + blake3_parent_cv(L, R, p); +#pragma unroll + for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + } + __syncthreads(); + + // even situation: + if ((cur & 1) && tid == 0) { + uint32_t* dst = &tile[pairs * 8]; + uint32_t* src = &tile[(cur - 1) * 8]; +#pragma unroll + for (int j=0;j<8;++j) + dst[j] = src[j]; + } + __syncthreads(); + + cur = pairs + (cur & 1); + } + + // write output + if (tid == 0) { + uint32_t* out = &out_cv32[b * 8]; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = tile[j]; + } +} + +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + + uint8_t* d_bytes = nullptr; + uint32_t* d_words = nullptr; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // TODO: use thrust + cudaMalloc(&d_bytes, bytes_len); + cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); + d_words = reinterpret_cast(d_bytes); + + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + + // launch big kernel + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + 
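        // NOTE: host_root is pageable stack memory, so this cudaMemcpyAsync may
        // fall back to a synchronous staged copy; either way, the synchronize
        // below is what guarantees host_root is valid before it is read.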
CUDA_CHECK(cudaStreamSynchronize(stream)); + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + const int B = (num_blocks >= 8) ? 8 : num_blocks; + uint32_t* d_midCV = nullptr; + cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); + + { + dim3 grid(B); + dim3 block(512); // 你指定 “每个 block 512 线程” + // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 + const int tile = (num_blocks + B - 1) / B; + const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); + + blake3_pair_reduce_kernel<512> + <<>>(d_blockCV, d_midCV, num_blocks); + CUDA_CHECK(cudaGetLastError()); + } + + // second round + uint32_t* d_root = nullptr; + cudaMalloc(&d_root, 8u * sizeof(uint32_t)); + + { + dim3 grid(1); + dim3 block(B); + const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); + + // generate kernel during compile time + switch (B) { + case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; + case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; + case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; + case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; + case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; + case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; + case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; + default: { + blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); + } break; + } + CUDA_CHECK(cudaGetLastError()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_root)); + CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/setup.py b/setup.py index eb3a2e4..749c743 100644 --- a/setup.py +++ b/setup.py @@ -1,39 +1,166 @@ +# setup.py from setuptools import setup -from pybind11.setup_helpers import Pybind11Extension, build_ext -import sys -import sysconfig - -import numpy as np - -cxx_std = 17 -extra_compile_args = [] -extra_link_args = [] - -# linux environment, default -extra_compile_args += ["-O2", "-ffast-math", "-march=native", "-fopenmp", "-Wall", "-Wextra", "-Wpedantic", "-mavx2", "-mfma"] -extra_link_args += ["-fopenmp"] - -ext_modules = [ - Pybind11Extension( - "flashashing", - sources=[ - "csrc/sha256_base.cpp", - "csrc/sha256_simd.cpp", - "csrc/blake3_base.cpp", - "csrc/binding.cpp" - ], - include_dirs=[np.get_include()], - cxx_std=cxx_std, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, - ) +from setuptools.command.build_ext import build_ext +from pybind11.setup_helpers import Pybind11Extension +import pybind11, numpy as np +import sys, os, shutil + +def find_in_path(name, path): + for d in path.split(os.pathsep): + p = os.path.join(d, name) + if os.path.exists(p): + return os.path.abspath(p) + return None + +def locate_cuda(): + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + nvcc = None + if cuda_home: + nvcc = os.path.join(cuda_home, "bin", "nvcc") + else: + nvcc = 
find_in_path("nvcc", os.environ.get("PATH", "")) + if nvcc: + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + if not nvcc or not os.path.exists(nvcc): + raise RuntimeError("nvcc not found. Set CUDA_HOME/CUDA_PATH or add nvcc to PATH.") + cuda_include = os.path.join(cuda_home, "include") + if sys.platform.startswith("win"): + cuda_libdir = os.path.join(cuda_home, "lib", "x64") + else: + cuda_libdir = os.path.join(cuda_home, "lib64") + return {"home": cuda_home, "nvcc": nvcc, "include": cuda_include, "libdir": cuda_libdir} + +CUDA = locate_cuda() + +CXX_STD = 17 + +arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;86;89").split(";") +NVCC_ARCH_FLAGS = [] +for a in arch_list: + a = a.strip() + if a: + NVCC_ARCH_FLAGS += ["-gencode", f"arch=compute_{a},code=sm_{a}"] + +COMMON_DEFINES = [] +COMMON_INCLUDES = [np.get_include(), pybind11.get_include(), pybind11.get_include(user=True), CUDA["include"]] +COMMON_LIB_DIRS = [CUDA["libdir"]] +COMMON_LIBS = ["cudart"] +RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] + +CXX_FLAGS = [ + f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math" ] +LINK_FLAGS = [] + +# OpenMP +if sys.platform.startswith("win"): + # MSVC + CXX_FLAGS += ["/openmp"] +else: + CXX_FLAGS += ["-fopenmp"] + LINK_FLAGS += ["-fopenmp"] + +NVCC_FLAGS = [ + f"-std=c++{CXX_STD}", + "-O3", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + "-lineinfo", +] + NVCC_ARCH_FLAGS + +if not sys.platform.startswith("win"): + NVCC_FLAGS += ["-Xcompiler", "-fopenmp"] +else: + # MSVC 的 NVCC 透传 + NVCC_FLAGS += ["-Xcompiler", "/openmp", "-Xcompiler", "/MD", "-Xcompiler", "/O2"] + +# give .cu to nvcc +from distutils.unixccompiler import UnixCCompiler + +class BuildExtNVCC(build_ext): + def build_extensions(self): + + self.compiler.src_extensions.append(".cu") + + original_compile = self.compiler.compile + + def nvcc_compile(sources, output_dir=None, macros=None, include_dirs=None, + debug=0, extra_preargs=None, extra_postargs=None, depends=None): + cxx_sources, cu_sources = [], [] + for s in sources: + (cu_sources if os.path.splitext(s)[1] == ".cu" else cxx_sources).append(s) + + objects = [] + if cxx_sources: + + postargs = extra_postargs.get("cxx", []) if isinstance(extra_postargs, dict) else extra_postargs + objects += original_compile( + cxx_sources, output_dir, macros, include_dirs, debug, extra_preargs, postargs, depends + ) + + if cu_sources: + + for src in cu_sources: + obj = self.compiler.object_filenames([src], output_dir=output_dir)[0] + cmd = [CUDA["nvcc"], "-c", src, "-o", obj] + NVCC_FLAGS + + incs = include_dirs or [] + for inc in incs: + cmd += ["-I", inc] + + if macros: + for m in macros: + if isinstance(m, tuple): + name, val = m + cmd += ["-D%s=%s" % (name, val)] + else: + cmd += ["-D%s" % m] + + if isinstance(extra_postargs, dict): + cmd += extra_postargs.get("nvcc", []) + elif extra_postargs: + cmd += extra_postargs + + os.makedirs(os.path.dirname(obj), exist_ok=True) + self.spawn(cmd) + objects.append(obj) + return objects + + self.compiler.compile = nvcc_compile + + for ext in self.extensions: + if not sys.platform.startswith("win"): + ext.runtime_library_dirs = list(set((ext.runtime_library_dirs or []) + RPATH)) + build_ext.build_extensions(self) + +# ---------- 扩展模块 ---------- +sources = [ + "csrc/sha256_base.cpp", + "csrc/sha256_simd.cpp", + "csrc/blake3_base.cpp", + "csrc/blake3_sm80.cu", + "csrc/binding.cpp", +] + +ext = Pybind11Extension( + "flashashing", + sources=sources, + 
include_dirs=COMMON_INCLUDES, + library_dirs=COMMON_LIB_DIRS, + libraries=COMMON_LIBS, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": [] + }, + extra_link_args=LINK_FLAGS, + define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES], +) setup( name="flashashing", version="0.1.0", - description="High performance hashing (SHA-256, BLAKE3) implementation with pybind11", - ext_modules=ext_modules, - cmdclass={"build_ext": build_ext}, + description="High performance hashing (SHA-256, BLAKE3) with CUDA + pybind11", + ext_modules=[ext], + cmdclass={"build_ext": BuildExtNVCC}, zip_safe=False, ) From 4f31a2c55551965a2e5f048565f021c4551554ae Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 14:03:27 +0800 Subject: [PATCH 02/20] finish v1 building --- .gitignore | 1 + benchmark/test_gpu.py | 8 +++++--- csrc/blake3_sm80.cu | 41 ++++++++++++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index dfa07fd..5f07045 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ run.sh +profile.sh **.__pycache__ diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index c867025..75fcef9 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -2,13 +2,16 @@ import hashlib import torch import time +import blake3 -GiB = 1024**3 +GiB = 1024*1024*1024 # bytes -> 1 GiB s = "A" * GiB # 1 GiB data = s.encode("utf-8") print(len(data)) # 1073741824 +std_hex = blake3.blake3(data).hexdigest() + t0 = time.perf_counter() repeat = 20 for i in range(repeat): @@ -21,5 +24,4 @@ print(f"Throughput: {repeat * len(data) / elapsed / (1024**2):.2f} MiB/s") print("root CV (hex) =", cv_hex) -expected_sha256_result = hashlib.sha256(data).hexdigest() -print(f"SHA256 Expected: {expected_sha256_result}") \ No newline at end of file +print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm80.cu index 53c3575..a58ad5f 100644 --- a/csrc/blake3_sm80.cu +++ b/csrc/blake3_sm80.cu @@ -15,20 +15,21 @@ } \ } while(0) -__constant__ uint32_t BLAKE3_IV[8] = { +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u }; -// 轮常量/消息置换表…(略) -// TODO: 按 BLAKE3/BLAKE2s 规范补齐 // ---- 小工具 ---- -__device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { - // 使用 funnel shift 更快(算力>=Volta都有) +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif } -__device__ void blake3_compress_words_7r( +__host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B const uint32_t cv[8], // 8×u32 uint64_t chunk_counter, // 64-bit @@ -60,7 +61,7 @@ __device__ void blake3_compress_words_7r( } // 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) -__device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ #pragma unroll for (int i = 0; i < 8; ++i) out_cv[i] = st[i] ^ st[8+i]; @@ -288,6 +289,23 @@ __global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, } } +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + uint32_t zero_block[16] = {0}; + uint32_t st[16]; + 
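    // NOTE: per the BLAKE3 spec, the ROOT compression must reuse the root node's
    // own block words and block_len, not an all-zero 64-byte block; this
    // zero-block call is only consistent with the placeholder compression above
    // and needs revisiting once the real 7 rounds land (a spec-correct sketch is
    // appended at the end of this excerpt).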
blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, std::array* root_out = nullptr, cudaStream_t stream = 0) { @@ -333,6 +351,11 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + if (root_out) *root_out = host_root; else { // 简单打印 @@ -394,6 +417,10 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + if (root_out) { *root_out = host_root; } else { From 2e71051367a9533af36f6c54a46876e20bc0bab5 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 21:17:46 +0800 Subject: [PATCH 03/20] coalsced gmem access --- README.md | 7 +- csrc/blake3_sm80.cu | 636 ++++++++++++++++++++++++++++++----------- csrc/blake3_sm80_v1.cu | 454 +++++++++++++++++++++++++++++ setup.py | 12 +- 4 files changed, 942 insertions(+), 167 deletions(-) create mode 100644 csrc/blake3_sm80_v1.cu diff --git a/README.md b/README.md index ef873a4..854f865 100644 --- a/README.md +++ b/README.md @@ -19,4 +19,9 @@ python benchmark/test_script.py + 10.3: Implement `SHA256-SIMD` version, still some flaws: only allow short string processing (`len < 55`) + 10.2: Consultation with doctor, invite to repo, add `BLAKE3` basic implementations -+ 10.1: Provide basic template for project, implementing basic sha256 in C++. \ No newline at end of file ++ 10.1: Provide basic template for project, implementing basic sha256 in C++. 
+ +# GPU kernel performance on File Compress hashing + ++ 10.5 - v1 - [commit:`4f31a2c55551965a2e5f048565f021c4551554ae`]: 1709.34 MiB/s ++ 10.5 - v2 - []: 1961.36 MiB/s \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm80.cu index a58ad5f..bbc2ae3 100644 --- a/csrc/blake3_sm80.cu +++ b/csrc/blake3_sm80.cu @@ -3,8 +3,10 @@ #include #include #include +#include #define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) #define CUDA_CHECK(expr) do { \ cudaError_t _e = (expr); \ @@ -29,9 +31,18 @@ __host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { #endif } +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + __host__ __device__ void blake3_compress_words_7r( - const uint32_t block_words[16], // 64B - const uint32_t cv[8], // 8×u32 + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory uint64_t chunk_counter, // 64-bit uint32_t block_len, // [0..64] uint32_t flags, // CHUNK_START/END/PARENT/ROOT… @@ -46,14 +57,18 @@ __host__ __device__ void blake3_compress_words_7r( for (int i = 0; i < 8; ++i) out_state[8+i] = BLAKE3_IV[i]; - // 混入计数、block_len、flags,和 block_words(简化占位) out_state[12] ^= (uint32_t)chunk_counter; out_state[13] ^= (uint32_t)(chunk_counter >> 32); out_state[14] ^= block_len; out_state[15] ^= flags; + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + #pragma unroll - for (int i = 0; i < 16; ++i) { + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); out_state[i] ^= block_words[i]; // 做一点点搅动(占位) out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); @@ -115,14 +130,13 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 blake3_state_to_cv(st, out_cv); } -// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============ -// Each block has 512 threads -// 1 warp process 32 chunk -> 32 KiB -// NUM_WARPS = 512 / 32 = 16 -// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) template // pad shared memory + const int CHUNK_SIZE=1024, + const int CHUNKS_PER_BLOCK=64, + const int PAD_CHUNK=4, + const int PAD_CV=4> // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, int chunk_len_bytes, @@ -130,169 +144,451 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, int total_chunks) { // NUM_WARPS also stands for NUM_CHUKNS per block constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total - // 8 x 4 x 16 = 512 B shared memory in sum - __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflict + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB 
shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] - // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1 const int tid = threadIdx.x; + const int bx = blockIdx.x; const int warp_id = tid / WARP_SIZE; const int lane_id = tid % WARP_SIZE; - const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id; - const uint64_t chunk_counter = base_chunk_counter + global_warp_id; + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 - // index - const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the start of this warp - // each thread process one chunk - const uint64_t chunk_idx = warp_chunk_base + lane_id; + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. + uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; s_u32[4*idx0 + 1] = v0.y; // when load into shared mem, do manually + s_u32[4*idx0 + 2] = v0.z; s_u32[4*idx0 + 3] = v0.w; + s_u32[4*idx1 + 0] = v1.x; s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } - // edge processing - int valid = total_chunks - warp_chunk_base; - if (valid <= 0) return; // TODO: will this affect warp shfl? 
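    // NOTE: the load pattern in STAGE 1 above is what buys the v2 speedup: per
    // iteration each warp owns one chunk row, and lane k issues two 16-byte loads
    // at uint4 indices k and k+32, so a warp covers its 1 KiB chunk with fully
    // coalesced 128-bit transactions instead of v1's per-thread 1 KiB strided walk.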
- if (valid > WARP_SIZE) valid = WARP_SIZE; + __syncthreads(); // sync all warps - // compute idx for this thread - const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks - uint32_t cv[8] = {0}; // 8 x u32 - bool active = lane_id < valid; - if (active) { - const uint64_t chunk_counter = base_chunk_counter + chunk_idx; - blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 } + // __syncthreads(); - // take care: we cannot use general reduce - // 0-1-2-3-4-...-31, keep this sequential - unsigned mask = __ballot_sync(0xFFFFFFFFu, active); - // step = 1,2,4,8,16 - for (int step = 1; step < WARP_SIZE; step <<= 1) { - int partner_lane = lane_id + step; + auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { + const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); + }; - // neighbor cv - uint32_t neighbor_cv[8]; -#pragma unroll - for (int j = 0; j < 8; ++j) { - neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; + bool have_pass0 = false, have_pass1 = false; + + // ------- pass = 0 ------- + { + const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 + const int right = left + 1; // neighbor: 1,3,5,...,31 + const int pair_idx = left >> 1; // 0..15 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && left < pass0_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && right < pass0_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; } - // the left be parent, and make sure `the right` is valid - if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { - blake3_parent_cv(cv, neighbor_cv, cv); + // merge two neighbor + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && left < pass0_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; } - __syncwarp(mask); + __syncwarp(); + } + + // ---- pass 1 ---- + { + const int left = 32 + (warp_id << 1); // 32,34,...,62 + const int right = left + 1; // 33,35,...,63 + const int pair_idx = left >> 1; // 16..31 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; - // in the next level, reduce half of active threads - if (lane_id >= (valid & ~(step))) - active = false; + if (lane_id == 0 && (left - 32) < pass1_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 
&& (right - 32) < pass1_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // TODO: here we may have some issue: overflow and border situation + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && (left - 32) < pass1_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // write to the right position + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); } - // now, lane 0 holds the root - if (lane_id == 0) { -#pragma unroll - for (int j = 0 ; j < 8; ++j) - cv_smem[warp_id][j] = cv[j]; + __syncthreads(); + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ +void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ +void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; 
// 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... + for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 __syncthreads(); - // after all these things, we have finished - // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge - // and now, we are going to implement higher-level merge - // we have 16 warps, each warp has a root cv - // so we are going to execute another logN steps + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } - // 16 -> 8 -> 4 -> 2 -> 1 - for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { - if (warp_id < stride && lane_id == 0) { - uint32_t p[8]; - blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); -#pragma unroll - for (int j=0;j<8;++j) - cv_smem[warp_id][j] = p[j]; // write back to shared memory + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); } - __syncthreads(); + lane_valid = true; } - // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep - if (tid == 0) { - uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; -#pragma unroll + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll for (int j = 0; j < 8; ++j) - out[j] = cv_smem[0][j]; + lane_cv[j] = 0u; } -} // blake3_block_reduce_kernel + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } -// ============ Tiny kernel ============ -// In big kernel, it will consume 512 KiB each block -// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root -// And this tiny kernel is designed to process these 2048 root -template -__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, - uint32_t* __restrict__ out_cv32, - int n) { - extern __shared__ uint32_t smem[]; // dynamic shared memory - uint32_t* tile = smem; // -> [tile_n][8] - - const int tid = threadIdx.x; - const int b = blockIdx.x; - const int B = gridDim.x; - - const int start = (int)((1ll * n * b) / B); - const int end = (int)((1ll * n * (b+1)) / B); - int tile_n = end - start; - - if (tile_n <= 0) return; // border - - const int words = tile_n * 8; - for (int w = tid; w < words; w += NUM_THREADS) { - tile[w] = in_cv32[start * 8 + w]; + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; } __syncthreads(); - int cur = tile_n; - while (cur > 1) { - const int pairs = cur >> 1; // floor(cur/2) - // process pairs - for (int i = tid; i < pairs; i += NUM_THREADS) { - const uint32_t* L = &tile[(2*i) * 8]; - const uint32_t* R = &tile[(2*i+1) * 8]; + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { uint32_t p[8]; - blake3_parent_cv(L, R, p); -#pragma unroll - for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; } __syncthreads(); - - // even situation: - if ((cur & 1) && tid == 0) { - uint32_t* dst = &tile[pairs * 8]; - uint32_t* src = &tile[(cur - 1) * 8]; -#pragma unroll - for (int j=0;j<8;++j) - dst[j] = src[j]; + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = 
warp_roots[valid_warps-1][j]; } __syncthreads(); - - cur = pairs + (cur & 1); + valid_warps = (valid_warps + 1) >> 1; } - // write output - if (tid == 0) { - uint32_t* out = &out_cv32[b * 8]; -#pragma unroll + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll for (int j = 0; j < 8; ++j) - out[j] = tile[j]; + out[j] = warp_roots[0][j]; } } constexpr uint32_t FLAG_ROOT = 8; inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { - uint32_t zero_block[16] = {0}; + const uint32_t zero_block[16] = {0}; uint32_t st[16]; blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); // 写出前 32 字节(state[0..7],小端) @@ -315,14 +611,28 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, std::abort(); } + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 constexpr int NUM_THREADS = 512; // for big kernel constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 - constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; - const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks uint8_t* d_bytes = nullptr; uint32_t* d_words = nullptr; // alias @@ -333,18 +643,19 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); d_words = reinterpret_cast(d_bytes); - cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB - // launch big kernel + // ============= launch big kernel ============= dim3 grid_big(num_blocks); dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); if (num_blocks == 1) { std::array host_root{}; @@ -371,49 +682,44 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, } // the first round of tiny kernel - const int B = (num_blocks >= 8) ? 
8 : num_blocks; - uint32_t* d_midCV = nullptr; - cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); - + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 { - dim3 grid(B); - dim3 block(512); // 你指定 “每个 block 512 线程” - // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 - const int tile = (num_blocks + B - 1) / B; - const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); - - blake3_pair_reduce_kernel<512> - <<>>(d_blockCV, d_midCV, num_blocks); + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // second round - uint32_t* d_root = nullptr; - cudaMalloc(&d_root, 8u * sizeof(uint32_t)); - + uint32_t* d_root_cv = nullptr; { - dim3 grid(1); - dim3 block(B); - const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); - - // generate kernel during compile time - switch (B) { - case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; - case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; - case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; - case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; - case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; - case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; - case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; - default: { - blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); - } break; - } + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } std::array host_root{}; - CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -430,8 +736,8 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, } // clear - CUDA_CHECK(cudaFree(d_root)); - CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); CUDA_CHECK(cudaFree(d_blockCV)); CUDA_CHECK(cudaFree(d_bytes)); } \ No newline at end of file diff --git a/csrc/blake3_sm80_v1.cu b/csrc/blake3_sm80_v1.cu new file mode 100644 index 0000000..5f08a97 --- /dev/null +++ b/csrc/blake3_sm80_v1.cu @@ -0,0 +1,454 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 
0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> 16 x 4 bytes = 64 B = 4 x 16 bytes = 4 x 128 bits + const uint32_t cv[8], // 8×u32 + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (illustrative: bit1)
+
+        blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st);
+        blake3_state_to_cv(st, cv);
+    }
+
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_cv[i] = cv[i];
+}
+
+__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){
+    uint32_t msg[16];
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[i] = L[i];
+    }
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[8+i] = R[i];
+    }
+    uint32_t st[16];
+    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st);
+    blake3_state_to_cv(st, out_cv);
+}
+
+// ============ Big kernel: 1 warp -> 32 chunks, 1 thread = 1 chunk, 16 WARPS in total ============
+// Each block has 512 threads
+// 1 warp processes 32 chunks -> 32 KiB
+// NUM_WARPS = 512 / 32 = 16
+// Each block processes 16 x 32 chunks = 16 x 32 KiB = 512 KiB
+template <const int NUM_THREADS = 512, const int CHUNK_SIZE = 1024, const int PADSIZE = 0> // pad shared memory
+__global__ void blake3_block_reduce_kernel(uint32_t* d_input,
+                                           uint32_t* block_cvs,
+                                           int chunk_len_bytes,
+                                           uint64_t base_chunk_counter,
+                                           int total_chunks) {
+    // NUM_WARPS also stands for NUM_CHUNKS per block
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16
+
+    // 8 x u32 -> one chain value; we have `NUM_WARPS` chain values in total
+    // 8 x 4 x 16 = 512 B shared memory in sum
+    __shared__ uint32_t cv_smem[NUM_WARPS][8 + PADSIZE]; // avoid bank conflicts
+
+    // reduce pipeline: 16 -> 8 -> 4 -> 2 -> 1
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    const uint64_t global_warp_id = blockIdx.x * NUM_WARPS + warp_id;
+    const uint64_t chunk_counter = base_chunk_counter + global_warp_id;
+
+    // index
+    const uint64_t warp_chunk_base = global_warp_id * WARP_SIZE; // the first chunk of this warp
+    // each thread processes one chunk
+    const uint64_t chunk_idx = warp_chunk_base + lane_id;
+
+    // edge processing
+    int valid = total_chunks - warp_chunk_base;
+    if (valid <= 0) return; // TODO: will this affect warp shfl?
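+    // Note: `valid` depends only on warp_chunk_base, which is identical across all
+    // 32 lanes of a warp, so this early return always retires whole warps at once.
+    // The __shfl_down_sync / __ballot_sync calls below therefore only run in warps
+    // where every lane is still resident; whole-warp exits cannot desynchronize the
+    // shuffles (only per-lane divergence could).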
+ if (valid > WARP_SIZE) valid = WARP_SIZE; + + // compute idx for this thread + const int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + const uint32_t* chunk_words_ptr = d_input + (size_t)chunk_idx * WORDS_PER_CHUNK; + + uint32_t cv[8] = {0}; // 8 x u32 + bool active = lane_id < valid; + if (active) { + const uint64_t chunk_counter = base_chunk_counter + chunk_idx; + blake3_leaf_cv(chunk_words_ptr, chunk_len_bytes, chunk_counter, cv); + } + + // Stage 2: merging to parent, in O(logN) depth + + // take care: we cannot use general reduce + // 0-1-2-3-4-...-31, keep this sequential + unsigned mask = __ballot_sync(0xFFFFFFFFu, active); + // step = 1,2,4,8,16 + for (int step = 1; step < WARP_SIZE; step <<= 1) { + int partner_lane = lane_id + step; + + // neighbor cv + uint32_t neighbor_cv[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + neighbor_cv[j] = __shfl_down_sync(mask, cv[j], step); + } + + // the left be parent, and make sure `the right` is valid + if (active && ((lane_id & ((step << 1) - 1)) == 0) && (partner_lane < valid)) { + blake3_parent_cv(cv, neighbor_cv, cv); + } + __syncwarp(mask); + + // in the next level, reduce half of active threads + if (lane_id >= (valid & ~(step))) + active = false; + } + + // now, lane 0 holds the root + if (lane_id == 0) { +#pragma unroll + for (int j = 0 ; j < 8; ++j) + cv_smem[warp_id][j] = cv[j]; + } + __syncthreads(); + + // after all these things, we have finished + // 32 -> 16 -> 8 -> 4 -> 2 -> 1 merge + // and now, we are going to implement higher-level merge + // we have 16 warps, each warp has a root cv + // so we are going to execute another logN steps + + // 16 -> 8 -> 4 -> 2 -> 1 + for (int stride = NUM_WARPS >> 1; stride >= 1; stride >>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&cv_smem[2*warp_id][0], &cv_smem[2*warp_id + 1][0], p); +#pragma unroll + for (int j=0;j<8;++j) + cv_smem[warp_id][j] = p[j]; // write back to shared memory + } + __syncthreads(); + } + + // write this root cv to global memory, not done yet! 
we need another tiny kernel to sweep + if (tid == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv_smem[0][j]; + } + +} // blake3_block_reduce_kernel + + +// ============ Tiny kernel ============ +// In big kernel, it will consume 512 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 512 root = 2048 root +// And this tiny kernel is designed to process these 2048 root +template +__global__ void blake3_pair_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int n) { + extern __shared__ uint32_t smem[]; // dynamic shared memory + uint32_t* tile = smem; // -> [tile_n][8] + + const int tid = threadIdx.x; + const int b = blockIdx.x; + const int B = gridDim.x; + + const int start = (int)((1ll * n * b) / B); + const int end = (int)((1ll * n * (b+1)) / B); + int tile_n = end - start; + + if (tile_n <= 0) return; // border + + const int words = tile_n * 8; + for (int w = tid; w < words; w += NUM_THREADS) { + tile[w] = in_cv32[start * 8 + w]; + } + __syncthreads(); + + int cur = tile_n; + while (cur > 1) { + const int pairs = cur >> 1; // floor(cur/2) + // process pairs + for (int i = tid; i < pairs; i += NUM_THREADS) { + const uint32_t* L = &tile[(2*i) * 8]; + const uint32_t* R = &tile[(2*i+1) * 8]; + uint32_t p[8]; + blake3_parent_cv(L, R, p); +#pragma unroll + for (int j=0;j<8;++j) tile[i*8 + j] = p[j]; + } + __syncthreads(); + + // even situation: + if ((cur & 1) && tid == 0) { + uint32_t* dst = &tile[pairs * 8]; + uint32_t* src = &tile[(cur - 1) * 8]; +#pragma unroll + for (int j=0;j<8;++j) + dst[j] = src[j]; + } + __syncthreads(); + + cur = pairs + (cur & 1); + } + + // write output + if (tid == 0) { + uint32_t* out = &out_cv32[b * 8]; +#pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = tile[j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK= NUM_WARPS * WARP_SIZE; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); + + uint8_t* d_bytes = nullptr; + uint32_t* d_words = nullptr; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // TODO: use thrust + cudaMalloc(&d_bytes, bytes_len); + cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); + d_words = 
reinterpret_cast(d_bytes); + + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); + + // launch big kernel + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + const int B = (num_blocks >= 8) ? 8 : num_blocks; + uint32_t* d_midCV = nullptr; + cudaMalloc(&d_midCV, (size_t)B * 8u * sizeof(uint32_t)); + + { + dim3 grid(B); + dim3 block(512); // 你指定 “每个 block 512 线程” + // 每个 block 负责 ceil(num_blocks / B) 个 CV;SMEM 大小按此计算 + const int tile = (num_blocks + B - 1) / B; + const size_t smem_bytes = (size_t)tile * 8u * sizeof(uint32_t); + + blake3_pair_reduce_kernel<512> + <<>>(d_blockCV, d_midCV, num_blocks); + CUDA_CHECK(cudaGetLastError()); + } + + // second round + uint32_t* d_root = nullptr; + cudaMalloc(&d_root, 8u * sizeof(uint32_t)); + + { + dim3 grid(1); + dim3 block(B); + const size_t smem_bytes = (size_t)B * 8u * sizeof(uint32_t); + + // generate kernel during compile time + switch (B) { + case 1: blake3_pair_reduce_kernel<1 ><<>>(d_midCV, d_root, B); break; + case 2: blake3_pair_reduce_kernel<2 ><<>>(d_midCV, d_root, B); break; + case 4: blake3_pair_reduce_kernel<4 ><<>>(d_midCV, d_root, B); break; + case 8: blake3_pair_reduce_kernel<8 ><<>>(d_midCV, d_root, B); break; + case 16: blake3_pair_reduce_kernel<16><<>>(d_midCV, d_root, B); break; + case 32: blake3_pair_reduce_kernel<32><<>>(d_midCV, d_root, B); break; + case 64: blake3_pair_reduce_kernel<64><<>>(d_midCV, d_root, B); break; + default: { + blake3_pair_reduce_kernel<256><<>>(d_midCV, d_root, B); + } break; + } + CUDA_CHECK(cudaGetLastError()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_root)); + CUDA_CHECK(cudaFree(d_midCV)); + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/setup.py b/setup.py index 749c743..b2fee0d 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,12 @@ def locate_cuda(): COMMON_LIBS = ["cudart"] RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] +debug = False + CXX_FLAGS = [ - f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math" + f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" +] if not debug else [ + f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", 
"-ffast-math", "-march=native", "-mavx2", "-mfma" ] LINK_FLAGS = [] @@ -66,6 +70,12 @@ def locate_cuda(): "--expt-relaxed-constexpr", "--use_fast_math", "-lineinfo", +] + NVCC_ARCH_FLAGS if not debug else [ + f"-std=c++{CXX_STD}", + "-g", "-O0", "-Xcompiler", "-fPIC", + "--expt-relaxed-constexpr", + "--use_fast_math", + "-G", "-lineinfo", ] + NVCC_ARCH_FLAGS if not sys.platform.startswith("win"): From fd142cc71197483ecb0bf74c10782931a2b7c259 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Sun, 5 Oct 2025 21:30:54 +0800 Subject: [PATCH 04/20] update scores --- README.md | 4 ++-- benchmark/test_gpu.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 854f865..a99d68b 100644 --- a/README.md +++ b/README.md @@ -23,5 +23,5 @@ python benchmark/test_script.py # GPU kernel performance on File Compress hashing -+ 10.5 - v1 - [commit:`4f31a2c55551965a2e5f048565f021c4551554ae`]: 1709.34 MiB/s -+ 10.5 - v2 - []: 1961.36 MiB/s \ No newline at end of file ++ 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s ++ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 75fcef9..b009526 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -12,11 +12,18 @@ std_hex = blake3.blake3(data).hexdigest() +# 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 +for _ in range(2): + fh.blake3_gpu_sm80_hex(data) +torch.cuda.synchronize() + +# 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) +repeat = 5 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() -repeat = 20 -for i in range(repeat): +for _ in range(repeat): cv_hex = fh.blake3_gpu_sm80_hex(data) torch.cuda.synchronize() +t1 = time.perf_counter() t1 = time.perf_counter() elapsed = t1 - t0 From 329d5425de9558e9e43216c2b89bed43674b7d83 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Mon, 6 Oct 2025 12:19:27 +0800 Subject: [PATCH 05/20] update kernel to SOTA --- .gitignore | 3 +- benchmark/test_gpu.py | 26 +++--- csrc/binding.cpp | 50 ++++++----- csrc/blake3.h | 5 +- csrc/{blake3_sm80.cu => blake3_sm70.cu} | 37 +++++++-- setup.py | 105 ++++++------------------ 6 files changed, 104 insertions(+), 122 deletions(-) rename csrc/{blake3_sm80.cu => blake3_sm70.cu} (96%) diff --git a/.gitignore b/.gitignore index 5f07045..ee53a7a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ build/ dist/ *.egg-info/ -*.ncu-rep \ No newline at end of file +*.ncu-rep +*.nsys-rep \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index b009526..4e49107 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -1,34 +1,40 @@ +import torch import flashashing as fh import hashlib -import torch import time import blake3 GiB = 1024*1024*1024 # bytes -> 1 GiB -s = "A" * GiB # 1 GiB -data = s.encode("utf-8") -print(len(data)) # 1073741824 +cpu = torch.empty(GiB * 1, dtype=torch.uint8, pin_memory=True) +cpu[:] = ord('A') + +# 一次性 H2D(可重用) +d = torch.empty_like(cpu, device='cuda') +d.copy_(cpu, non_blocking=True) +torch.cuda.synchronize() + +stream = torch.cuda.current_stream().cuda_stream -std_hex = blake3.blake3(data).hexdigest() +# std_hex = blake3.blake3(data).hexdigest() # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 for _ in range(2): - fh.blake3_gpu_sm80_hex(data) + fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) repeat = 5 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): - cv_hex = fh.blake3_gpu_sm80_hex(data) + 
cv_hex = fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() t1 = time.perf_counter() t1 = time.perf_counter() elapsed = t1 - t0 -print(f"Elapsed time for f{repeat}x BLAKE3 (GPU SM80): {elapsed:.3f} seconds") -print(f"Throughput: {repeat * len(data) / elapsed / (1024**2):.2f} MiB/s") +print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds") +print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s") print("root CV (hex) =", cv_hex) -print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file +# print(f"std BLAKE3 Expected: {std_hex}") \ No newline at end of file diff --git a/csrc/binding.cpp b/csrc/binding.cpp index 54bb3d6..d32b172 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -156,23 +156,29 @@ struct GilRelease { py::gil_scoped_release rel; }; -static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { - auto v = get_bytes_view(obj); - std::array root{}; - { - GilRelease _g; - blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); - } - std::string b = cv_words_to_bytes_le(root); - return py::bytes(b); -} +// static py::bytes blake3_gpu_root_cv_bytes(py::object obj) { +// auto v = get_bytes_view(obj); +// std::array root{}; +// { +// GilRelease _g; +// blake3_block_reduce_sm70(v.ptr, static_cast(v.len), &root, /*stream=*/0); +// } +// std::string b = cv_words_to_bytes_le(root); +// return py::bytes(b); +// } + +static std::string blake3_gpu_root_hex(uint64_t device_ptr, + uint64_t nbytes, + uint64_t stream_int = 0) { + auto d_data = reinterpret_cast(device_ptr); + auto stream = reinterpret_cast(stream_int); + cudaSetDevice(0); + cudaFree(0); // attach -static std::string blake3_gpu_root_hex(py::object obj) { - auto v = get_bytes_view(obj); std::array root{}; { GilRelease _g; - blake3_block_reduce_sm80(v.ptr, static_cast(v.len), &root, /*stream=*/0); + blake3_block_reduce_sm70(d_data, nbytes, &root, stream); } std::string b = cv_words_to_bytes_le(root); return bytes_to_hex(reinterpret_cast(b.data()), b.size()); @@ -193,16 +199,16 @@ PYBIND11_MODULE(flashashing, m) { py::arg("data"), "Compute BLAKE3 hash (single-threaded)."); - m.def("blake3_gpu_sm80", - &blake3_gpu_root_cv_bytes, - py::arg("data"), - R"pbdoc( -Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. -NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. -)pbdoc"); - m.def("blake3_gpu_sm80_hex", +// m.def("blake3_gpu_sm70", +// &blake3_gpu_root_cv_bytes, +// py::arg("data"), +// R"pbdoc( +// Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. +// NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. +// )pbdoc"); + m.def("blake3_gpu_sm70_hex", &blake3_gpu_root_hex, - py::arg("data"), + py::arg("d_data"), py::arg("nbytes"), py::arg("stream")=0, R"pbdoc( Return the hex string of the *root chaining value* (CV) computed on GPU. 
)pbdoc"); diff --git a/csrc/blake3.h b/csrc/blake3.h index c1e27de..52e762a 100644 --- a/csrc/blake3.h +++ b/csrc/blake3.h @@ -28,6 +28,9 @@ std::string bytes_to_hex(const uint8_t *data, size_t len); } // namespace flashashing -void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, + +// ============== GPU implementations ================ +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, std::array* root_out = nullptr, cudaStream_t stream = 0); \ No newline at end of file diff --git a/csrc/blake3_sm80.cu b/csrc/blake3_sm70.cu similarity index 96% rename from csrc/blake3_sm80.cu rename to csrc/blake3_sm70.cu index bbc2ae3..03a3969 100644 --- a/csrc/blake3_sm80.cu +++ b/csrc/blake3_sm70.cu @@ -602,7 +602,8 @@ inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out3 } // wrapper function -void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, std::array* root_out = nullptr, cudaStream_t stream = 0) { if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { @@ -611,6 +612,22 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, std::abort(); } + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + int optin = 0, deflt = 0; cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); @@ -625,6 +642,7 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, blake3_cv_block_reduce_kernel<32, 2048, 0>, cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 constexpr int NUM_THREADS = 512; // for big kernel @@ -633,16 +651,17 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks - - uint8_t* d_bytes = nullptr; - uint32_t* d_words = nullptr; // alias + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 - // TODO: use thrust - cudaMalloc(&d_bytes, bytes_len); - cudaMemcpyAsync(d_bytes, data, bytes_len, cudaMemcpyHostToDevice, stream); - d_words = reinterpret_cast(d_bytes); + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
+ // TODO: use thrust cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB // ============= launch big kernel ============= @@ -739,5 +758,5 @@ void blake3_block_reduce_sm80(const uint8_t* data, uint64_t bytes_len, CUDA_CHECK(cudaFree(d_mid_out)); CUDA_CHECK(cudaFree(d_root_cv)); CUDA_CHECK(cudaFree(d_blockCV)); - CUDA_CHECK(cudaFree(d_bytes)); + // CUDA_CHECK(cudaFree(d_bytes)); } \ No newline at end of file diff --git a/setup.py b/setup.py index b2fee0d..e047cc4 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,9 @@ # setup.py from setuptools import setup from setuptools.command.build_ext import build_ext -from pybind11.setup_helpers import Pybind11Extension import pybind11, numpy as np import sys, os, shutil +from torch.utils.cpp_extension import BuildExtension, CUDAExtension def find_in_path(name, path): for d in path.split(os.pathsep): @@ -84,93 +84,40 @@ def locate_cuda(): # MSVC 的 NVCC 透传 NVCC_FLAGS += ["-Xcompiler", "/openmp", "-Xcompiler", "/MD", "-Xcompiler", "/O2"] -# give .cu to nvcc -from distutils.unixccompiler import UnixCCompiler - -class BuildExtNVCC(build_ext): - def build_extensions(self): - - self.compiler.src_extensions.append(".cu") - - original_compile = self.compiler.compile - - def nvcc_compile(sources, output_dir=None, macros=None, include_dirs=None, - debug=0, extra_preargs=None, extra_postargs=None, depends=None): - cxx_sources, cu_sources = [], [] - for s in sources: - (cu_sources if os.path.splitext(s)[1] == ".cu" else cxx_sources).append(s) - - objects = [] - if cxx_sources: - - postargs = extra_postargs.get("cxx", []) if isinstance(extra_postargs, dict) else extra_postargs - objects += original_compile( - cxx_sources, output_dir, macros, include_dirs, debug, extra_preargs, postargs, depends - ) - - if cu_sources: - - for src in cu_sources: - obj = self.compiler.object_filenames([src], output_dir=output_dir)[0] - cmd = [CUDA["nvcc"], "-c", src, "-o", obj] + NVCC_FLAGS - - incs = include_dirs or [] - for inc in incs: - cmd += ["-I", inc] - - if macros: - for m in macros: - if isinstance(m, tuple): - name, val = m - cmd += ["-D%s=%s" % (name, val)] - else: - cmd += ["-D%s" % m] - - if isinstance(extra_postargs, dict): - cmd += extra_postargs.get("nvcc", []) - elif extra_postargs: - cmd += extra_postargs - - os.makedirs(os.path.dirname(obj), exist_ok=True) - self.spawn(cmd) - objects.append(obj) - return objects - - self.compiler.compile = nvcc_compile - - for ext in self.extensions: - if not sys.platform.startswith("win"): - ext.runtime_library_dirs = list(set((ext.runtime_library_dirs or []) + RPATH)) - build_ext.build_extensions(self) - # ---------- 扩展模块 ---------- sources = [ "csrc/sha256_base.cpp", "csrc/sha256_simd.cpp", "csrc/blake3_base.cpp", - "csrc/blake3_sm80.cu", + "csrc/blake3_sm70.cu", "csrc/binding.cpp", ] -ext = Pybind11Extension( - "flashashing", - sources=sources, - include_dirs=COMMON_INCLUDES, - library_dirs=COMMON_LIB_DIRS, - libraries=COMMON_LIBS, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": [] - }, - extra_link_args=LINK_FLAGS, - define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES], -) +# ext = Pybind11Extension( +# "flashashing", +# sources=sources, +# include_dirs=COMMON_INCLUDES, +# library_dirs=COMMON_LIB_DIRS, +# libraries=COMMON_LIBS, +# extra_compile_args={ +# "cxx": CXX_FLAGS, +# "nvcc": [] +# }, +# extra_link_args=LINK_FLAGS, +# define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES], +# ) setup( name="flashashing", - 
version="0.1.0", - description="High performance hashing (SHA-256, BLAKE3) with CUDA + pybind11", - ext_modules=[ext], - cmdclass={"build_ext": BuildExtNVCC}, - zip_safe=False, + ext_modules=[ + CUDAExtension( + "flashashing", + sources=sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + ) + ], + cmdclass={"build_ext": BuildExtension}, ) From 9781ad9759d13e0e482162d57c68b65c946f4ff9 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Mon, 6 Oct 2025 17:13:41 +0800 Subject: [PATCH 06/20] v4 kernel --- .gitignore | 6 +- README.md | 4 +- backup_deprecated/blake3_sm70_v1.cu | 765 ++++++++++++++++++ {csrc => backup_deprecated}/blake3_sm80_v1.cu | 0 csrc/blake3_sm70.cu | 419 +++++++--- 5 files changed, 1074 insertions(+), 120 deletions(-) create mode 100644 backup_deprecated/blake3_sm70_v1.cu rename {csrc => backup_deprecated}/blake3_sm80_v1.cu (100%) diff --git a/.gitignore b/.gitignore index ee53a7a..47c25b7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,8 @@ dist/ *.egg-info/ *.ncu-rep -*.nsys-rep \ No newline at end of file +*.nsys-rep + +*.ptx + +*.o \ No newline at end of file diff --git a/README.md b/README.md index a99d68b..c22203f 100644 --- a/README.md +++ b/README.md @@ -24,4 +24,6 @@ python benchmark/test_script.py # GPU kernel performance on File Compress hashing + 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s -+ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s \ No newline at end of file ++ 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s ++ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s ++ 10.6 - v4 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v1.cu b/backup_deprecated/blake3_sm70_v1.cu new file mode 100644 index 0000000..b170812 --- /dev/null +++ b/backup_deprecated/blake3_sm70_v1.cu @@ -0,0 +1,765 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 
8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. + // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV +// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 +__device__ void blake3_leaf_cv(const uint32_t* chunk_words, + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8]) +{ + uint32_t cv[8]; + // 初始 cv = IV +#pragma unroll + for (int i = 0; i < 8; ++i) + cv[i] = BLAKE3_IV[i]; + + const int nblocks = (chunk_len_bytes + 63) / 64; // ceil + for (int b = 0; b < nblocks; ++b) { + uint32_t st[16]; + const uint32_t* block = chunk_words + b*16; + const int remain = chunk_len_bytes - b*64; + const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + + const uint32_t flags = + ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) + ((b == nblocks-1) ? (1u<<1) : 0u); // CHUNK_END (示意:bit1) + + blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); + blake3_state_to_cv(st, cv); + } + +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = cv[i]; +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
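+    // Caveat: 64 KiB of statically declared __shared__ exceeds the 48 KiB static
+    // allocation cap CUDA enforces per block; a footprint this large normally has
+    // to be dynamic shared memory, opted in via
+    // cudaFuncAttributeMaxDynamicSharedMemorySize (as the wrapper below already
+    // does for the tiny reduce kernels).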
+    __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256]
+
+    const int tid = threadIdx.x;
+    const int bx = blockIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;   // 256
+    constexpr int VEC_ELEMS = 4; // uint4
+    constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1 KiB)
+    constexpr int WARPS_PER_CTA = NUM_WARPS; // 16
+
+    // ============== STAGE 1: Coalesced Global Memory Loading ==============
+    const int tile_id = blockIdx.x;
+    const int tile_base = tile_id * CHUNKS_PER_BLOCK; // first chunk this block loads
+
+    int valid_chunks = total_chunks - tile_base;
+    if (valid_chunks <= 0) {
+        return; // overflow
+    }
+    if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK;
+
+    for (int ldt = 0; ldt < 4; ldt++) {
+        // each warp loads 4 chunks
+        int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> chunk to load
+        int chunk_global = tile_base + chunk_local;      // global chunk idx
+
+        // the pointer into shared memory
+        uint32_t* s_u32 = &chunk_smem[chunk_local][0];
+
+        // read from global memory only when the chunk is valid;
+        // otherwise fill the row with zeros
+        if (chunk_local < valid_chunks) {
+            const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK;
+
+            // move 16 bytes -> 128 bits at a time;
+            // each thread loads 2 x 16 bytes,
+            // so 32 threads load 32 x 2 x 16 = 1024 B
+            const uint4* __restrict__ g4 = reinterpret_cast<const uint4*>(g_u32);
+            uint4* __restrict__ s4 = reinterpret_cast<uint4*>(s_u32);
+
+            // idx = lane_id (0..31) and lane_id+32 (32..63)
+            int idx0 = lane_id;              // 0..31
+            int idx1 = lane_id + WARP_SIZE;  // 32..63
+
+            // thread 0  -> 0, 32
+            // thread 1  -> 1, 33
+            // ...
+            // thread 31 -> 31, 63
+            // so the global memory access is coalesced
+
+            // note: the indices are in units of 16-byte uint4 elements, so
+            // thread 0 covers bytes [0, 16) and [32*16, 32*16 + 16),
+            // thread 1 covers bytes [16, 32) and [33*16, 33*16 + 16), etc.
+
+            uint4 v0 = g4[idx0]; // still, this step loads from gmem, in 4-element (16 B) alignment.
+ uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { + const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); + }; + + uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; + bool have_pass0 = false, have_pass1 = false; + + // ------- STAGE 2 pass = 0 ------- + { + const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 + const int right = left + 1; // neighbor: 1,3,5,...,31 + const int pair_idx = left >> 1; // 0..15 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && left < pass0_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && right < pass0_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // merge two neighbor + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && left < pass0_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + // ---- STAGE 2 pass 1 ---- + { + const int left = 32 + (warp_id << 1); // 32,34,...,62 + const int right = left + 1; // 33,35,...,63 + const int pair_idx = left >> 1; // 16..31 + + uint32_t left_cv[8], right_cv[8]; + + bool have_left = false, have_right = false; + + if (lane_id == 0 && (left - 32) < pass1_valid) { + compute_leaf_cv_from_row(left, left_cv); + have_left = true; + } + if (lane_id == 1 && (right - 32) < pass1_valid) { + compute_leaf_cv_from_row(right, right_cv); + have_right = true; + } + + // TODO: here we may have some issue: overflow and border situation + unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); + if (lane_id == 0 && (left - 32) < pass1_valid) { + uint32_t parent[8]; + if (have_right) { + uint32_t rcv[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + rcv[j] = __shfl_sync(mask, right_cv[j], 1); + blake3_parent_cv(left_cv, rcv, parent); + } else { + #pragma unroll + for (int j = 0; j < 8; ++j) + parent[j] = left_cv[j]; // 奇数晋级 + } + // write 
to the right position + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + __syncwarp(); + } + + __syncthreads(); + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
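+    // Alignment note: in_cv32 comes from cudaMalloc (at least 256 B aligned) and
+    // each CV is a 32 B record, so the two uint4 loads per CV below are always
+    // 16 B aligned on the global side. The shared-memory side is written
+    // word-by-word instead, because a nonzero PAD makes each row (8 + PAD) words
+    // wide, and a row start is then no longer guaranteed to be 16 B aligned for
+    // uint4 stores.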
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +constexpr uint32_t FLAG_ROOT = 8; + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
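+    // Shape of the whole reduction for a 1 GiB input (from the constants above and
+    // the tiny-kernel launches below, CHUNKS_PER_BLOCK = 64 and TILE = 2048):
+    // 2^30 / 1024 = 1,048,576 leaf chunks; 64 chunks per block -> 16,384 block CVs;
+    // tiny-kernel round 1 reduces 16,384 / 2048 tiles to 8 CVs; round 2 reduces
+    // those 8 to the single root CV.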
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/csrc/blake3_sm80_v1.cu b/backup_deprecated/blake3_sm80_v1.cu similarity index 100% rename from csrc/blake3_sm80_v1.cu rename to backup_deprecated/blake3_sm80_v1.cu diff --git a/csrc/blake3_sm70.cu b/csrc/blake3_sm70.cu index 03a3969..4a98b49 100644 --- a/csrc/blake3_sm70.cu +++ b/csrc/blake3_sm70.cu @@ -22,6 +22,30 @@ __host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u }; +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool 
is_root_chunk = false) { + uint32_t f = 0; + if (block_idx_in_chunk == 0) f |= FLAG_CHUNK_START; + if (block_idx_in_chunk == nblocks_in_chunk - 1) f |= FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + // ---- 小工具 ---- __host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { #if defined(__CUDA_ARCH__) @@ -82,37 +106,206 @@ __host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16 out_cv[i] = st[i] ^ st[8+i]; } -// 叶:处理 1KiB chunk(16×64B blocks)→ 1 个 CV -// 假定输入为小端 u32 流,chunk 不足 1KiB 最后一块 block_len<64 并置 END 标志 -__device__ void blake3_leaf_cv(const uint32_t* chunk_words, - int chunk_len_bytes, - uint64_t chunk_counter, - uint32_t out_cv[8]) +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) { - uint32_t cv[8]; - // 初始 cv = IV -#pragma unroll - for (int i = 0; i < 8; ++i) - cv[i] = BLAKE3_IV[i]; + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} - const int nblocks = (chunk_len_bytes + 63) / 64; // ceil +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. 
for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 for (int b = 0; b < nblocks; ++b) { - uint32_t st[16]; - const uint32_t* block = chunk_words + b*16; - const int remain = chunk_len_bytes - b*64; - const uint32_t blk_len = remain >= 64 ? 64u : (uint32_t)remain; + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? flags : 0u; + + // ===== 7 rounds ===== + #pragma unroll 1 // 不要unroll + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + int gi = (li & 3); // 0..3 + uint32_t mx = msg_rk(m_lane, r, 2*gi + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } - const uint32_t flags = - ((b == 0) ? (1u<<0) : 0u) | // CHUNK_START(示意:bit0) - ((b == nblocks-1) ? 
(1u<<1) : 0u); // CHUNK_END (示意:bit1) + // ---- 对角步 ---- + { + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); - blake3_compress_words_7r(block, cv, chunk_counter, blk_len, flags, st); - blake3_state_to_cv(st, cv); + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + int gi = (li_diag & 3); // 0..3 + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + int li_undo = (rq << 2) | ((q - rq) & 3); + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) } -#pragma unroll - for (int i = 0; i < 8; ++i) - out_cv[i] = cv[i]; + // 由 lane0 / lane16 收集 8×u32 输出 + + // This will trigger problem! + // if (li == 0) { // only thread 0 and thread 16 will do this. + // #pragma unroll + // for (int j = 0; j < 8; ++j) { + // out_cv[j] = __shfl_sync(mask16, cv_word, j, 16); + // } + // } + + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } } __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ @@ -135,8 +328,8 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 template // pad shared memory + const int PAD_CHUNK=0, + const int PAD_CV=0> // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, int chunk_len_bytes, @@ -212,10 +405,15 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. 
uint4 v1 = g4[idx1]; - s_u32[4*idx0 + 0] = v0.x; s_u32[4*idx0 + 1] = v0.y; // when load into shared mem, do manually - s_u32[4*idx0 + 2] = v0.z; s_u32[4*idx0 + 3] = v0.w; - s_u32[4*idx1 + 0] = v1.x; s_u32[4*idx1 + 1] = v1.y; - s_u32[4*idx1 + 2] = v1.z; s_u32[4*idx1 + 3] = v1.w; + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; } else { uint4* s4 = reinterpret_cast(s_u32); int idx0 = lane_id; @@ -237,103 +435,92 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, const int parents1 = (pass1_valid + 1) >> 1; parents_count = parents0 + parents1; // ≤ 32 } - // __syncthreads(); - - auto compute_leaf_cv_from_row = [&](int chunk_local, uint32_t out_cv[8]) { - const uint32_t* mline = &chunk_smem[chunk_local][0]; // 1 KiB = 256 u32 - const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); - blake3_leaf_cv(mline, chunk_len_bytes, cc, out_cv); - }; - - uint32_t warp_cv_pass0[8], warp_cv_pass1[8]; - bool have_pass0 = false, have_pass1 = false; - - // ------- pass = 0 ------- - { - const int left = (warp_id << 1); // 2*warp_id : 0,2,4,...,30 - const int right = left + 1; // neighbor: 1,3,5,...,31 - const int pair_idx = left >> 1; // 0..15 - - uint32_t left_cv[8], right_cv[8]; - - bool have_left = false, have_right = false; + __syncthreads(); - if (lane_id == 0 && left < pass0_valid) { - compute_leaf_cv_from_row(left, left_cv); - have_left = true; - } - if (lane_id == 1 && right < pass0_valid) { - compute_leaf_cv_from_row(right, right_cv); - have_right = true; + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. 
+ const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); } - // merge two neighbor - unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); - if (lane_id == 0 && left < pass0_valid) { + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; uint32_t parent[8]; - if (have_right) { - uint32_t rcv[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly #pragma unroll - for (int j = 0; j < 8; ++j) - rcv[j] = __shfl_sync(mask, right_cv[j], 1); - blake3_parent_cv(left_cv, rcv, parent); - } else { - #pragma unroll - for (int j = 0; j < 8; ++j) - parent[j] = left_cv[j]; // 奇数晋级 + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; } - // 写入 cv_smem 的正确位置:pair_idx = left/2 → 0..15 + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id #pragma unroll - for (int j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) cv_smem[pair_idx][j] = parent[j]; } - __syncwarp(); - } - // ---- pass 1 ---- - { - const int left = 32 + (warp_id << 1); // 32,34,...,62 - const int right = left + 1; // 33,35,...,63 - const int pair_idx = left >> 1; // 16..31 + __syncwarp(); // NOTICE: this is necessary! + }; // do_big_pass - uint32_t left_cv[8], right_cv[8]; + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); - bool have_left = false, have_right = false; + // if (bx == 0) printf("Finish 1 big pass\n"); - if (lane_id == 0 && (left - 32) < pass1_valid) { - compute_leaf_cv_from_row(left, left_cv); - have_left = true; - } - if (lane_id == 1 && (right - 32) < pass1_valid) { - compute_leaf_cv_from_row(right, right_cv); - have_right = true; - } + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); - // TODO: here we may have some issue: overflow and border situation - unsigned mask = __ballot_sync(0xFFFFFFFFu, (lane_id==0 && have_left) || (lane_id==1 && have_right)); - if (lane_id == 0 && (left - 32) < pass1_valid) { - uint32_t parent[8]; - if (have_right) { - uint32_t rcv[8]; - #pragma unroll - for (int j = 0; j < 8; ++j) - rcv[j] = __shfl_sync(mask, right_cv[j], 1); - blake3_parent_cv(left_cv, rcv, parent); - } else { - #pragma unroll - for (int j = 0; j < 8; ++j) - parent[j] = left_cv[j]; // 奇数晋级 - } - // write to the right position - #pragma unroll - for (int j = 0; j < 8; ++j) - cv_smem[pair_idx][j] = parent[j]; - } - __syncwarp(); - } + // if (bx == 0) printf("Finish 2 big pass\n"); __syncthreads(); + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. 
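+    // A sketch of the pairwise merge used in STAGE 3 below (same names as the
+    // real code, which additionally masks inactive lanes and handles odd counts):
+    //   for (int step = 1; step < WARP_SIZE; step <<= 1) {
+    //       uint32_t nbr[8];
+    //       for (int j = 0; j < 8; ++j)
+    //           nbr[j] = __shfl_down_sync(mask, cv[j], step);  // right sibling
+    //       if ((lane_id % (step << 1)) == 0)
+    //           blake3_parent_cv(cv, nbr, cv);                 // parent(L, R)
+    //   }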
+
+    // ============== STAGE 3: Block-Reduce ==============
+    // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv
+    // we will only use warp 0 to handle this
@@ -391,16 +578,14 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
     }
 } // blake3_block_reduce_kernel
 
-__device__ __forceinline__
-void load_cv_g2r(const uint32_t* g, uint32_t r[8]) {
+__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) {
     const uint4* g4 = reinterpret_cast<const uint4*>(g);
     uint4 a = g4[0], b = g4[1];
     r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w;
     r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w;
 }
 
-__device__ __forceinline__
-void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
+__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     uint4* g4 = reinterpret_cast<uint4*>(g);
     g4[0] = make_uint4(r[0],r[1],r[2],r[3]);
     g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
@@ -585,8 +770,6 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     }
 }
 
-constexpr uint32_t FLAG_ROOT = 8;
-
 inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
     const uint32_t zero_block[16] = {0};
     uint32_t st[16];
@@ -652,7 +835,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data,
     const uint64_t total_chunks = bytes_len / CHUNK_SIZE;
     const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks
     CUDA_CHECK(cudaFuncSetAttribute(
-        blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE>,
+        blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE, 0, 0>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 100));
 
     uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
@@ -669,7 +852,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data,
     dim3 block_big(NUM_THREADS);
     uint64_t base_chunk_counter = 0ull;
 
-    blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE>
+    blake3_block_reduce_kernel<NUM_THREADS, CHUNKS_PER_BLOCK, CHUNK_SIZE, 0, 0>
         <<<grid_big, block_big>>>(
             d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks);
 
From cdae83c6be9184fd6d9949f29578f88b12f76d27 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Mon, 6 Oct 2025 18:48:58 +0800
Subject: [PATCH 07/20] resolve bank conflict

---
 README.md                           |   6 +-
 backup_deprecated/blake3_sm70_v1.cu | 119 ++--
 backup_deprecated/blake3_sm70_v2.cu | 939 ++++++++++++++++++++++++++++
 benchmark/test_gpu.py               |   2 +-
 csrc/blake3_sm70.cu                 |  52 +-
 5 files changed, 1046 insertions(+), 72 deletions(-)
 create mode 100644 backup_deprecated/blake3_sm70_v2.cu

diff --git a/README.md b/README.md
index c22203f..0945c67 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,9 @@ python benchmark/test_script.py
 
 # GPU kernel performance on File Compress hashing
 
+> These logs were measured on an RTX 4090 Laptop GPU, where compute and memory bandwidth are limited; a later section shows the same benchmark on other machines.
+ + 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s -+ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s -+ 10.6 - v4 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s \ No newline at end of file ++ 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) ++ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v1.cu b/backup_deprecated/blake3_sm70_v1.cu index b170812..867b6af 100644 --- a/backup_deprecated/blake3_sm70_v1.cu +++ b/backup_deprecated/blake3_sm70_v1.cu @@ -31,6 +31,18 @@ __host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { #endif } +#define B3_G(a,b,c,d, mx, my) \ + do { \ + a = a + b + (mx); \ + d = rotr32(d ^ a, 16); \ + c = c + d; \ + b = rotr32(b ^ c, 12); \ + a = a + b + (my); \ + d = rotr32(d ^ a, 8); \ + c = c + d; \ + b = rotr32(b ^ c, 7); \ + } while (0) + __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { #if defined(__CUDA_ARCH__) const uint4 v = *reinterpret_cast(src); @@ -40,6 +52,23 @@ __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t ds #endif } +__host__ __device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + __host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B -> shared memory const uint32_t cv[8], // 8×u32 -> shared memory @@ -48,31 +77,49 @@ __host__ __device__ void blake3_compress_words_7r( uint32_t flags, // CHUNK_START/END/PARENT/ROOT… uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) { - // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 - // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[i] = cv[i]; -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[8+i] = BLAKE3_IV[i]; - - out_state[12] ^= (uint32_t)chunk_counter; - out_state[13] ^= (uint32_t)(chunk_counter >> 32); - out_state[14] ^= block_len; - out_state[15] ^= flags; - - // so far, the block_words are still pointers. 
- // now we load it into kernel, as pointed out by ncu profile - uint32_t block_reg_1[4]; - -#pragma unroll - for (int i = 0; i < 16; i += 4) { // the gap is 4 - // load_u128_u32x4(block_words + i, block_reg_1); - out_state[i] ^= block_words[i]; - // 做一点点搅动(占位) - out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + // v[0..7]=cv;v[8..11]=IV;v[12..15]=IV^t0^t1^b^flags + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; + + uint32_t v8 = BLAKE3_IV[0], v9 = BLAKE3_IV[1]; + uint32_t v10= BLAKE3_IV[2], v11 = BLAKE3_IV[3]; + uint32_t v12= BLAKE3_IV[4] ^ (uint32_t)chunk_counter; + uint32_t v13= BLAKE3_IV[5] ^ (uint32_t)(chunk_counter >> 32); + uint32_t v14= BLAKE3_IV[6] ^ block_len; + uint32_t v15= BLAKE3_IV[7] ^ flags; + + // 7 轮 +#pragma unroll 1 + for (int r = 0; r < 7; ++r) { + // 取本轮 16 个消息字(按 BLAKE3 调度表) + const uint8_t* SR = B3_MSG_SCHEDULE[r]; // 0..15 + const uint32_t m0 = block_words[SR[0]], m1 = block_words[SR[1]]; + const uint32_t m2 = block_words[SR[2]], m3 = block_words[SR[3]]; + const uint32_t m4 = block_words[SR[4]], m5 = block_words[SR[5]]; + const uint32_t m6 = block_words[SR[6]], m7 = block_words[SR[7]]; + const uint32_t m8 = block_words[SR[8]], m9 = block_words[SR[9]]; + const uint32_t m10 = block_words[SR[10]], m11 = block_words[SR[11]]; + const uint32_t m12 = block_words[SR[12]], m13 = block_words[SR[13]]; + const uint32_t m14 = block_words[SR[14]], m15 = block_words[SR[15]]; + + // 列步:四个 G + B3_G(v0, v4, v8, v12, m0, m1); + B3_G(v1, v5, v9, v13, m2, m3); + B3_G(v2, v6, v10, v14, m4, m5); + B3_G(v3, v7, v11, v15, m6, m7); + + // 对角步:四个 G + B3_G(v0, v5, v10, v15, m8, m9); + B3_G(v1, v6, v11, v12, m10, m11); + B3_G(v2, v7, v8, v13, m12, m13); + B3_G(v3, v4, v9, v14, m14, m15); } + + // 输出 16×u32 状态(后续由调用者做 state_to_cv) + out_state[0]=v0; out_state[1]=v1; out_state[2]=v2; out_state[3]=v3; + out_state[4]=v4; out_state[5]=v5; out_state[6]=v6; out_state[7]=v7; + out_state[8]=v8; out_state[9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12;out_state[13]=v13;out_state[14]=v14; out_state[15]=v15; } // 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) @@ -615,22 +662,6 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, std::abort(); } - // int dev = -1; - // cudaGetDevice(&dev); - // printf("[dbg] my runtime current device = %d\n", dev); - - // cudaPointerAttributes attr{}; - // auto st = cudaPointerGetAttributes(&attr, d_data); - // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", - // (int)st, (int)attr.type, attr.device); - - // cudaPointerAttributes attr{}; - // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); - // if (attr.type != cudaMemoryTypeDevice) { - // fprintf(stderr, "d_data is not device memory!\n"); - // std::abort(); - // } - int optin = 0, deflt = 0; cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); @@ -654,8 +685,12 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 0; + constexpr int pad_cv = 0; + CUDA_CHECK(cudaFuncSetAttribute( - blake3_block_reduce_kernel, + blake3_block_reduce_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100)); uint8_t* d_bytes = const_cast(d_data); @@ -672,7 +707,7 @@ void 
blake3_block_reduce_sm70(const uint8_t* d_data, dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); diff --git a/backup_deprecated/blake3_sm70_v2.cu b/backup_deprecated/blake3_sm70_v2.cu new file mode 100644 index 0000000..fc418c2 --- /dev/null +++ b/backup_deprecated/blake3_sm70_v2.cu @@ -0,0 +1,939 @@ + +#include +#include +#include +#include +#include + +#define WARP_SIZE 32 +#define LDST128BITS(value) (reinterpret_cast(&(value))[0]) + +#define CUDA_CHECK(expr) do { \ + cudaError_t _e = (expr); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ + #expr, __FILE__, __LINE__, cudaGetErrorString(_e));\ + std::abort(); \ + } \ + } while(0) + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ __forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__host__ __device__ void blake3_compress_words_7r( + const uint32_t block_words[16], // 64B -> shared memory + const uint32_t cv[8], // 8×u32 -> shared memory + uint64_t chunk_counter, // 64-bit + uint32_t block_len, // [0..64] + uint32_t flags, // CHUNK_START/END/PARENT/ROOT… + uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) +{ + // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 + // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[i] = cv[i]; +#pragma unroll + for (int i = 0; i < 8; ++i) + out_state[8+i] = BLAKE3_IV[i]; + + out_state[12] ^= (uint32_t)chunk_counter; + out_state[13] ^= (uint32_t)(chunk_counter >> 32); + out_state[14] ^= block_len; + out_state[15] ^= flags; + + // so far, the block_words are still pointers. 
+ // now we load it into kernel, as pointed out by ncu profile + uint32_t block_reg_1[4]; + +#pragma unroll + for (int i = 0; i < 16; i += 4) { // the gap is 4 + // load_u128_u32x4(block_words + i, block_reg_1); + out_state[i] ^= block_words[i]; + // 做一点点搅动(占位) + out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + } +} + +// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ +#pragma unroll + for (int i = 0; i < 8; ++i) + out_cv[i] = st[i] ^ st[8+i]; +} + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + +// notice that, this function will proceed 2 chunks, each time. +// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. 
for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + uint32_t v = (li < 8) + ? cv_word // v[i](i<8) + : BLAKE3_IV[li - 8]; // v[8..15] ← IV + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + + // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len: 0u; + v ^= (li == 15) ? flags : 0u; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t 
out_cv[8]){ + uint32_t msg[16]; +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[i] = L[i]; + } +#pragma unroll + for (int i = 0; i < 8; ++i) { + msg[8+i] = R[i]; + } + uint32_t st[16]; + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_state_to_cv(st, out_cv); +} + +// ============ Big kernel: 16 WARPS in total ============ +// grid: (chunks / 64), thread: (512,) +template // pad shared memory +__global__ void blake3_block_reduce_kernel(uint32_t* d_input, + uint32_t* block_cvs, + int chunk_len_bytes, + uint64_t base_chunk_counter, + int total_chunks) { + // NUM_WARPS also stands for NUM_CHUKNS per block + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 + constexpr int CHUNKS_PROCEED = 64; + + static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); + static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); + + // 8 x u32 -> one chain value, we have `NUM_WARPS` chain value in total + // 8 x 4 x 64 = 2 KiB shared memory in sum + __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + + // 4 bytes x 256 x 64 = 64 KiB shared memory. + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + + const int tid = threadIdx.x; + const int bx = blockIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4 + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + + // ============== STAGE 1: Coalsced Global Memory Loading ============== + const int tile_id = blockIdx.x; + const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading + + int valid_chunks = total_chunks - tile_base; + if (valid_chunks <= 0) { + return; // overflow + } + if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + + for (int ldt = 0; ldt < 4; ldt++) { + // each warp load 4 chunks + int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk + int chunk_global = tile_base + chunk_local; // global chunk idx + + // the pointer for shared memory + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + + // only read from global, when it's valid + // or, we fill it with 0 + if (chunk_local < valid_chunks) { + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + + // move 16 bytes -> 128 bits each time + // each thread will load 2 x 16 bytes + // so 32 threads will load 32 x 2 x 16 = 1024 B + const uint4* __restrict__ g4 = reinterpret_cast(g_u32); + uint4* __restrict__ s4 = reinterpret_cast(s_u32); + + // idx = lane_id (0..31) 与 lane_id+32 (32..63) + int idx0 = lane_id; // 0..31 + int idx1 = lane_id + WARP_SIZE; // 32..63 + + // thread 0 -> 0, 32 + // thread 1 -> 1, 33 + // ... + // thread 31 -> 31, 63 + // so the global memory access is coalsced + + // notice, we load 16 bytes a time. the index is compressed + // tid 0 -> 0, tid 1 -> 16 + // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 + + uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. 
+ uint4 v1 = g4[idx1]; + + s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually + s_u32[4*idx0 + 1] = v0.y; + s_u32[4*idx0 + 2] = v0.z; + s_u32[4*idx0 + 3] = v0.w; + + s_u32[4*idx1 + 0] = v1.x; + s_u32[4*idx1 + 1] = v1.y; + s_u32[4*idx1 + 2] = v1.z; + s_u32[4*idx1 + 3] = v1.w; + } else { + uint4* s4 = reinterpret_cast(s_u32); + int idx0 = lane_id; + int idx1 = lane_id + WARP_SIZE; + s4[idx0] = make_uint4(0u,0u,0u,0u); + s4[idx1] = make_uint4(0u,0u,0u,0u); + } + } + + __syncthreads(); // sync all warps + + // ============== STAGE 2: Compress Leaf to 64 chain value ============== + const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks + const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks + + __shared__ int parents_count; + if (threadIdx.x == 0) { + const int parents0 = (pass0_valid + 1) >> 1; + const int parents1 = (pass1_valid + 1) >> 1; + parents_count = parents0 + parents1; // ≤ 32 + } + __syncthreads(); + + // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); + + // this is for each warp's lane0 and lane16 written + // to decrease the register usage. + __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB + + // lambda function: compress this thing + auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // left=base+2*warp_id, right=left+1 + const int left = base + (warp_id << 1); // base + 0,2,4,6,... + const int right = left + 1; + const int left_rel = left - base; // 0..31 + const int right_rel = right - base; // 1..32 + const bool has_left = (left_rel < pass_valid); + const bool has_right = (right_rel < pass_valid); + + // const int lane_id = threadIdx.x & 31; + const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 + const int li = lane_id & 15; // 0..15 + const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const int chunk_local = left + sub; // sub=0→left, sub=1→right + const bool active = (sub==0 ? (left - base) < pass_valid + : (right - base) < pass_valid); + + // uint32_t my_cv[8]; + + // the left-sub-warp and right-sub-warp will execute the same code + // distinguish the index by computing, + // to avoid warp-divergence + if (active) { + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = &chunk_smem[chunk_local][0]; + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + &tmp_cv[chunk_local][0], + mask16); + } + + __syncwarp(); // make sure two warps written into `tmp_cv` + + // now we have compute 2 chunks' cv + // merge it to a parent cv + if (lane_id == 0 && has_left) { + const uint32_t* lcv = &tmp_cv[left][0]; + uint32_t parent[8]; + if ((right - base) < pass_valid) { + const uint32_t* rcv = &tmp_cv[right][0]; + blake3_parent_cv(lcv, rcv, parent); + } else { // odd: up-flow directly + #pragma unroll + for (int j = 0 ; j < 8; ++j) + parent[j] = lcv[j]; + } + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + #pragma unroll + for (int j = 0; j < 8; ++j) + cv_smem[pair_idx][j] = parent[j]; + } + + __syncwarp(); // NOTICE: this is necessary! 
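+        // (A plausible reading of the NOTICE above: lane 0 of the warp has just
+        // written cv_smem while lanes 1..31 ran ahead; __syncwarp() reconverges
+        // the warp and orders those shared-memory writes before the second
+        // do_big_pass call re-enters the shuffle-heavy leaf code. Visibility to
+        // *other* warps still comes from the __syncthreads() after both passes.)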
+ }; // do_big_pass + + // big-pass 1: computing 0-31 chunks + do_big_pass(/*base=*/0, pass0_valid); + + // if (bx == 0) printf("Finish 1 big pass\n"); + + // big-pass 2: computing 32-63 chunks + do_big_pass(/*base=*/32, pass1_valid); + + // if (bx == 0) printf("Finish 2 big pass\n"); + + __syncthreads(); + + // printf("Stage 2 done!!!\n"); + + // right now, we have got 32 chain values + // a warp-reduce to merge. + + // ============== STAGE 3: Block-Reduce ============== + // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // we will only use warp 0 to handle this thing + if (warp_id == 0) { + uint32_t cv[8] = {0,0,0,0,0,0,0,0}; + + const bool active_lane = (lane_id < parents_count); + if (active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + } + + // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 + unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + int cur_n = parents_count; // 当前层的有效节点数(逐层更新) + + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // right-neighbor + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) { + nbr[j] = __shfl_down_sync(mask, cv[j], step); + } + + // safety checking + const bool do_pair = + (lane_id % (step << 1) == 0) && // 左侧 + (lane_id + step < cur_n) && // 右侧在当前层有效范围内 + (lane_id < cur_n); // 左侧也必须有效 + + if (do_pair) { + blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + } + + cur_n = (cur_n + 1) >> 1; + __syncwarp(mask); + } + + // 3) write back to global memory + if (lane_id == 0 && parents_count > 0) { + const int tile_id = blockIdx.x; + uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + + // two different write ways + #if 0 + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = cv[j]; + #else + // block_cvs should be cudaMalloc ed + reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); + reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + #endif + } + } +} // blake3_block_reduce_kernel + +__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { + const uint4* g4 = reinterpret_cast(g); + uint4 a = g4[0], b = g4[1]; + r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w; + r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w; +} + +__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) { + uint4* g4 = reinterpret_cast(g); + g4[0] = make_uint4(r[0],r[1],r[2],r[3]); + g4[1] = make_uint4(r[4],r[5],r[6],r[7]); +} + +// ============ Tiny kernel ============ +// In big kernel, it will consume 64 KiB each block +// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 root = 16384 root +// And this tiny kernel is designed to process these 16384 root +template +__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32, + uint32_t* __restrict__ out_cv32, + int N) +{ + extern __shared__ __align__(16) uint32_t smem[]; // 动态 SMEM;需要 >= TILE_CVS*8*4 字节 + // 视作 2D:[TILE_CVS][8+PAD] + uint32_t* cv_tile = smem; + + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; // 0..15 + const int lane_id = tid % WARP_SIZE; // 0..31 + + // 本 block 负责的分片起点 + const int tile_start = blockIdx.x * TILE_CVS; + if (tile_start >= N) return; + + // N等于8的时候,这里就是8 + const int tile_n = min(TILE_CVS, N - tile_start); // 该分片的实际 CV 数(<=2048) + + // ---------------- Stage 1: 合并访存 loading 到 SMEM ---------------- + // 每线程搬多个 CV:i = tid, tid+blockDim, ... 
+ for (int i = tid; i < tile_n; i += NUM_THREADS) { // 注意:i = tid, 不是等于0 + const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8; + uint32_t* s = cv_tile + (size_t)i * (8 + PAD); + // 两次 16B + const uint4* g4 = reinterpret_cast(g); + uint4* s4 = reinterpret_cast(s); + // s4[0] = g4[0]; + // s4[1] = g4[1]; + + // in case that the address is not aligned + uint4 v0 = g4[0]; + uint4 v1 = g4[1]; + + s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w; + s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w; + } + // 对于 tile_n < TILE_CVS 的尾部,无需清零;后续按有效范围处理 + __syncthreads(); + + // ---------------- Stage 2: 线程内 4→1(保持相邻配对) ---------------- + // 共有 reduced_n0 = ceil(tile_n / 4) 个 lane-root + const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // 等价于 (tile_n+3)/4 + uint32_t lane_cv[8]; // 本线程输出的 lane-root + bool lane_valid = false; + + // 每线程的 4 个输入的起始索引 + int base4 = tid << 2; // tid*4 + if (base4 < tile_n) { + // 读取最多 4 个相邻 CV:idx = base4 + 0,1,2,3 + uint32_t a[8], b[8], c[8], d[8]; + const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD); + load_cv_g2r(s0, a); + + int remain = tile_n - base4; + + if (remain >= 2) { + const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); + load_cv_g2r(s1, b); + } + if (remain >= 3) { + const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); + load_cv_g2r(s2, c); + } + if (remain >= 4) { + const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); + load_cv_g2r(s3, d); + } + + // 两层相邻配对(奇数晋级) + if (remain == 1) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = a[j]; + } else if (remain == 2) { + blake3_parent_cv(a, b, lane_cv); + } else if (remain == 3) { + uint32_t p01[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv + } else { // remain >= 4 + uint32_t p01[8], p23[8]; + blake3_parent_cv(a, b, p01); + blake3_parent_cv(c, d, p23); + blake3_parent_cv(p01, p23, lane_cv); + } + lane_valid = true; + } + + // ---------------- Stage 3: Warp 内 32→1 相邻配对 ---------------- + // 每个 warp 负责一个连续段:warp_base = warp_id*32 + const int warp_base = warp_id * WARP_SIZE; + const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // 本 warp 段内的有效数量 + + // 把 lane_cv 保留在寄存器里做归约;无效 lane 用 mask 剔除 + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // 仅做存在检测 + int cur_n = cur_n_w; + + // 把“段外的线程”标成无效(避免读越界) + bool active_lane = (lane_id < cur_n_w); + + // 对无效 lane 把值清成 0(不会被使用) + if (!active_lane) { + #pragma unroll + for (int j = 0; j < 8; ++j) + lane_cv[j] = 0u; + } + + // 逐层配对:1,2,4,8,16 - warp-reduce + for (int step = 1; step < WARP_SIZE; step <<= 1) { + // 取右邻 + uint32_t nbr[8]; + #pragma unroll + for (int j = 0; j < 8; ++j) + nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + + const bool do_pair = + active_lane && + ((lane_id % (step<<1)) == 0) && + (lane_id + step < cur_n); + + if (do_pair) { + blake3_parent_cv(lane_cv, nbr, lane_cv); + } + + cur_n = (cur_n + 1) >> 1; + // __syncwarp(); + } + + // 这一段的结果在 lane0;把 16 个 warp-root 写入 SMEM 的前 16 行 + __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8 + if (lane_id == 0 && cur_n_w > 0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[warp_id][j] = lane_cv[j]; + } + __syncthreads(); + + // ---------------- Stage 4: CTA 内 16→1 相邻配对 ---------------- + // 有效 warp 数:ceil(reduced_n0 / 32) + int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16 + if (valid_warps == 0) return; + + // 每一个warp的lane 0来做计算 + // 用 lane0 做计算,其它 lane 空转 + for (int stride = (valid_warps >> 1); stride >= 1; stride 
>>= 1) { + if (warp_id < stride && lane_id == 0) { + uint32_t p[8]; + blake3_parent_cv(&warp_roots[2*warp_id][0], + &warp_roots[2*warp_id + 1][0], p); + #pragma unroll + for (int j = 0; j < 8; ++j) + warp_roots[warp_id][j] = p[j]; + } + __syncthreads(); + // 奇数晋级 + if ((valid_warps & 1) && warp_id==0 && lane_id==0) { + #pragma unroll + for (int j=0;j<8;++j) + warp_roots[stride][j] = warp_roots[valid_warps-1][j]; + } + __syncthreads(); + valid_warps = (valid_warps + 1) >> 1; + } + + // 写回本 block 的根 + if (threadIdx.x == 0) { + uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8; + #pragma unroll + for (int j = 0; j < 8; ++j) + out[j] = warp_roots[0][j]; + } +} + +inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) { + const uint32_t zero_block[16] = {0}; + uint32_t st[16]; + blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); + // 写出前 32 字节(state[0..7],小端) + for (int i = 0; i < 8; ++i) { + uint32_t w = st[i]; + out32[4*i+0] = (uint8_t)( w & 0xFF); + out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF); + out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF); + out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF); + } +} + +// wrapper function +void blake3_block_reduce_sm70(const uint8_t* d_data, + uint64_t bytes_len, + std::array* root_out = nullptr, + cudaStream_t stream = 0) { + if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) { + fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n", + (unsigned long long)bytes_len); + std::abort(); + } + + // int dev = -1; + // cudaGetDevice(&dev); + // printf("[dbg] my runtime current device = %d\n", dev); + + // cudaPointerAttributes attr{}; + // auto st = cudaPointerGetAttributes(&attr, d_data); + // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n", + // (int)st, (int)attr.type, attr.device); + + // cudaPointerAttributes attr{}; + // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data)); + // if (attr.type != cudaMemoryTypeDevice) { + // fprintf(stderr, "d_data is not device memory!\n"); + // std::abort(); + // } + + int optin = 0, deflt = 0; + cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0); + + const int dyn_smem = 64 * 1024; // 64KiB + + // 编译器在编译期决定分配多少动态shmem给kernel + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<512, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + CUDA_CHECK(cudaFuncSetAttribute( + blake3_cv_block_reduce_kernel<32, 2048, 0>, + cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + + + constexpr int CHUNK_SIZE = 1024; // 1 KiB + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int NUM_THREADS = 512; // for big kernel + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 + constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk + const uint64_t total_chunks = bytes_len / CHUNK_SIZE; + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + CUDA_CHECK(cudaFuncSetAttribute( + blake3_block_reduce_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100)); + + uint8_t* d_bytes = const_cast(d_data); + uint32_t* d_words = reinterpret_cast(d_bytes);; // alias + uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32 + + // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
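+    // A sketch of one way to follow the note above (hypothetical `s_blockCV` /
+    // `s_cap` statics, not part of this file): cache the scratch buffer across
+    // calls so the cudaMalloc below is paid only when the input grows.
+    //   static uint32_t* s_blockCV = nullptr; static size_t s_cap = 0;
+    //   const size_t need = (size_t)num_blocks * 8u * sizeof(uint32_t);
+    //   if (s_cap < need) { cudaFree(s_blockCV);
+    //                       CUDA_CHECK(cudaMalloc(&s_blockCV, need)); s_cap = need; }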
+ + // TODO: use thrust + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + + // ============= launch big kernel ============= + dim3 grid_big(num_blocks); + dim3 block_big(NUM_THREADS); + uint64_t base_chunk_counter = 0ull; + + blake3_block_reduce_kernel + <<>>( + d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); + + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + if (num_blocks == 1) { + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) *root_out = host_root; + else { + // 简单打印 + printf("root CV:"); + for (int i=0;i<8;++i) + printf(" %08x", host_root[i]); + printf("\n"); + } + + CUDA_CHECK(cudaFree(d_blockCV)); + CUDA_CHECK(cudaFree(d_bytes)); + return; + } + + // the first round of tiny kernel + // 1) 16384 output reduce -> 8 + uint32_t* d_mid_out = nullptr; // num_blocks × 8 u32 + { + const int N = 16384; // total number + const int TILE = 2048; + const int grid = (N + TILE - 1) / TILE; // = 8 + const int block = 512; + const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB + + cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<512, 2048, 0> + <<>>(d_blockCV /*in: 16384×8 x 4*/, + d_mid_out /*out: 8×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + // second round + uint32_t* d_root_cv = nullptr; + { + const int N = 8; + const int TILE = 2048; // 任意 >=N 即可 + const int grid = 1; + const int block = 32; // 32 线程够用 + const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // 8 x 8 x 4 = 8 x 32 B + + cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t)); + + blake3_cv_block_reduce_kernel<32, 2048, 0> + <<>>(d_mid_out /*in: 8×8*/, + d_root_cv /*out: 1×8*/, N); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + std::array host_root{}; + CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // last final process + uint8_t digest32[32]; + blake3_digest32_from_root_cv(host_root.data(), digest32); + + if (root_out) { + *root_out = host_root; + } else { + printf("root CV:"); + for (int i=0;i<8;++i) printf(" %08x", host_root[i]); + printf("\n"); + } + + // clear + CUDA_CHECK(cudaFree(d_mid_out)); + CUDA_CHECK(cudaFree(d_root_cv)); + CUDA_CHECK(cudaFree(d_blockCV)); + // CUDA_CHECK(cudaFree(d_bytes)); +} \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 4e49107..a8ecbd1 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -24,7 +24,7 @@ torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) -repeat = 5 # 1GiB × 5 已经很重,按机器调整 +repeat = 3 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): cv_hex = fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) diff --git a/csrc/blake3_sm70.cu b/csrc/blake3_sm70.cu index 4a98b49..bb70888 100644 --- a/csrc/blake3_sm70.cu +++ b/csrc/blake3_sm70.cu @@ -35,8 +35,8 @@ enum : uint32_t { __device__ __noinline__ uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { uint32_t f = 0; - if (block_idx_in_chunk == 0) f |= FLAG_CHUNK_START; - if 
(block_idx_in_chunk == nblocks_in_chunk - 1) f |= FLAG_CHUNK_END; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root return f; } @@ -231,8 +231,16 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row v ^= (li == 14) ? block_len: 0u; v ^= (li == 15) ? flags : 0u; + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + // ===== 7 rounds ===== - #pragma unroll 1 // 不要unroll + #pragma unroll 4 for (int r = 0; r < 7; ++r) { // inside this loop, each lane will do one job // 16 lane will execute 16 x 2 operations @@ -246,20 +254,15 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) - int gi = (li & 3); // 0..3 - uint32_t mx = msg_rk(m_lane, r, 2*gi + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 2*gi + 1, mask16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); v = G_update_role(v, vb, vc, vd, mx, my, role); } // ---- 对角步 ---- { - // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) - int q = (li & 3); - int rq = (li >> 2); - int li_diag = (rq << 2) | ((q + rq) & 3); - // 在“对角置换域”取到当前 v 值 uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); @@ -269,14 +272,14 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) - int gi = (li_diag & 3); // 0..3 - uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 8 + 2*gi + 1, mask16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) - int li_undo = (rq << 2) | ((q - rq) & 3); + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); } @@ -292,15 +295,6 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row } // 由 lane0 / lane16 收集 8×u32 输出 - - // This will trigger problem! - // if (li == 0) { // only thread 0 and thread 16 will do this. 
- // #pragma unroll - // for (int j = 0; j < 8; ++j) { - // out_cv[j] = __shfl_sync(mask16, cv_word, j, 16); - // } - // } - #pragma unroll for (int j = 0; j < 8; ++j) { uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 @@ -328,7 +322,7 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 template // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, @@ -834,8 +828,12 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr int pad_cv = 0; + CUDA_CHECK(cudaFuncSetAttribute( - blake3_block_reduce_kernel, + blake3_block_reduce_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100)); uint8_t* d_bytes = const_cast(d_data); @@ -852,7 +850,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); From 4be8258f5e82aab4e57e8b70a604ebb9361d8aa0 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Mon, 6 Oct 2025 19:46:55 +0800 Subject: [PATCH 08/20] tune size --- README.md | 3 ++- csrc/blake3_sm70.cu | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0945c67..576c3e7 100644 --- a/README.md +++ b/README.md @@ -28,4 +28,5 @@ python benchmark/test_script.py + 10.5 - v1 - [commit:4f31a2c55551965a2e5f048565f021c4551554ae]: 1709.34 MiB/s + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s + 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) -+ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s \ No newline at end of file ++ 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s ++ 10.6 - v5 - [commit:]: 34261.37 MiB/s \ No newline at end of file diff --git a/csrc/blake3_sm70.cu b/csrc/blake3_sm70.cu index bb70888..e61b85b 100644 --- a/csrc/blake3_sm70.cu +++ b/csrc/blake3_sm70.cu @@ -331,7 +331,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, int total_chunks) { // NUM_WARPS also stands for NUM_CHUKNS per block constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 - constexpr int CHUNKS_PROCEED = 64; + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); @@ -820,11 +820,11 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - constexpr int NUM_THREADS = 512; // for big kernel - constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 - constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const 
uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks From 4c388a3a80fdb4c59f9c96da9dac613971b27cd4 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Tue, 7 Oct 2025 13:34:52 +0800 Subject: [PATCH 09/20] finish basic sm70_sm80 dev --- README.md | 5 +- backup_deprecated/blake3_sm70_v2.cu | 18 +- .../blake3_sm80_v3.cu | 151 ++- benchmark/perf.txt | 1 + benchmark/test_gpu.py | 8 +- csrc/binding.cpp | 4 +- csrc/blake3.h | 2 +- csrc/blake3_sm70_sm80.cu | 993 ++++++++++++++++++ requirements.txt | 2 + setup.py | 65 +- 10 files changed, 1168 insertions(+), 81 deletions(-) rename csrc/blake3_sm70.cu => backup_deprecated/blake3_sm80_v3.cu (90%) create mode 100644 benchmark/perf.txt create mode 100644 csrc/blake3_sm70_sm80.cu diff --git a/README.md b/README.md index 576c3e7..c435acf 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,9 @@ Proposal (from Google Drive): https://docs.google.com/document/d/10B3_nT8xF49vLg conda create -n ceg5206 python=3.12 pip install -r requirements.txt +# clone the cutlass repository +git clone https://github.com/NVIDIA/cutlass.git ~/cutlass --depth 1 + # install the cpp source file python setup.py install @@ -29,4 +32,4 @@ python benchmark/test_script.py + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s + 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) + 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s -+ 10.6 - v5 - [commit:]: 34261.37 MiB/s \ No newline at end of file ++ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s \ No newline at end of file diff --git a/backup_deprecated/blake3_sm70_v2.cu b/backup_deprecated/blake3_sm70_v2.cu index fc418c2..e61b85b 100644 --- a/backup_deprecated/blake3_sm70_v2.cu +++ b/backup_deprecated/blake3_sm70_v2.cu @@ -322,7 +322,7 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 template // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, uint32_t* block_cvs, @@ -331,7 +331,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, int total_chunks) { // NUM_WARPS also stands for NUM_CHUKNS per block constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // consider it as 16 - constexpr int CHUNKS_PROCEED = 64; + constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK; static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned"); static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4"); @@ -820,16 +820,20 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem)); + constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512 constexpr int CHUNK_SIZE = 1024; // 1 KiB constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - constexpr int NUM_THREADS = 512; // for big kernel - constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 - constexpr int CHUNKS_PER_BLOCK = 64; // 16 * 32 = 512 + constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64; // for big kernel, 512 or 256 + constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + + constexpr int pad_chunk = 16; + constexpr 
int pad_cv = 0; + CUDA_CHECK(cudaFuncSetAttribute( - blake3_block_reduce_kernel, + blake3_block_reduce_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100)); uint8_t* d_bytes = const_cast(d_data); @@ -846,7 +850,7 @@ void blake3_block_reduce_sm70(const uint8_t* d_data, dim3 block_big(NUM_THREADS); uint64_t base_chunk_counter = 0ull; - blake3_block_reduce_kernel + blake3_block_reduce_kernel <<>>( d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks); diff --git a/csrc/blake3_sm70.cu b/backup_deprecated/blake3_sm80_v3.cu similarity index 90% rename from csrc/blake3_sm70.cu rename to backup_deprecated/blake3_sm80_v3.cu index e61b85b..fdf1716 100644 --- a/csrc/blake3_sm70.cu +++ b/backup_deprecated/blake3_sm80_v3.cu @@ -1,10 +1,22 @@ +#include "cute/numeric/int.hpp" #include #include -#include #include #include +#include +#include +#include + +#if __CUDA_ARCH__ >= 800 + #include +#endif + +#include +#include +#include + #define WARP_SIZE 32 #define LDST128BITS(value) (reinterpret_cast(&(value))[0]) @@ -17,6 +29,20 @@ } \ } while(0) + +using namespace cute; + +using vec_t = cute::uint128_t; // one time loading 16 B + +#if __CUDA_ARCH__ >= 800 + // SM80 branch + using Atom = cute::Copy_Atom, vec_t>; +#else + // Volta/Turing(SM70/75)general copy + using Atom = cute::Copy_Atom, vec_t>; + // 也可用更保守的:using Atom = cute::Copy_Atom, vec_t>; +#endif + __host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u @@ -317,11 +343,13 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 blake3_state_to_cv(st, out_cv); } + + // ============ Big kernel: 16 WARPS in total ============ // grid: (chunks / 64), thread: (512,) -template 32 KB const int PAD_CHUNK=16, const int PAD_CV=0> // pad shared memory __global__ void blake3_block_reduce_kernel(uint32_t* d_input, @@ -340,19 +368,19 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, // 8 x 4 x 64 = 2 KiB shared memory in sum __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV]; // avoid bank conflict + constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 + constexpr int VEC_ELEMS = 4; // uint4, 16B + constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) + constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 + // 4 bytes x 256 x 64 = 64 KiB shared memory. 
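+    // Why the padding matters here (assuming 32 four-byte shared-memory banks):
+    // with a bare 256-word row stride, lanes li and li+16 read the same word
+    // offset of two adjacent rows and hit the same bank; PAD_CHUNK=16 makes the
+    // stride 272 ≡ 16 (mod 32), shifting the second sub-warp onto the other 16 banks.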
- __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][(CHUNK_SIZE / 4) + PAD_CHUNK]; // [64][256] + __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK]; // [64][256+PAD] const int tid = threadIdx.x; const int bx = blockIdx.x; const int warp_id = tid / WARP_SIZE; const int lane_id = tid % WARP_SIZE; - constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4; // 256 - constexpr int VEC_ELEMS = 4; // uint4 - constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS); // 64 (per 1KiB) - constexpr int WARPS_PER_CTA = NUM_WARPS; // 16 - // ============== STAGE 1: Coalsced Global Memory Loading ============== const int tile_id = blockIdx.x; const int tile_base = tile_id * CHUNKS_PER_BLOCK; // which chunk do this block start loading @@ -363,59 +391,82 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK; + // use CuTe for loading + + auto make_smem_tensor = [&](int chunk_local /* The chunk id on shared memory */) { + // smem index:chunk_smem[chunk_local][0] + // length: (WORDS_PER_CHUNK + PAD_CHUNK) + uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + vec_t* s_vec = reinterpret_cast(s_u32); // each element is 16B + return make_tensor(make_smem_ptr(s_vec), + make_shape(Int{}), // 64 x 4 = 256, one row of shared memory + make_stride(Int<1>{})); // continuous + }; + + // shared memory 256 items = 64 x vec_t(4) + // thread layout: 32 threads + // thread 0: 0, 32 + // thread 1: 1, 33 + // thread 32: 31,63 + // so we load data in coalsced mode + using Atom = cute::Copy_Atom, vec_t>; + + // thread layout. (lane,0)->idx=2*lane,(lane,1)->idx=2*lane+1 + auto thr_layout = make_layout( + make_shape(Int<2>{}), + make_stride(Int<1>{}) + ); + + auto tile_layout = make_layout( + make_shape(Int{}, Int<2>{}), // (32,2) + make_stride(Int<1>{}, Int{}) // idx = lane + k2*32 + ); + + // build the real copy + TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout); + + // this sentence will load + // auto thr_copy = local_partition(tcopy, lane_id); + + for (int ldt = 0; ldt < 4; ldt++) { // each warp load 4 chunks int chunk_local = ldt * WARPS_PER_CTA + warp_id; // ldt*16 + warp -> start chunk int chunk_global = tile_base + chunk_local; // global chunk idx // the pointer for shared memory - uint32_t* s_u32 = &chunk_smem[chunk_local][0]; + auto smem_vec1d = make_smem_tensor(chunk_local); // 64 x uint128_t // only read from global, when it's valid // or, we fill it with 0 - if (chunk_local < valid_chunks) { - const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; - - // move 16 bytes -> 128 bits each time - // each thread will load 2 x 16 bytes - // so 32 threads will load 32 x 2 x 16 = 1024 B - const uint4* __restrict__ g4 = reinterpret_cast(g_u32); - uint4* __restrict__ s4 = reinterpret_cast(s_u32); - - // idx = lane_id (0..31) 与 lane_id+32 (32..63) - int idx0 = lane_id; // 0..31 - int idx1 = lane_id + WARP_SIZE; // 32..63 - - // thread 0 -> 0, 32 - // thread 1 -> 1, 33 - // ... - // thread 31 -> 31, 63 - // so the global memory access is coalsced - - // notice, we load 16 bytes a time. the index is compressed - // tid 0 -> 0, tid 1 -> 16 - // tid 0 -> 32 x 16, tid 2 -> 32 x 16 + 16 - - uint4 v0 = g4[idx0]; // still, this step we load from gmem, in 4 elements aligned. 
- uint4 v1 = g4[idx1]; - - s_u32[4*idx0 + 0] = v0.x; // when load into shared mem, do manually - s_u32[4*idx0 + 1] = v0.y; - s_u32[4*idx0 + 2] = v0.z; - s_u32[4*idx0 + 3] = v0.w; - - s_u32[4*idx1 + 0] = v1.x; - s_u32[4*idx1 + 1] = v1.y; - s_u32[4*idx1 + 2] = v1.z; - s_u32[4*idx1 + 3] = v1.w; + const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK; + const vec_t* g_vec_base = reinterpret_cast(g_u32); // 16 B each time + + // the src is 64 x 4 = 256, so is dest + auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base), + make_shape(Int{}), // 64 x uint128_t + make_stride(Int<1>{})); + bool g_valid = (chunk_local < valid_chunks); + + auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout); + auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout); + + // now we will load this + auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id); + auto tCs = local_partition(smem_vec2d, tile_layout, lane_id); + + // launch the copy inst. + if (g_valid) { + // gmem load → smem store:twice 16B + copy(tcopy, tCg, tCs); } else { - uint4* s4 = reinterpret_cast(s_u32); - int idx0 = lane_id; - int idx1 = lane_id + WARP_SIZE; - s4[idx0] = make_uint4(0u,0u,0u,0u); - s4[idx1] = make_uint4(0u,0u,0u,0u); + for (int i = 0; i < size(tCs); ++i) { + *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); + } } } + cp_async_fence(); + cp_async_wait<0>(); __syncthreads(); // sync all warps diff --git a/benchmark/perf.txt b/benchmark/perf.txt new file mode 100644 index 0000000..8656b7e --- /dev/null +++ b/benchmark/perf.txt @@ -0,0 +1 @@ +10.6 V100: 53927.17 MiB/s \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index a8ecbd1..52b0977 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -6,7 +6,7 @@ GiB = 1024*1024*1024 # bytes -> 1 GiB -cpu = torch.empty(GiB * 1, dtype=torch.uint8, pin_memory=True) +cpu = torch.empty(GiB * 1, dtype=torch.uint8) cpu[:] = ord('A') # 一次性 H2D(可重用) @@ -20,14 +20,14 @@ # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 for _ in range(2): - fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) + fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) -repeat = 3 # 1GiB × 5 已经很重,按机器调整 +repeat = 5 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): - cv_hex = fh.blake3_gpu_sm70_hex(d.data_ptr(), d.numel(), stream) + cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) torch.cuda.synchronize() t1 = time.perf_counter() diff --git a/csrc/binding.cpp b/csrc/binding.cpp index d32b172..43290c4 100644 --- a/csrc/binding.cpp +++ b/csrc/binding.cpp @@ -178,7 +178,7 @@ static std::string blake3_gpu_root_hex(uint64_t device_ptr, std::array root{}; { GilRelease _g; - blake3_block_reduce_sm70(d_data, nbytes, &root, stream); + blake3_block_reduce_sm70_sm80(d_data, nbytes, &root, stream); } std::string b = cv_words_to_bytes_le(root); return bytes_to_hex(reinterpret_cast(b.data()), b.size()); @@ -206,7 +206,7 @@ PYBIND11_MODULE(flashashing, m) { // Return the 32-byte *root chaining value* (CV) computed on GPU for the given data. // NOTE: This is not the standard BLAKE3 digest/XOF output. It's the root CV. 
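// A hedged note on terminology (sketch only): in this codebase the 32-byte
// output is derived from a root CV by one extra compression with FLAG_ROOT
// set, taking the little-endian bytes of st[0..7], e.g.
//   uint32_t st[16];
//   blake3_compress_words_7r(block, root_cv, 0ull, 64u, FLAG_ROOT, st);
// which is what blake3_digest32_from_root_cv() in csrc/blake3_sm70_sm80.cu
// does; the official BLAKE3 XOF instead sets ROOT on the root node's own
// final block compression.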
// )pbdoc");
-    m.def("blake3_gpu_sm70_hex",
+    m.def("blake3_gpu_sm70_sm80_hex",
          &blake3_gpu_root_hex,
          py::arg("d_data"), py::arg("nbytes"), py::arg("stream")=0,
          R"pbdoc(
diff --git a/csrc/blake3.h b/csrc/blake3.h
index 52e762a..27509c9 100644
--- a/csrc/blake3.h
+++ b/csrc/blake3.h
@@ -30,7 +30,7 @@ std::string bytes_to_hex(const uint8_t *data, size_t len);
 
 // ============== GPU implementations ================
 
-void blake3_block_reduce_sm70(const uint8_t* d_data,
+void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
                        uint64_t bytes_len,
                        std::array<uint32_t, 8>* root_out = nullptr,
                        cudaStream_t stream = 0);
\ No newline at end of file
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
new file mode 100644
index 0000000..95b282f
--- /dev/null
+++ b/csrc/blake3_sm70_sm80.cu
@@ -0,0 +1,993 @@
+
+#include "cute/numeric/int.hpp"
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+
+#include <cstdlib>
+#include <array>
+
+#if __CUDA_ARCH__ >= 800
+    #include <cute/arch/copy_sm80.hpp>
+#endif
+
+#include <cute/tensor.hpp>
+#include <cute/layout.hpp>
+#include <cute/atom/copy_atom.hpp>
+
+#define WARP_SIZE 32
+#define LDST128BITS(value) (reinterpret_cast<float4 *>(&(value))[0])
+
+#define CUDA_CHECK(expr) do {                                           \
+        cudaError_t _e = (expr);                                        \
+        if (_e != cudaSuccess) {                                        \
+            fprintf(stderr, "CUDA error %s at %s:%d: %s\n",             \
+                    #expr, __FILE__, __LINE__, cudaGetErrorString(_e)); \
+            std::abort();                                               \
+        }                                                               \
+    } while(0)
+
+
+using namespace cute;
+
+using vec_t = cute::uint128_t;   // one 16 B load at a time
+
+#if __CUDA_ARCH__ >= 800
+    // SM80 branch
+    using Atom = cute::Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<vec_t>, vec_t>;
+#else
+    // Volta/Turing (SM70/75): generic copy
+    using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
+#endif
+
+__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = {
+    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
+    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+};
+
+enum : uint32_t {
+    FLAG_CHUNK_START        = 1u << 0,
+    FLAG_CHUNK_END          = 1u << 1,
+    FLAG_PARENT             = 1u << 2,
+    FLAG_ROOT               = 1u << 3,
+    FLAG_KEYED_HASH         = 1u << 4,
+    FLAG_DERIVE_KEY_CONTEXT = 1u << 5,
+    FLAG_DERIVE_KEY_MATERIAL= 1u << 6,
+};
+
+__device__ __noinline__
+uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) {
+    uint32_t f = 0;
+    f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START;
+    f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END;
+    if (is_root_chunk) f |= FLAG_ROOT;  // only when the whole message is this single chunk, i.e. this chunk is the root
+    return f;
+}
+
+__device__ __forceinline__
+uint32_t blake3_parent_flags(bool is_root_parent) {
+    return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0);
+}
+
+// ---- small helpers ----
+__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) {
+#if defined(__CUDA_ARCH__)
+    return __funnelshift_r(x, x, n);
+#else
+    return (x >> n) | (x << (32 - n));  // host path
+#endif
+}
+
+__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) {
+#if defined(__CUDA_ARCH__)
+    const uint4 v = *reinterpret_cast<const uint4*>(src);
+    dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w;
+#else
+    std::memcpy(dst, src, 16);
+#endif
+}
+
+__host__ __device__ void blake3_compress_words_7r(
+    const uint32_t block_words[16],  // 64B -> shared memory
+    const uint32_t cv[8],            // 8 x u32 -> shared memory
+    uint64_t chunk_counter,          // 64-bit
+    uint32_t block_len,              // [0..64]
+    uint32_t flags,                  // CHUNK_START/END/PARENT/ROOT...
+    uint32_t out_state[16])          // returns the 16 x u32 state vector (per the spec)
+{
+    // TODO: implement the 7 rounds of the BLAKE3 G function (based on BLAKE2s)
+    // placeholder for now: mixes IV+cv into out_state; replace with the real implementation
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_state[i] = cv[i];
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_state[8+i] = BLAKE3_IV[i];
+
+    out_state[12] ^= (uint32_t)chunk_counter;
+    out_state[13] ^= (uint32_t)(chunk_counter >> 32);
+    out_state[14] ^= block_len;
+    out_state[15] ^= flags;
+
+    // so far the block_words are still pointers;
+    // now we load them into registers, as pointed out by the ncu profile
+    uint32_t block_reg_1[4];
+
+#pragma unroll
+    for (int i = 0; i < 16; i += 4) {  // the gap is 4
+        // load_u128_u32x4(block_words + i, block_reg_1);
+        out_state[i] ^= block_words[i];
+        // a little stirring (placeholder)
+        out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31);
+    }
+}
+
+// derive the new CV from out_state (per the spec: state[0..7] ^ state[8..15])
+__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_cv[i] = st[i] ^ st[8+i];
+}
+
+// swap-table
+// BLAKE3 message schedule: rows are P^r, r=0..6.
+// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2)
+// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf
+__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = {
+    // r = 0: identity
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+    // r = 1: P
+    { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 },
+    // r = 2: P∘P
+    { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 },
+    // r = 3
+    { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 },
+    // r = 4
+    { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 },
+    // r = 5
+    { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 },
+    // r = 6
+    { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 },
+};
+
+// get the round-r, index-k message word; it is broadcast from the lane with li = k
+__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) {
+    int src = B3_MSG_SCHEDULE[round][k];
+    return __shfl_sync(mask16, m_lane, src, 16);
+}
+
+__device__ __noinline__
+uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d,
+                       uint32_t mx, uint32_t my, int role)
+{
+    // run the 32-bit BLAKE2s G sequence to compute a', b', c', d', then return the value for the current role
+    uint32_t a = v_self, b = v_b, c = v_c, d = v_d;
+
+    // a = a + b + mx;
+    // d ^= a;
+    // d >>>= 16
+    a = a + b + mx;
+    d ^= a;
+    d = rotr32(d, 16);
+
+    // c = c + d;
+    // b ^= c;
+    // b >>>= 12
+    c = c + d;
+    b ^= c;
+    b = rotr32(b, 12);
+
+    // a = a + b + my;
+    // d ^= a;
+    // d >>>= 8
+    a = a + b + my;
+    d ^= a;
+    d = rotr32(d, 8);
+
+    // c = c + d;
+    // b ^= c;
+    // b >>>= 7
+    c = c + d;
+    b ^= c;
+    b = rotr32(b, 7);
+
+    // role choice:
+    switch (role) {
+        case 0:  return a;
+        case 1:  return b;
+        case 2:  return c;
+        default: return d;
+    }
+}
+
+// notice that this function processes 2 chunks each time.
+// - chunk_words_row: current chunk
+// - out_cv: written by lane 0 or lane 16
+__device__ __noinline__
+void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row,  // 256 x u32 -> 1024 bytes, from shared memory
+                                                                                   // so chunks_row advances by a gap of 2
+                                    int chunk_len_bytes,
+                                    uint64_t chunk_counter,
+                                    uint32_t out_cv[8],
+                                    unsigned mask16) {
+    // compute indices
+    int lane = threadIdx.x & 31;  // lane_id: 0-31
+    int sub  = lane >> 4;         // 0/1
+    int li   = lane & 15;         // 0..15, logical lane id; e.g. lane 16 maps to li=0
+    int role = li & 3;            // a/b/c/d role
+    int base = (sub << 4);        // 0 or 16, the absolute base
+
+    const int nblocks = (chunk_len_bytes + 63) >> 6;  // ceil(chunk_len/64)
+
+    int warp_id = threadIdx.x / WARP_SIZE;
+
+    // initialize
+    uint32_t cv_word = 0;
+    if (li < 8) cv_word = BLAKE3_IV[li];
+
+    // process all blocks;
+    // here a 1024-byte chunk has 1024 / 64 = 16 blocks,
+    // each block is 64 B -> 16 x u32
+    for (int b = 0; b < nblocks; ++b) {
+        // each lane holds one u32;
+        // 16 lanes hold 16 x 4 = 64 B -> one block,
+        // the other 16 lanes hold the opposite 64 B
+        const uint32_t m_lane = chunk_words_row[b * 16 + li];
+
+        // initialize v: v[0..7] = cv, v[8..15] = IV, then v[12..15] ^= t/len/flags
+        // first prepare the initial value for this lane's own index:
+        uint32_t v = (li < 8)
+                   ? cv_word             // v[i] (i<8)
+                   : BLAKE3_IV[li - 8];  // v[8..15] <- IV
+
+        // counter / length / flags (per the BLAKE3 spec)
+        const uint32_t t0 = (uint32_t)chunk_counter;
+        const uint32_t t1 = (uint32_t)(chunk_counter >> 32);
+        const int remain = chunk_len_bytes - (b << 6);
+        const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain;
+
+        const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false);
+
+        // XOR the corresponding fields only into indices 12..15 (branch-free, via predicate masks)
+        v ^= (li == 12) ? t0        : 0u;
+        v ^= (li == 13) ? t1        : 0u;
+        v ^= (li == 14) ? block_len : 0u;
+        v ^= (li == 15) ? flags     : 0u;
+
+        // write the index as li = q + 4*rq; the diagonal step first permutes li' = (rq<<2) | ((q+rq)&3)
+        int q  = (li & 3);
+        int rq = (li >> 2);
+        int li_diag = (rq << 2) | ((q + rq) & 3);
+        int li_undo = (rq << 2) | ((q - rq) & 3);
+        int gi_col  = q;              // 0..3
+        int gi_diag = (li_diag & 3);  // 0..3
+
+        // ===== 7 rounds =====
+        #pragma unroll 4
+        for (int r = 0; r < 7; ++r) {
+            // inside this loop each lane does one job:
+            // 16 lanes execute 16 x 2 G-updates,
+            // which would be 8 G operations in sequential code
+
+            // ---- column step (quartets: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}) ----
+            {
+                // fetch b/c/d of the same quartet (based on the current v)
+                uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16);
+
+                // this quartet's i ∈ {0,1,2,3}; the column step uses msg indices 0..7 (in pairs)
+
+                uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16);
+
+                v = G_update_role(v, vb, vc, vd, mx, my, role);
+            }
+
+            // ---- diagonal step ----
+            {
+                // read the current v in the diagonally-permuted domain
+                uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16);
+
+                // fetch the same four neighbours as in the column step, inside that domain
+                uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16);
+                uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16);
+                uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16);
+
+                // the 4 diagonal G's use the second half of this round's message pairs (indices 8..15)
+
+                uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16);
+                uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16);
+
+                uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role);
+
+                // permute back to the original position: li_undo = (rq<<2) | ((q - rq) & 3)
+
+                // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16);
+                v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16);
+            }
+        }  // 7 rounds end
+
+        // derive the new CV: cv[i] = v[i] ^ v[i+8] (effective only for li=0..7)
+        uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16);
+        if (li < 8) {
+            cv_word = v ^ vip8_all;
+        }
+
+        // continue with the next block (the 16 blocks are serial within this function)
+    }
+
+    // lane0 / lane16 collect the 8 x u32 output
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        uint32_t wj = __shfl_sync(mask16, cv_word, j, 16);  // all 16 lanes must call this
+        if (li == 0) out_cv[j] = wj;                        // only li==0 stores
+    }
+}
+
+__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){
+    uint32_t msg[16];
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[i] = L[i];
+    }
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[8+i] = R[i];
+    }
+    uint32_t st[16];
+    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st);
+    blake3_state_to_cv(st, out_cv);
+}
+
+
+
+// ============ Big kernel: 16 WARPS in total ============
+// grid: (chunks / 64), thread: (512,)
+template <const int NUM_THREADS,
+          const int CHUNK_SIZE,
+          const int CHUNKS_PER_BLOCK,  // 32 chunks -> 32 KB
+          const int PAD_CHUNK=16,
+          const int PAD_CV=0>          // pad shared memory
+__global__ void blake3_block_reduce_kernel(uint32_t* d_input,
+                                           uint32_t* block_cvs,
+                                           int chunk_len_bytes,
+                                           uint64_t base_chunk_counter,
+                                           int total_chunks) {
+    // NUM_WARPS also stands for the number of chunks per block
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE;  // consider it as 16
+    constexpr int CHUNKS_PROCEED = CHUNKS_PER_BLOCK;
+
+    static_assert(((CHUNK_SIZE/4)+PAD_CHUNK) % 4 == 0, "chunk_smem row must be 16B aligned");
+    static_assert(((8+PAD_CV)) % 4 == 0, "cv_smem row should be 16B aligned if using uint4");
+
+    // 8 x u32 -> one chain value; we have `NUM_WARPS` chain values in total
+    // 8 x 4 x 64 = 2 KiB shared memory in sum
+    __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV];  // avoid bank conflicts
+
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;  // 256
+    constexpr int VEC_ELEMS = 4;                     // uint4, 16B
+    constexpr int VEC_PER_CHUNK = (CHUNK_SIZE) / (sizeof(uint32_t) * VEC_ELEMS);  // 64 (per 1 KiB)
+    constexpr int WARPS_PER_CTA = NUM_WARPS;  // 16
+
+    // 4 bytes x 256 x 64 = 64 KiB shared memory.
+    __shared__ __align__(16) uint32_t chunk_smem[CHUNKS_PROCEED][WORDS_PER_CHUNK + PAD_CHUNK];  // [64][256+PAD]
+
+    const int tid = threadIdx.x;
+    const int bx = blockIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    // ============== STAGE 1: Coalesced Global Memory Loading ==============
+    const int tile_id = blockIdx.x;
+    const int tile_base = tile_id * CHUNKS_PER_BLOCK;  // the first chunk this block loads
+
+    int valid_chunks = total_chunks - tile_base;
+    if (valid_chunks <= 0) {
+        return;  // overflow
+    }
+    if (valid_chunks > CHUNKS_PER_BLOCK) valid_chunks = CHUNKS_PER_BLOCK;
+
+    // use CuTe for loading
+
+    auto make_smem_tensor = [&](int chunk_local /* chunk id within shared memory */) {
+        // smem index: chunk_smem[chunk_local][0]
+        // row length: (WORDS_PER_CHUNK + PAD_CHUNK)
+        uint32_t* s_u32 = &chunk_smem[chunk_local][0];
+        vec_t* s_vec = reinterpret_cast<vec_t*>(s_u32);  // each element is 16B
+        return make_tensor(make_smem_ptr(s_vec),
+                           make_shape(Int<VEC_PER_CHUNK>{}),  // 64 x 4 = 256, one row of shared memory
+                           make_stride(Int<1>{}));            // contiguous
+    };
+
+    // shared memory: 256 words = 64 x vec_t(4)
+    // thread layout: 32 threads
+    //   thread 0:  0, 32
+    //   thread 1:  1, 33
+    //   thread 31: 31, 63
+    // so the loads are coalesced
+    using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
+
+    // thread layout. (lane,0)->idx=2*lane, (lane,1)->idx=2*lane+1
+    auto thr_layout = make_layout(
+        make_shape(Int<2>{}),
+        make_stride(Int<1>{})
+    );
+
+    auto tile_layout = make_layout(
+        make_shape(Int<WARP_SIZE>{}, Int<2>{}),   // (32,2)
+        make_stride(Int<1>{}, Int<WARP_SIZE>{})   // idx = lane + k2*32
+    );
+
+    // build the actual copy
+    TiledCopy tcopy = make_tiled_copy(Atom{}, thr_layout, tile_layout);
+
+    // (unused) this would build the per-thread copy partition
+    // auto thr_copy = local_partition(tcopy, lane_id);
+
+
+    for (int ldt = 0; ldt < 4; ldt++) {  // each warp loads 4 chunks
+        int chunk_local = ldt * WARPS_PER_CTA + warp_id;  // ldt*16 + warp -> start chunk
+        int chunk_global = tile_base + chunk_local;       // global chunk idx
+
+        // the pointer for shared memory
+        auto smem_vec1d = make_smem_tensor(chunk_local);  // 64 x uint128_t
+
+        // only read from global when it's valid,
+        // otherwise fill with 0
+        const uint32_t* g_u32 = d_input + (size_t)chunk_global * WORDS_PER_CHUNK;
+        const vec_t* g_vec_base = reinterpret_cast<const vec_t*>(g_u32);  // 16 B at a time
+
+        // the src is 64 x 4 = 256, and so is the dest
+        auto gmem_vec1d = make_tensor(make_gmem_ptr(g_vec_base),
+                                      make_shape(Int<VEC_PER_CHUNK>{}),  // 64 x uint128_t
+                                      make_stride(Int<1>{}));
+        bool g_valid = (chunk_local < valid_chunks);
+
+        auto smem_vec2d = make_tensor(smem_vec1d.data(), tile_layout);
+        auto gmem_vec2d = make_tensor(gmem_vec1d.data(), tile_layout);
+
+        // partition what each lane will move
+        auto tCg = local_partition(gmem_vec2d, tile_layout, lane_id);
+        auto tCs = local_partition(smem_vec2d, tile_layout, lane_id);
+
+        // launch the copy inst.
+        if (g_valid) {
+            // gmem load -> smem store: two 16B transfers
+            copy(tcopy, tCg, tCs);
+        } else {
+            for (int i = 0; i < size(tCs); ++i) {
+                *reinterpret_cast<vec_t*>(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0);
+            }
+        }
+    }
+    cp_async_fence();
+    cp_async_wait<0>();
+
+    __syncthreads();  // sync all warps
+
+    // ============== STAGE 2: Compress leaves into 64 chain values ==============
+    const int pass0_valid = min(32, valid_chunks);      // pass0 covers chunks [0, 31]
+    const int pass1_valid = max(0, valid_chunks - 32);  // pass1 covers chunks [32, 63]
+
+    __shared__ int parents_count;
+    if (threadIdx.x == 0) {
+        const int parents0 = (pass0_valid + 1) >> 1;
+        const int parents1 = (pass1_valid + 1) >> 1;
+        parents_count = parents0 + parents1;  // <= 32
+    }
+    __syncthreads();
+
+    // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n");
+
+    // written by each warp's lane0 and lane16,
+    // to decrease the register usage.
+    __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV];  // 2 KiB
+
+    // lambda: compress one pass of chunks
+    auto do_big_pass = [&](int base /* 0 or 32 */, int pass_valid) {
+        // left=base+2*warp_id, right=left+1
+        const int left  = base + (warp_id << 1);  // base + 0,2,4,6,...
+        const int right = left + 1;
+        const int left_rel  = left - base;        // 0..31
+        const int right_rel = right - base;       // 1..32
+        const bool has_left  = (left_rel < pass_valid);
+        const bool has_right = (right_rel < pass_valid);
+
+        // const int lane_id = threadIdx.x & 31;
+        const int sub = lane_id >> 4;  // sub-warp: 0 or 1; lanes 0-15: sub-warp 0, lanes 16-31: sub-warp 1
+        const int li  = lane_id & 15;  // 0..15
+        const unsigned mask16 = (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u);
+
+        const int chunk_local = left + sub;  // sub=0 -> left, sub=1 -> right
+        const bool active = (sub==0 ? (left - base) < pass_valid
+                                    : (right - base) < pass_valid);
+
+        // uint32_t my_cv[8];
+
+        // the left and right sub-warps execute the same code;
+        // the index is distinguished by computation,
+        // to avoid warp divergence
+        if (active) {
+            // chunk_local identifies the left or the right chunk, so do not worry.
+            const uint32_t* row = &chunk_smem[chunk_local][0];
+            const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local);
+            blake3_leaf_cv_simd16_onechunk(row,
+                                           chunk_len_bytes,
+                                           cc,
+                                           &tmp_cv[chunk_local][0],
+                                           mask16);
+        }
+
+        __syncwarp();  // make sure both halves have written into `tmp_cv`
+
+        // now we have computed the two chunks' CVs;
+        // merge them into one parent CV
+        if (lane_id == 0 && has_left) {
+            const uint32_t* lcv = &tmp_cv[left][0];
+            uint32_t parent[8];
+            if ((right - base) < pass_valid) {
+                const uint32_t* rcv = &tmp_cv[right][0];
+                blake3_parent_cv(lcv, rcv, parent);
+            } else {  // odd count: promote the left CV directly
+                #pragma unroll
+                for (int j = 0 ; j < 8; ++j)
+                    parent[j] = lcv[j];
+            }
+
+            // one warp compresses 2 chunks and yields one parent CV
+            const int pair_idx = (base >> 1) + warp_id;  // warp_id, or 16 + warp_id
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                cv_smem[pair_idx][j] = parent[j];
+        }
+
+        __syncwarp();  // NOTICE: this is necessary!
+    };  // do_big_pass
+
+    // big pass 1: chunks 0-31
+    do_big_pass(/*base=*/0, pass0_valid);
+
+    // if (bx == 0) printf("Finish 1 big pass\n");
+
+    // big pass 2: chunks 32-63
+    do_big_pass(/*base=*/32, pass1_valid);
+
+    // if (bx == 0) printf("Finish 2 big pass\n");
+
+    __syncthreads();
+
+    // printf("Stage 2 done!!!\n");
+
+    // right now we have got 32 chain values;
+    // a warp-reduce merges them.
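+    // Worked example of the tree shape so far (illustrative, assuming a
+    // full 64-chunk tile): STAGE 2 produced 64 leaf CVs and paired them
+    // into 32 parent CVs in cv_smem. STAGE 3 below keeps pairing
+    // 32 -> 16 -> 8 -> 4 -> 2 -> 1 with __shfl_down_sync, so each CTA
+    // emits exactly one block-level CV; an unpaired left node is promoted
+    // unchanged, matching the BLAKE3 tree rule for odd counts.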
+
+    // ============== STAGE 3: Block-Reduce ==============
+    // 32 - 16 - 8 - 4 - 2 - 1, to get one block root CV
+    // only warp 0 handles this
+    if (warp_id == 0) {
+        uint32_t cv[8] = {0,0,0,0,0,0,0,0};
+
+        const bool active_lane = (lane_id < parents_count);
+        if (active_lane) {
+            #pragma unroll
+            for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j];
+        }
+
+        // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1
+        unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane);
+        int cur_n = parents_count;  // number of valid nodes in the current layer (updated per level)
+
+        for (int step = 1; step < WARP_SIZE; step <<= 1) {
+            // right neighbour
+            uint32_t nbr[8];
+            #pragma unroll
+            for (int j = 0; j < 8; ++j) {
+                nbr[j] = __shfl_down_sync(mask, cv[j], step);
+            }
+
+            // safety checks
+            const bool do_pair =
+                (lane_id % (step << 1) == 0) &&  // this lane is a left node
+                (lane_id + step < cur_n)     &&  // the right node is within this layer's valid range
+                (lane_id < cur_n);               // the left node itself must also be valid
+
+            if (do_pair) {
+                blake3_parent_cv(cv, nbr, cv);   // parent(left, right) -> cv
+            }
+
+            cur_n = (cur_n + 1) >> 1;
+            __syncwarp(mask);
+        }
+
+        // 3) write back to global memory
+        if (lane_id == 0 && parents_count > 0) {
+            const int tile_id = blockIdx.x;
+            uint32_t* out = block_cvs + (size_t)tile_id * 8;  // 8 x 4 = 32 B
+
+            // two different ways to write
+            #if 0
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                out[j] = cv[j];
+            #else
+            // block_cvs must come from cudaMalloc (16B-aligned)
+            reinterpret_cast<uint4*>(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]);
+            reinterpret_cast<uint4*>(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]);
+            #endif
+        }
+    }
+}  // blake3_block_reduce_kernel
+
+__device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) {
+    const uint4* g4 = reinterpret_cast<const uint4*>(g);
+    uint4 a = g4[0], b = g4[1];
+    r[0]=a.x; r[1]=a.y; r[2]=a.z; r[3]=a.w;
+    r[4]=b.x; r[5]=b.y; r[6]=b.z; r[7]=b.w;
+}
+
+__device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
+    uint4* g4 = reinterpret_cast<uint4*>(g);
+    g4[0] = make_uint4(r[0],r[1],r[2],r[3]);
+    g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
+}
+
+// ============ Tiny kernel ============
+// The big kernel consumes 64 KiB per block.
+// For a 1 GiB corpus it produces 1 x 1024 x 1024 / 64 roots = 16384 roots,
+// and this tiny kernel is designed to reduce those 16384 roots.
+template <const int NUM_THREADS, const int TILE_CVS, const int PAD>
+__global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32,
+                                              uint32_t* __restrict__ out_cv32,
+                                              int N)
+{
+    extern __shared__ __align__(16) uint32_t smem[];  // dynamic SMEM; needs >= TILE_CVS*8*4 bytes
+    // viewed as 2D: [TILE_CVS][8+PAD]
+    uint32_t* cv_tile = smem;
+
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;  // 0..15
+    const int lane_id = tid % WARP_SIZE;  // 0..31
+
+    // start of the tile this block is responsible for
+    const int tile_start = blockIdx.x * TILE_CVS;
+    if (tile_start >= N) return;
+
+    // when N == 8 this is simply 8
+    const int tile_n = min(TILE_CVS, N - tile_start);  // actual number of CVs in this tile (<= 2048)
+
+    // ---------------- Stage 1: coalesced loading into SMEM ----------------
+    // each thread moves several CVs: i = tid, tid+blockDim, ...
+    for (int i = tid; i < tile_n; i += NUM_THREADS) {  // note: i starts at tid, not 0
+        const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8;
+        uint32_t* s = cv_tile + (size_t)i * (8 + PAD);
+        // two 16B transfers
+        const uint4* g4 = reinterpret_cast<const uint4*>(g);
+        uint4* s4 = reinterpret_cast<uint4*>(s);
+        // s4[0] = g4[0];
+        // s4[1] = g4[1];
+
+        // in case the smem address is not 16B-aligned
+        uint4 v0 = g4[0];
+        uint4 v1 = g4[1];
+
+        s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w;
+        s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w;
+    }
+    // no need to zero the tail when tile_n < TILE_CVS; later stages honour the valid range
+    __syncthreads();
+
+    // ---------------- Stage 2: per-thread 4 -> 1 (keeping adjacent pairs) ----------------
+    // there are reduced_n0 = ceil(tile_n / 4) lane-roots in total
+    const int reduced_n0 = (tile_n + 3) >> 1 >> 1;  // equivalent to (tile_n+3)/4
+    uint32_t lane_cv[8];    // the lane-root this thread produces
+    bool lane_valid = false;
+
+    // start index of this thread's 4 inputs
+    int base4 = tid << 2;  // tid*4
+    if (base4 < tile_n) {
+        // read up to 4 adjacent CVs: idx = base4 + 0,1,2,3
+        uint32_t a[8], b[8], c[8], d[8];
+        const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
+        load_cv_g2r(s0, a);
+
+        int remain = tile_n - base4;
+
+        if (remain >= 2) {
+            const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
+            load_cv_g2r(s1, b);
+        }
+        if (remain >= 3) {
+            const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD);
+            load_cv_g2r(s2, c);
+        }
+        if (remain >= 4) {
+            const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD);
+            load_cv_g2r(s3, d);
+        }
+
+        // two levels of adjacent pairing (odd nodes are promoted)
+        if (remain == 1) {
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                lane_cv[j] = a[j];
+        } else if (remain == 2) {
+            blake3_parent_cv(a, b, lane_cv);
+        } else if (remain == 3) {
+            uint32_t p01[8];
+            blake3_parent_cv(a, b, p01);
+            blake3_parent_cv(p01, c, lane_cv);  // (0,1)->p01, (p01,c)->lane_cv
+        } else {  // remain >= 4
+            uint32_t p01[8], p23[8];
+            blake3_parent_cv(a, b, p01);
+            blake3_parent_cv(c, d, p23);
+            blake3_parent_cv(p01, p23, lane_cv);
+        }
+        lane_valid = true;
+    }
+
+    // ---------------- Stage 3: 32 -> 1 adjacent pairing within each warp ----------------
+    // each warp owns one contiguous segment: warp_base = warp_id*32
+    const int warp_base = warp_id * WARP_SIZE;
+    const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base));  // valid entries in this warp's segment
+
+    // keep lane_cv in registers for the reduction; invalid lanes are culled via the mask
+    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4));  // existence check only
+    int cur_n = cur_n_w;
+
+    // mark out-of-segment lanes invalid (avoids out-of-bounds reads)
+    bool active_lane = (lane_id < cur_n_w);
+
+    // clear the values of invalid lanes to 0 (they are never used)
+    if (!active_lane) {
+        #pragma unroll
+        for (int j = 0; j < 8; ++j)
+            lane_cv[j] = 0u;
+    }
+
+    // pair level by level: steps 1,2,4,8,16 (warp-reduce)
+    for (int step = 1; step < WARP_SIZE; step <<= 1) {
+        // fetch the right neighbour
+        uint32_t nbr[8];
+        #pragma unroll
+        for (int j = 0; j < 8; ++j)
+            nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step);
+
+        const bool do_pair =
+            active_lane &&
+            ((lane_id % (step<<1)) == 0) &&
+            (lane_id + step < cur_n);
+
+        if (do_pair) {
+            blake3_parent_cv(lane_cv, nbr, lane_cv);
+        }
+
+        cur_n = (cur_n + 1) >> 1;
+        // __syncwarp();
+    }
+
+    // each segment's result lives in lane0; write the 16 warp-roots into the first 16 SMEM rows
+    __shared__ uint32_t warp_roots[WARP_SIZE/2][8];  // 16 x 8
+    if (lane_id == 0 && cur_n_w > 0) {
+        #pragma unroll
+        for (int j=0;j<8;++j)
+            warp_roots[warp_id][j] = lane_cv[j];
+    }
+    __syncthreads();
+
+    // ---------------- Stage 4: 16 -> 1 adjacent pairing within the CTA ----------------
+    // number of valid warps: ceil(reduced_n0 / 32)
+    int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE;  // 0..16
+    if (valid_warps == 0) return;
+
+    // lane 0 of each warp does the computation; the other lanes idle
+    for (int stride = (valid_warps >> 1); stride >= 1; stride >>= 1) {
+        if (warp_id < stride && lane_id == 0) {
+            uint32_t p[8];
+            blake3_parent_cv(&warp_roots[2*warp_id][0],
+                             &warp_roots[2*warp_id + 1][0], p);
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
+                warp_roots[warp_id][j] = p[j];
+        }
+        __syncthreads();
+        // odd node is promoted
+        if ((valid_warps & 1) && warp_id==0 && lane_id==0) {
+            #pragma unroll
+            for (int j=0;j<8;++j)
+                warp_roots[stride][j] = warp_roots[valid_warps-1][j];
+        }
+        __syncthreads();
+        valid_warps = (valid_warps + 1) >> 1;
+    }
+
+    // write back this block's root
+    if (threadIdx.x == 0) {
+        uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8;
+        #pragma unroll
+        for (int j = 0; j < 8; ++j)
+            out[j] = warp_roots[0][j];
+    }
+}
+
+inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
+    const uint32_t zero_block[16] = {0};
+    uint32_t st[16];
+    blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st);
+    // write out the first 32 bytes (state[0..7], little-endian)
+    for (int i = 0; i < 8; ++i) {
+        uint32_t w = st[i];
+        out32[4*i+0] = (uint8_t)( w        & 0xFF);
+        out32[4*i+1] = (uint8_t)((w >> 8 ) & 0xFF);
+        out32[4*i+2] = (uint8_t)((w >> 16) & 0xFF);
+        out32[4*i+3] = (uint8_t)((w >> 24) & 0xFF);
+    }
+}
+
+// wrapper function
+void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
+                                   uint64_t bytes_len,
+                                   std::array<uint32_t, 8>* root_out = nullptr,
+                                   cudaStream_t stream = 0) {
+    if ((bytes_len % 1024ull) != 0ull || (bytes_len % 4ull) != 0ull) {
+        fprintf(stderr, "[blake3] bytes_len must be a multiple of 1024 and 4 (got %llu)\n",
+                (unsigned long long)bytes_len);
+        std::abort();
+    }
+
+    // int dev = -1;
+    // cudaGetDevice(&dev);
+    // printf("[dbg] my runtime current device = %d\n", dev);
+
+    // cudaPointerAttributes attr{};
+    // auto st = cudaPointerGetAttributes(&attr, d_data);
+    // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n",
+    //        (int)st, (int)attr.type, attr.device);
+
+    // cudaPointerAttributes attr{};
+    // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data));
+    // if (attr.type != cudaMemoryTypeDevice) {
+    //     fprintf(stderr, "d_data is not device memory!\n");
+    //     std::abort();
+    // }
+
+    int optin = 0, deflt = 0;
+    cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
+    cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
+
+    const int dyn_smem = 64 * 1024;  // 64 KiB
+
+    // opt these kernels in to the larger dynamic shared-memory carve-out
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_cv_block_reduce_kernel<512, 2048, 0>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_cv_block_reduce_kernel<32, 2048, 0>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+
+
+    constexpr int CHUNKS_PER_BLOCK = 32;                      // 16 * 32 = 512
+    constexpr int CHUNK_SIZE = 1024;                          // 1 KiB
+    constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;           // 256
+    constexpr int NUM_THREADS = CHUNKS_PER_BLOCK * 512 / 64;  // for the big kernel: 512 or 256
+    constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE;  // 16 or 8
+    const int chunk_len_bytes = CHUNK_SIZE;  // 1 KiB per chunk
+    const uint64_t total_chunks = bytes_len / CHUNK_SIZE;
+    const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK);  // e.g. 16384 blocks
+
+    constexpr int pad_chunk = 16;
+    constexpr int pad_cv = 0;
+
+    CUDA_CHECK(cudaFuncSetAttribute(
+        blake3_block_reduce_kernel<NUM_THREADS, CHUNK_SIZE, CHUNKS_PER_BLOCK, pad_chunk, pad_cv>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 100));
+
+    uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
+    uint32_t* d_words = reinterpret_cast<uint32_t*>(d_bytes);  // alias
+    uint32_t* d_blockCV = nullptr;  // num_blocks x 8 u32
+
+    // here we cut the largest bottleneck: do not allocate GPU memory here, do it in PyTorch.
+    // TODO: use thrust
+    cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t));  // 512 KiB
+
+    // ============= launch the big kernel =============
+    dim3 grid_big(num_blocks);
+    dim3 block_big(NUM_THREADS);
+    uint64_t base_chunk_counter = 0ull;
+
+    blake3_block_reduce_kernel<NUM_THREADS, CHUNK_SIZE, CHUNKS_PER_BLOCK, pad_chunk, pad_cv>
+        <<<grid_big, block_big, 0, stream>>>(
+            d_words, d_blockCV, chunk_len_bytes, base_chunk_counter, (int)total_chunks);
+
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    if (num_blocks == 1) {
+        std::array<uint32_t, 8> host_root{};
+        CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_blockCV, 8*sizeof(uint32_t),
+                                   cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+
+        // final post-processing
+        uint8_t digest32[32];
+        blake3_digest32_from_root_cv(host_root.data(), digest32);
+
+        if (root_out) *root_out = host_root;
+        else {
+            // simple printout
+            printf("root CV:");
+            for (int i=0;i<8;++i)
+                printf(" %08x", host_root[i]);
+            printf("\n");
+        }
+
+        CUDA_CHECK(cudaFree(d_blockCV));
+        CUDA_CHECK(cudaFree(d_bytes));
+        return;
+    }
+
+    // first round of the tiny kernel
+    // 1) reduce 16384 outputs -> 8
+    uint32_t* d_mid_out = nullptr;  // num_blocks x 8 u32
+    {
+        const int N = 16384;  // total number
+        const int TILE = 2048;
+        const int grid = (N + TILE - 1) / TILE;  // = 8
+        const int block = 512;
+        const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t);  // 2048 x 8 x 4 = 64 KiB
+
+        cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t));
+
+        blake3_cv_block_reduce_kernel<512, 2048, 0>
+            <<<grid, block, smem_bytes, stream>>>(d_blockCV /*in: 16384 x 8 x 4*/,
+                                                  d_mid_out /*out: 8 x 8*/, N);
+        CUDA_CHECK(cudaGetLastError());
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    // second round
+    uint32_t* d_root_cv = nullptr;
+    {
+        const int N = 8;
+        const int TILE = 2048;  // any value >= N works
+        const int grid = 1;
+        const int block = 32;   // 32 threads are enough
+        const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t);  // 8 x 8 x 4 = 8 x 32 B
+
+        cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t));
+
+        blake3_cv_block_reduce_kernel<32, 2048, 0>
+            <<<grid, block, smem_bytes, stream>>>(d_mid_out /*in: 8 x 8*/,
+                                                  d_root_cv /*out: 1 x 8*/, N);
+        CUDA_CHECK(cudaGetLastError());
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    std::array<uint32_t, 8> host_root{};
+    CUDA_CHECK(cudaMemcpyAsync(host_root.data(), d_root_cv, 8*sizeof(uint32_t),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    // final post-processing
+    uint8_t digest32[32];
+    blake3_digest32_from_root_cv(host_root.data(), digest32);
+
+    if (root_out) {
+        *root_out = host_root;
+    } else {
+        printf("root CV:");
+        for (int i=0;i<8;++i) printf(" %08x", host_root[i]);
+        printf("\n");
+    }
+
+    // cleanup
+    CUDA_CHECK(cudaFree(d_mid_out));
+    CUDA_CHECK(cudaFree(d_root_cv));
+    CUDA_CHECK(cudaFree(d_blockCV));
+    // CUDA_CHECK(cudaFree(d_bytes));  // this memory is managed by torch, we do not free it.
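+    // Size check (illustrative, not asserted by this code): with a 1 GiB
+    // input there are 2^20 chunks, so the big kernel emits
+    // 2^20 / CHUNKS_PER_BLOCK block CVs; one 2048-way tiny-kernel pass and
+    // one final pass then reduce that to a single root CV. Note the
+    // hard-coded N = 16384 above assumes that big-kernel output count.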
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b62068c..68e8d6d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,5 @@ pandas==2.3.3
 pip-chill==1.0.3
 pybind11==3.0.1
 tomli==2.0.1
+torch
+blake3
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e047cc4..599e2f0 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,44 @@ def locate_cuda():
         cuda_libdir = os.path.join(cuda_home, "lib64")
     return {"home": cuda_home, "nvcc": nvcc, "include": cuda_include, "libdir": cuda_libdir}
 
+def locate_cutlass_cute():
+    """
+    Return the CUTLASS/CuTe include root (the directory containing cutlass/ and cute/).
+    The environment variables CUTLASS_HOME and CUTE_HOME take priority.
+    """
+    cand = []
+    if "CUTE_HOME" in os.environ:
+        cand.append(os.path.join(os.environ["CUTE_HOME"], "include"))
+        cand.append(os.environ["CUTE_HOME"])
+    if "CUTLASS_HOME" in os.environ:
+        cand.append(os.path.join(os.environ["CUTLASS_HOME"], "include"))
+        cand.append(os.environ["CUTLASS_HOME"])
+
+    cand += [
+        "/usr/local/include",                               # common on Linux
+        os.path.expanduser("~/cutlass/include"),            # cloned into the home directory
+        os.path.expanduser("~/CUTLASS/include"),
+        os.path.expanduser("~/third_party/cutlass/include"),
+        os.path.abspath("third_party/cutlass/include"),
+    ]
+
+    def ok(p):
+        return p and os.path.isdir(p) and \
+               os.path.isdir(os.path.join(p, "cute"))       # CuTe lives in cutlass/include/cute
+
+    for p in cand:
+        if ok(p):
+            return os.path.abspath(p)
+
+    raise RuntimeError(
+        "Cannot find CUTLASS/CuTe include path.\n"
+        "Set CUTLASS_HOME or CUTE_HOME to the repository root (the path that contains 'include/cute')."
+    )
+
 CUDA = locate_cuda()
+CUTLASS_INCLUDE = locate_cutlass_cute()
+
+print(f"Found cutlass include: {CUTLASS_INCLUDE}")
 
 CXX_STD = 17
 
@@ -42,7 +79,13 @@ def locate_cuda():
     NVCC_ARCH_FLAGS += ["-gencode", f"arch=compute_{a},code=sm_{a}"]
 
 COMMON_DEFINES = []
-COMMON_INCLUDES = [np.get_include(), pybind11.get_include(), pybind11.get_include(user=True), CUDA["include"]]
+COMMON_INCLUDES = [
+    np.get_include(),
+    pybind11.get_include(),
+    pybind11.get_include(user=True),
+    CUDA["include"],
+    CUTLASS_INCLUDE,
+]
 COMMON_LIB_DIRS = [CUDA["libdir"]]
 COMMON_LIBS = ["cudart"]
 RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else []
@@ -89,24 +132,10 @@ def locate_cuda():
     "csrc/sha256_base.cpp",
     "csrc/sha256_simd.cpp",
     "csrc/blake3_base.cpp",
-    "csrc/blake3_sm70.cu",
+    "csrc/blake3_sm70_sm80.cu",
     "csrc/binding.cpp",
 ]
 
-# ext = Pybind11Extension(
-#     "flashashing",
-#     sources=sources,
-#     include_dirs=COMMON_INCLUDES,
-#     library_dirs=COMMON_LIB_DIRS,
-#     libraries=COMMON_LIBS,
-#     extra_compile_args={
-#         "cxx": CXX_FLAGS,
-#         "nvcc": []
-#     },
-#     extra_link_args=LINK_FLAGS,
-#     define_macros=[("PYBIND11_DETAILED_ERROR_MESSAGES", "1")] + [(d, None) for d in COMMON_DEFINES],
-# )
-
 setup(
     name="flashashing",
     ext_modules=[
@@ -117,6 +146,10 @@ def locate_cuda():
                 "cxx": CXX_FLAGS,
                 "nvcc": NVCC_FLAGS,
             },
+            include_dirs=COMMON_INCLUDES,
+            library_dirs=COMMON_LIB_DIRS,
+            libraries=COMMON_LIBS,
+            extra_link_args=LINK_FLAGS + (["-Wl,-rpath," + RPATH[0]] if RPATH else []),
         )
     ],
     cmdclass={"build_ext": BuildExtension},

From 263756c78e23f2e5e255e9b4088f8bba427590be Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Tue, 7 Oct 2025 14:09:26 +0800
Subject: [PATCH 10/20] bug fix for SM70
---
 README.md                | 2 ++
 csrc/blake3_sm70_sm80.cu | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c435acf..0a6cf25 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@ pip install -r requirements.txt
 # clone the cutlass repository
 git clone https://github.com/NVIDIA/cutlass.git ~/cutlass --depth 1
 
+export FLASHASHING_CUDA_ARCH_LIST=  # your arch here
+
 # install the cpp source file
 python setup.py install
 
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 95b282f..989eafa 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -9,7 +9,7 @@
 #include <cute/tensor.hpp>
 #include <cute/layout.hpp>
 
-#if __CUDA_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     #include <cute/arch/copy_sm80.hpp>
 #endif
 
@@ -34,7 +34,7 @@ using namespace cute;
 
 using vec_t = cute::uint128_t;   // one 16 B load at a time
 
-#if __CUDA_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     // SM80 branch
     using Atom = cute::Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<vec_t>, vec_t>;
 #else
@@ -408,7 +408,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
     //   thread 1:  1, 33
     //   thread 31: 31, 63
     // so the loads are coalesced
-    using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
+    // using Atom = cute::Copy_Atom<UniversalCopy<vec_t>, vec_t>;
 
     // thread layout. (lane,0)->idx=2*lane, (lane,1)->idx=2*lane+1
     auto thr_layout = make_layout(
@@ -464,8 +464,11 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
             }
         }
     }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     cp_async_fence();
     cp_async_wait<0>();
+#endif
 
     __syncthreads();  // sync all warps
 

From 7f8e35dbe17c4e26eec478d699884f96a31c4e13 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Tue, 7 Oct 2025 18:33:36 +0800
Subject: [PATCH 11/20] debug
---
 benchmark/perf.txt       |   3 +-
 benchmark/test_gpu.py    |  51 ++++++++-----
 csrc/blake3_sm70_sm80.cu | 150 ++++++++++++++++++++++++---------------
 setup.py                 |   9 +--
 4 files changed, 134 insertions(+), 79 deletions(-)

diff --git a/benchmark/perf.txt b/benchmark/perf.txt
index 8656b7e..2920e89 100644
--- a/benchmark/perf.txt
+++ b/benchmark/perf.txt
@@ -1 +1,2 @@
-10.6 V100: 53927.17 MiB/s
\ No newline at end of file
+10.7 V100: 54819.25 MiB/s
+     RTX 4090: 150585.83 MiB/s

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 52b0977..33e14a0 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -4,6 +4,9 @@
 import time
 import blake3
 
+check_accuracy = True
+check_perf = False
+
 GiB = 1024*1024*1024  # bytes -> 1 GiB
 
 cpu = torch.empty(GiB * 1, dtype=torch.uint8)
@@ -18,23 +21,37 @@
 
 # std_hex = blake3.blake3(data).hexdigest()
 
-# 2) warm-up: trigger JIT/driver initialization so the first run isn't slow
-for _ in range(2):
-    fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-torch.cuda.synchronize()
-
-# 3) actual timing: this measures end-to-end throughput (including H2D)
-repeat = 5  # 1 GiB x 5 is already heavy; adjust per machine
-t0 = time.perf_counter()
-for _ in range(repeat):
+if check_accuracy:
+    # 2) GPU version
     cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-torch.cuda.synchronize()
-t1 = time.perf_counter()
-
-t1 = time.perf_counter()
-elapsed = t1 - t0
-print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds")
-print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s")
-print("root CV (hex) =", cv_hex)
+    torch.cuda.synchronize()
+    print("GPU BLAKE3 Result: ", cv_hex)
+
+    # 1) CPU version
+    std_hex = blake3.blake3(cpu.numpy()).hexdigest()
+    print("std BLAKE3 Expected: ", std_hex)
+
+    assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!"
+    print("GPU BLAKE3 result matches CPU result!")
+
+if check_perf:
+    # 2) warm-up: trigger JIT/driver initialization so the first run isn't slow
+    for _ in range(2):
+        fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+    torch.cuda.synchronize()
+
+    # 3) actual timing: this measures end-to-end throughput (including H2D)
+    repeat = 5  # 1 GiB x 5 is already heavy; adjust per machine
+    t0 = time.perf_counter()
+    for _ in range(repeat):
+        cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+    torch.cuda.synchronize()
+    t1 = time.perf_counter()
+
+    elapsed = t1 - t0
+    print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds")
+    print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s")
+    print("root CV (hex) =", cv_hex)
 
 # print(f"std BLAKE3 Expected: {std_hex}")
\ No newline at end of file
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 989eafa..a9f3ec9 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -10,7 +10,7 @@
 #include <cute/layout.hpp>
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    #include <cute/arch/copy_sm80.hpp>
+#include <cute/arch/copy_sm80.hpp>
 #endif
 
@@ -365,7 +365,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
 
     // 8 x u32 -> one chain value; we have `NUM_WARPS` chain values in total
     // 8 x 4 x 64 = 2 KiB shared memory in sum
-    __shared__ __align__(16) uint32_t cv_smem[32][8 + PAD_CV];  // avoid bank conflicts
+    __shared__ __align__(16) uint32_t cv_smem[CHUNKS_PER_BLOCK / 2][8 + PAD_CV];  // avoid bank conflicts
 
     constexpr int WORDS_PER_CHUNK = CHUNK_SIZE / 4;  // 256
     constexpr int VEC_ELEMS = 4;                     // uint4, 16B
@@ -427,7 +427,6 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
 
     // (unused) this would build the per-thread copy partition
     // auto thr_copy = local_partition(tcopy, lane_id);
-
     for (int ldt = 0; ldt < 4; ldt++) {  // each warp loads 4 chunks
         int chunk_local = ldt * WARPS_PER_CTA + warp_id;  // ldt*16 + warp -> start chunk
@@ -456,8 +455,12 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
 
         // launch the copy inst.
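        // NOTE on the pairing below (per the CuTe cp.async API):
        // cp_async_fence() commits the async copies issued so far as one
        // group, and cp_async_wait<0>() blocks until every committed group
        // has landed in shared memory. Moving the fence/wait inside the
        // loop, as this patch does, is safe but waits once per
        // warp-iteration instead of once for all four chunk loads.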
if (g_valid) { - // gmem load → smem store:twice 16B + // gmem load → smem store: 16B x 2 copy(tcopy, tCg, tCs); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + cp_async_fence(); + cp_async_wait<0>(); +#endif } else { for (int i = 0; i < size(tCs); ++i) { *reinterpret_cast(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0); @@ -465,33 +468,34 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - cp_async_fence(); - cp_async_wait<0>(); -#endif - __syncthreads(); // sync all warps - // ============== STAGE 2: Compress Leaf to 64 chain value ============== - const int pass0_valid = min(32, valid_chunks); // pass0 cover [0, 31] chunks - const int pass1_valid = max(0, valid_chunks - 32); // pass1 cover [32, 63] chunks +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("Stage 1 finish processing\n"); + } +#endif + + // ============== STAGE 2: Compress Leaf to `CHUNKS_PER_BLOCK` chain value ============== + const int pass0_valid = min(CHUNKS_PER_BLOCK / 2, valid_chunks); // pass0 cover [0, CHUNKS_PER_BLOCK / 2] chunks + const int pass1_valid = max(0, valid_chunks - (CHUNKS_PER_BLOCK / 2)); // pass1 cover [CHUNKS_PER_BLOCK / 2, CHUNKS_PER_BLOCK] chunks __shared__ int parents_count; if (threadIdx.x == 0) { - const int parents0 = (pass0_valid + 1) >> 1; - const int parents1 = (pass1_valid + 1) >> 1; + const int parents0 = (pass0_valid + 1) >> 1; // 8 or 16 + const int parents1 = (pass1_valid + 1) >> 1; // 8 or 16 parents_count = parents0 + parents1; // ≤ 32 } __syncthreads(); - // if (blockIdx.x==0 && threadIdx.x==0) printf("after stage1\n"); - // this is for each warp's lane0 and lane16 written // to decrease the register usage. __shared__ __align__(16) uint32_t tmp_cv[CHUNKS_PER_BLOCK][8 + PAD_CV]; // 2 KiB // lambda function: compress this thing - auto do_big_pass = [&](int base /*0 或 32*/, int pass_valid) { + // 64 chunks - 16 warps, 1 warp -> 4 chunks -> 2pass + // 32 chunks - 8 warps, 1 warp -> 4 chunks -> 2pass + auto do_big_pass = [&](int base /*0 or chunks/2 */, int pass_valid) { // left=base+2*warp_id, right=left+1 const int left = base + (warp_id << 1); // base + 0,2,4,6,... const int right = left + 1; @@ -499,33 +503,51 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, const int right_rel = right - base; // 1..32 const bool has_left = (left_rel < pass_valid); const bool has_right = (right_rel < pass_valid); - +#if defined(DBG_KERNEL) && DBG_KERNEL + if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { + printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); + } +#endif + // const int lane_id = threadIdx.x & 31; const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 const int li = lane_id & 15; // 0..15 - const unsigned mask16= (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); + + const unsigned full = __activemask(); + const unsigned submask = (sub==0 ? 0x0000FFFFu : 0xFFFF0000u) & full; + const unsigned mask16 = (sub == 0 ? 0x0000FFFFu : 0xFFFF0000u); const int chunk_local = left + sub; // sub=0→left, sub=1→right - const bool active = (sub==0 ? (left - base) < pass_valid - : (right - base) < pass_valid); + const bool valid_sub = (sub==0 ? 
(left_rel < pass_valid) : (right_rel < pass_valid)); // uint32_t my_cv[8]; // the left-sub-warp and right-sub-warp will execute the same code // distinguish the index by computing, // to avoid warp-divergence - if (active) { - // the chunk local identifies the left or right chunk, so do not worry. - const uint32_t* row = &chunk_smem[chunk_local][0]; - const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); - blake3_leaf_cv_simd16_onechunk(row, - chunk_len_bytes, - cc, - &tmp_cv[chunk_local][0], - mask16); + + // the chunk local identifies the left or right chunk, so do not worry. + const uint32_t* row = valid_sub ? &chunk_smem[chunk_local][0] : &chunk_smem[0][0]; + uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr; + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (!valid_sub) { + printf("tile %d warp %d not valid in base %d, pass_valid %d\n", blockIdx.x, threadIdx.x/32, base, pass_valid); } +#endif + const uint64_t cc = base_chunk_counter + (uint64_t)(tile_base + chunk_local); + blake3_leaf_cv_simd16_onechunk(row, + chunk_len_bytes, + cc, + out, + submask); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("tile %d warp %d ready\n", blockIdx.x, threadIdx.x/32); + } +#endif - __syncwarp(); // make sure two warps written into `tmp_cv` + __syncwarp(full); // make sure two warps written into `tmp_cv` // now we have compute 2 chunks' cv // merge it to a parent cv @@ -548,46 +570,59 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, cv_smem[pair_idx][j] = parent[j]; } - __syncwarp(); // NOTICE: this is necessary! + __syncwarp(full); // NOTICE: this is necessary! }; // do_big_pass - // big-pass 1: computing 0-31 chunks - do_big_pass(/*base=*/0, pass0_valid); + // big-pass 1: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(0, pass0_valid); - // if (bx == 0) printf("Finish 1 big pass\n"); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass0_valid != 0) { + printf("Stage 2 - pass 1 finish processing\n"); + } +#endif - // big-pass 2: computing 32-63 chunks - do_big_pass(/*base=*/32, pass1_valid); + // big-pass 2: computing [CHUNK_PER_BLOCK / 2] chunks + do_big_pass(CHUNKS_PER_BLOCK / 2, pass1_valid); - // if (bx == 0) printf("Finish 2 big pass\n"); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0 && pass1_valid != 0) { + printf("Stage 2 - pass 2 finish processing, pass 1: %d\n", pass1_valid); + } +#endif __syncthreads(); - // printf("Stage 2 done!!!\n"); - // right now, we have got 32 chain values // a warp-reduce to merge. 
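+        // Lane-mapping example for the SIMD16 layout above (illustrative):
+        // lane 21 has sub = 21 >> 4 = 1 and li = 21 & 15 = 5, so it is a
+        // "b" lane (role = 1) holding state word v[5] of the *right* chunk
+        // and shuffling under mask 0xFFFF0000; lane 5 holds v[5] of the
+        // *left* chunk under mask 0x0000FFFF.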
// ============== STAGE 3: Block-Reduce ============== - // 32 - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv + // [32] - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv // we will only use warp 0 to handle this thing if (warp_id == 0) { uint32_t cv[8] = {0,0,0,0,0,0,0,0}; - const bool active_lane = (lane_id < parents_count); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("parents count: %d\n", parents_count); + } +#endif + + const bool active_lane = (lane_id < parents_count); // parents_count = CHUNKS_PER_BLOCK / 2 if (active_lane) { - #pragma unroll - for (int j = 0; j < 8; ++j) cv[j] = cv_smem[lane_id][j]; + #pragma unroll + for (int j = 0; j < 8; ++j) + cv[j] = cv_smem[lane_id][j]; } - // 2) warp reduce 32 -> 16 -> 8 -> 4 -> 2 -> 1 - unsigned mask = __ballot_sync(0xFFFFFFFFu, active_lane); + // 2) warp reduce [32] -> 16 -> 8 -> 4 -> 2 -> 1 int cur_n = parents_count; // 当前层的有效节点数(逐层更新) - - for (int step = 1; step < WARP_SIZE; step <<= 1) { + + for (int step = 1; cur_n > 1; step <<= 1) { + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_id < cur_n); // right-neighbor uint32_t nbr[8]; - #pragma unroll + #pragma unroll for (int j = 0; j < 8; ++j) { nbr[j] = __shfl_down_sync(mask, cv[j], step); } @@ -607,22 +642,23 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } // 3) write back to global memory + // now, warp_id == 0 and lane_id == 0 will write to output ptr + // one block compute 32 chunks if (lane_id == 0 && parents_count > 0) { const int tile_id = blockIdx.x; uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B - // two different write ways - #if 0 - #pragma unroll - for (int j = 0; j < 8; ++j) - out[j] = cv[j]; - #else - // block_cvs should be cudaMalloc ed + // block_cvs should be cudaMalloced reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); - #endif } } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("Stage 3 - finish processing\n"); + } +#endif } // blake3_block_reduce_kernel __device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { diff --git a/setup.py b/setup.py index 599e2f0..630e58b 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ def ok(p): CXX_STD = 17 -arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;86;89").split(";") +arch_list = os.environ.get("FLASHASHING_CUDA_ARCH_LIST", "80;89").split(";") NVCC_ARCH_FLAGS = [] for a in arch_list: a = a.strip() @@ -84,18 +84,18 @@ def ok(p): pybind11.get_include(), pybind11.get_include(user=True), CUDA["include"], - CUTLASS_INCLUDE, + CUTLASS_INCLUDE, ] COMMON_LIB_DIRS = [CUDA["libdir"]] COMMON_LIBS = ["cudart"] RPATH = [CUDA["libdir"]] if not sys.platform.startswith("win") else [] -debug = False +debug = os.environ.get("DBG_KERNEL", "0") == "1" CXX_FLAGS = [ f"-std=c++{CXX_STD}", "-O3", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" ] if not debug else [ - f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma" + f"-std=c++{CXX_STD}", "-g", "-O0", "-fPIC", "-Wall", "-Wextra", "-Wpedantic", "-ffast-math", "-march=native", "-mavx2", "-mfma", "-DDBG_KERNEL=1" ] LINK_FLAGS = [] @@ -119,6 +119,7 @@ def ok(p): "--expt-relaxed-constexpr", "--use_fast_math", "-G", "-lineinfo", + "-DDBG_KERNEL=1", ] + NVCC_ARCH_FLAGS if not sys.platform.startswith("win"): From 
1853bafae8b19dfa8a78594a227d5913e3dc88fe Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Tue, 7 Oct 2025 22:03:03 +0800 Subject: [PATCH 12/20] finish big kernel debug --- benchmark/test_gpu.py | 3 + csrc/blake3_sm70_sm80.cu | 257 +++++++++++++++++++++++++++------------ setup.py | 3 +- 3 files changed, 185 insertions(+), 78 deletions(-) diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 33e14a0..40cbb23 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -31,6 +31,9 @@ std_hex = blake3.blake3(cpu.numpy()).hexdigest() print("std BLAKE3 Expected: ", std_hex) + std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() + print("std BLAKE3 1KB: ", std_hex_1KB) + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" print("GPU BLAKE3 result matches CPU result!") diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu index a9f3ec9..ee5dd16 100644 --- a/csrc/blake3_sm70_sm80.cu +++ b/csrc/blake3_sm70_sm80.cu @@ -89,6 +89,23 @@ __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t ds #endif } +__device__ void print_cv(uint32_t cv[8]) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + auto get_byte = [&](int i) { + int w = i >> 2; // 第 i 个字节来自第 w 个 u32 + int off = (i & 3) * 8; // 在该 u32 中的偏移 + return (unsigned)((cv[w] >> off) & 0xFFu); + }; + + printf("block %d root CV (u32, little-endian words):", blockIdx.x); + for (int i = 0; i < 32; ++i) { + printf("%02x", get_byte(i)); + if ((i & 3) == 3) printf(" "); // 每 4 字节空格 + } + printf("\n"); + } +} + __host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B -> shared memory const uint32_t cv[8], // 8×u32 -> shared memory @@ -327,6 +344,80 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row } } +__device__ __noinline__ +void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32 + const uint32_t* __restrict__ R, // 8×u32 + uint32_t* __restrict__ out_cv, // 8×u32 + unsigned mask16) // half-warp masks for 16 lanes +{ + const int lane = threadIdx.x & 31; + const int li = lane & 15; // 0..15 half the warp + const int role = li & 3; + + // messages: the front 8 from L, and the latter 8 from R + const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8]; + + // v initialize + uint32_t v = (li < 8) ? BLAKE3_IV[li] : BLAKE3_IV[li - 8]; + + const uint32_t t0 = 0u; + const uint32_t t1 = 0u; + const uint32_t block_len = 64u; + const uint32_t flags = FLAG_PARENT; + + v ^= (li == 12) ? t0 : 0u; + v ^= (li == 13) ? t1 : 0u; + v ^= (li == 14) ? block_len : 0u; + v ^= (li == 15) ? 
flags : 0u; + + // 与 leaf 相同的“列/对角”两步、共 7 轮 + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; + int gi_diag = (li_diag & 3); + + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // 列步 + { + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + // 对角步 + { + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + v = __shfl_sync(mask16, v_diag_new, li_undo, 16); + } + } + + // state -> CV:cv[i] = v[i] ^ v[i+8] + uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); + uint32_t cv_word = (li < 8) ? (v ^ vip8) : 0; + + // 半 warp 汇聚到 out_cv[0..7](仅 li==0 的 4×收集也可以) + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); + if (li == 0) out_cv[j] = wj; + } +} + __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ uint32_t msg[16]; #pragma unroll @@ -503,11 +594,11 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, const int right_rel = right - base; // 1..32 const bool has_left = (left_rel < pass_valid); const bool has_right = (right_rel < pass_valid); -#if defined(DBG_KERNEL) && DBG_KERNEL - if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { - printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); - } -#endif +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (base == 16 && threadIdx.x % 32 == 31 && blockIdx.x == 0) { +// printf("left %d right %d left_rel %d, right_rel %d, pass_valid %d\n", left, right, left_rel, right_rel, pass_valid); +// } +// #endif // const int lane_id = threadIdx.x & 31; const int sub = lane_id >> 4; // sub-warp: 0 or 1, lane0-15: sub-warp0; lane16-lane31: sub-warp1 @@ -541,35 +632,43 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, cc, out, submask); -#if defined(DBG_KERNEL) && DBG_KERNEL - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("tile %d warp %d ready\n", blockIdx.x, threadIdx.x/32); - } -#endif __syncwarp(full); // make sure two warps written into `tmp_cv` - // now we have compute 2 chunks' cv - // merge it to a parent cv - if (lane_id == 0 && has_left) { +// #if defined(DBG_KERNEL) && DBG_KERNEL +// if (blockIdx.x == 0 && threadIdx.x == 0) printf("The simd16-lane res: \n"); +// print_cv(out); +// #endif + + // now, one warp computes 2 chunks, yield one parent-cv value + const int pair_idx = (base >> 1) + warp_id; // 0, 16 + warp_id + + if (has_left) { const uint32_t* lcv = &tmp_cv[left][0]; - uint32_t parent[8]; - if ((right - base) < pass_valid) { + + if (has_right) { const uint32_t* rcv = &tmp_cv[right][0]; - blake3_parent_cv(lcv, rcv, parent); - } else { // odd: up-flow directly - #pragma unroll - for (int j = 0 ; j < 8; ++j) - parent[j] = lcv[j]; - } - // now, one warp computes 2 chunks, yield one parent-cv value - const int pair_idx = 
(base >> 1) + warp_id; // 0, 16 + warp_id - #pragma unroll - for (int j = 0; j < 8; ++j) - cv_smem[pair_idx][j] = parent[j]; + // sub==0 half warp participate + const unsigned sub0_mask = 0x0000FFFFu & full; + if ((lane_id >> 4) == 0) { + blake3_parent_cv_simd16(lcv, rcv, &cv_smem[pair_idx][0], sub0_mask); + } + // sub==1 do not involve + } else { + // odd: left -> pair_idx + if ((lane_id >> 4) == 0) { + int li = lane_id & 15; + if (li < 8) cv_smem[pair_idx][li] = lcv[li]; + } + } } +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) printf("The 1st chunk merged res: \n"); + print_cv(cv_smem[pair_idx]); +#endif + __syncwarp(full); // NOTICE: this is necessary! }; // do_big_pass @@ -593,72 +692,76 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, __syncthreads(); - // right now, we have got 32 chain values + // right now, we have got CHUNKS_PER_BLOCK / 2 chain values // a warp-reduce to merge. // ============== STAGE 3: Block-Reduce ============== // [32] - 16 - 8 - 4 - 2 - 1, to get a Block-root-cv // we will only use warp 0 to handle this thing - if (warp_id == 0) { - uint32_t cv[8] = {0,0,0,0,0,0,0,0}; - -#if defined(DBG_KERNEL) && DBG_KERNEL - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("parents count: %d\n", parents_count); - } -#endif - - const bool active_lane = (lane_id < parents_count); // parents_count = CHUNKS_PER_BLOCK / 2 - if (active_lane) { - #pragma unroll - for (int j = 0; j < 8; ++j) - cv[j] = cv_smem[lane_id][j]; - } - - // 2) warp reduce [32] -> 16 -> 8 -> 4 -> 2 -> 1 - int cur_n = parents_count; // 当前层的有效节点数(逐层更新) - - for (int step = 1; cur_n > 1; step <<= 1) { - unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_id < cur_n); - // right-neighbor - uint32_t nbr[8]; - #pragma unroll - for (int j = 0; j < 8; ++j) { - nbr[j] = __shfl_down_sync(mask, cv[j], step); + // the `parent_count` is the indicator + // for 256-8warps-32chunks, the parent count is 16 + if (parents_count > 0) { + // half-warp info + const int sub = lane_id >> 4; // 0/1 -> which sub warp it belong to + const int li = lane_id & 15; // 0..15 + const unsigned full = __activemask(); + const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + + int cur_n = parents_count; + + while (cur_n > 1) { + int pairs = cur_n >> 1; // the merge num this layer will do, for 16 parents, this start at 8 + // warp 0, 0-15: 0 + // warp 0, 16-31: 1 + // warp 1, 0-15: 2 + // warp 1, 16-31: 3 + // .. + // warp 7, 0-15: 14 + // warp 7, 16-31: 15 + int half_id = warp_id * 2 + sub; // half-warp id + + // each half-warp handle one pair: (left=2*half_id, right=left+1) + // only half warps will participate in + if (half_id < pairs) { + int left_idx = (half_id << 1); + int right_idx = left_idx + 1; + const uint32_t* L = &cv_smem[left_idx][0]; + const uint32_t* R = &cv_smem[right_idx][0]; + + // 半 warp SIMD16 合并:结果就地写回 left_idx + blake3_parent_cv_simd16(L, R, &cv_smem[left_idx][0], submask); } - // safety checking - const bool do_pair = - (lane_id % (step << 1) == 0) && // 左侧 - (lane_id + step < cur_n) && // 右侧在当前层有效范围内 - (lane_id < cur_n); // 左侧也必须有效 + __syncthreads(); // make sure all warps could see. 
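+
+            // (Illustrative host-side sketch of the fold this loop performs;
+            //  not from the original patch. `parent_cv` stands in for
+            //  blake3_parent_cv and `n` for parents_count. Note the sketch
+            //  compacts each level's results to indices 0..pairs-1, which is
+            //  the layout the next level's (2*half_id, 2*half_id+1) reads assume.)
+            //
+            //   void tree_fold(uint32_t cv[][8], int n) {
+            //       while (n > 1) {
+            //           int pairs = n >> 1;
+            //           for (int p = 0; p < pairs; ++p)        // merge neighbors
+            //               parent_cv(cv[2*p], cv[2*p+1], cv[p]);
+            //           if (n & 1)                             // odd: promote last
+            //               memcpy(cv[pairs], cv[n-1], 32);    // 8 x u32 = 32 B
+            //           n = pairs + (n & 1);
+            //       }                                          // root in cv[0]
+            //   }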
- if (do_pair) { - blake3_parent_cv(cv, nbr, cv); // parent(left, right) -> cv + // odd: up-flow + if (cur_n & 1) { + // 让 warp0 的 sub0 来搬运(li < 8) + if (warp_id == 0 && sub == 0 && li < 8) { + cv_smem[pairs][li] = cv_smem[cur_n - 1][li]; + } } - cur_n = (cur_n + 1) >> 1; - __syncwarp(mask); - } + __syncthreads(); // next - level - // 3) write back to global memory - // now, warp_id == 0 and lane_id == 0 will write to output ptr - // one block compute 32 chunks - if (lane_id == 0 && parents_count > 0) { - const int tile_id = blockIdx.x; - uint32_t* out = block_cvs + (size_t)tile_id * 8; // 8 x 4 = 32 B + cur_n = pairs + (cur_n & 1); + } - // block_cvs should be cudaMalloced - reinterpret_cast(out)[0] = make_uint4(cv[0],cv[1],cv[2],cv[3]); - reinterpret_cast(out)[1] = make_uint4(cv[4],cv[5],cv[6],cv[7]); + // write out + if (warp_id == 0 && lane_id == 0) { + uint32_t* out = block_cvs + (size_t)blockIdx.x * 8; +#if defined(DBG_KERNEL) && DBG_KERNEL + print_cv(cv_smem[0]); +#endif + reinterpret_cast(out)[0] = make_uint4(cv_smem[0][0], cv_smem[0][1], cv_smem[0][2], cv_smem[0][3]); + reinterpret_cast(out)[1] = make_uint4(cv_smem[0][4], cv_smem[0][5], cv_smem[0][6], cv_smem[0][7]); } } #if defined(DBG_KERNEL) && DBG_KERNEL - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Stage 3 - finish processing\n"); - } + printf("================================ Finishing all in big kernel! ================================\n"); #endif + } // blake3_block_reduce_kernel __device__ __forceinline__ void load_cv_g2r(const uint32_t* g, uint32_t r[8]) { @@ -916,7 +1019,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data, constexpr int NUM_WARPS = (NUM_THREADS + WARP_SIZE - 1) / WARP_SIZE; // 16 or 8 const int chunk_len_bytes = CHUNK_SIZE; // 1 KiB per chunk const uint64_t total_chunks = bytes_len / CHUNK_SIZE; - const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks + const int num_blocks = (int)((total_chunks + CHUNKS_PER_BLOCK - 1) / CHUNKS_PER_BLOCK); // 16384 blocks, 32768 for 32-size constexpr int pad_chunk = 16; constexpr int pad_cv = 0; @@ -932,7 +1035,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data, // here we cut the largest bottleneck, do not allocate gpu memory here, do it in pytorch. 
pytorch.
// TODO: use thrust - cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB + cudaMalloc(&d_blockCV, (size_t)num_blocks * 8u * sizeof(uint32_t)); // 512 KiB, 1M for 32-size // ============= launch big kernel ============= dim3 grid_big(num_blocks); diff --git a/setup.py b/setup.py index 630e58b..a45ea14 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,8 @@ def ok(p): "-g", "-O0", "-Xcompiler", "-fPIC", "--expt-relaxed-constexpr", "--use_fast_math", - "-G", "-lineinfo", + # "-G", + "-lineinfo", "-DDBG_KERNEL=1", ] + NVCC_ARCH_FLAGS From f802823de022d734d437512750a119a3916cec45 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Wed, 8 Oct 2025 17:22:35 +0800 Subject: [PATCH 13/20] debug tiny kernel --- README.md | 3 +- benchmark/perf.txt | 41 ++++++ benchmark/test_gpu.py | 9 +- csrc/blake3_sm70_sm80.cu | 290 ++++++++++++++++++++++++--------------- 4 files changed, 224 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 0a6cf25..88cb978 100644 --- a/README.md +++ b/README.md @@ -34,4 +34,5 @@ python benchmark/test_script.py + 10.5 - v2 - [commit:2e71051367a9533af36f6c54a46876e20bc0bab5]: 2044.49 MiB/s + 10.6 - v3 - [commit:329d5425de9558e9e43216c2b89bed43674b7d83]: 201822.66 MiB/s (False Result) + 10.6 - v4 - [commit:9781ad9759d13e0e482162d57c68b65c946f4ff9]: 17069.23 MiB/s -+ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s \ No newline at end of file ++ 10.6 - v5 - [commit:4be8258f5e82aab4e57e8b70a604ebb9361d8aa0]: 34261.37 MiB/s ++ 10.8 - v6 - [commit:]: 15145.81 MiB/s \ No newline at end of file diff --git a/benchmark/perf.txt b/benchmark/perf.txt index 2920e89..d48f165 100644 --- a/benchmark/perf.txt +++ b/benchmark/perf.txt @@ -1,2 +1,43 @@ 10.7 V100: 54819.25 MiB/s RTX 4090: 150585.83 MiB/s + +(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py +Stage 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967 +Stage 2 - pass 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b +Stage 2 - pass 2 finish processing, pass 1: 16 +block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096 +================================ Finishing all in big kernel! ================================ +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552 +GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552 +std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b +std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236 +Traceback (most recent call last): + File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" + ^^^^^^^^^^^^^^^^^ +AssertionError: GPU BLAKE3 result does not match CPU result! 
+(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py +Stage 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967 +Stage 2 - pass 1 finish processing +The 1st chunk merged res: +block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b +Stage 2 - pass 2 finish processing, pass 1: 16 +block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096 +================================ Finishing all in big kernel! ================================ +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a +Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552 +GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552 +std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b +std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236 +Traceback (most recent call last): + File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in + assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" + ^^^^^^^^^^^^^^^^^ +AssertionError: GPU BLAKE3 result does not match CPU result! \ No newline at end of file diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index 40cbb23..f0696b2 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -4,8 +4,8 @@ import time import blake3 -check_accuracy = True -check_perf = False +check_accuracy = False +check_perf = True GiB = 1024*1024*1024 # bytes -> 1 GiB @@ -31,8 +31,8 @@ std_hex = blake3.blake3(cpu.numpy()).hexdigest() print("std BLAKE3 Expected: ", std_hex) - std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() - print("std BLAKE3 1KB: ", std_hex_1KB) + # std_hex_1KB = blake3.blake3(cpu[:1024].numpy()).hexdigest() + # print("std BLAKE3 1KB: ", std_hex_1KB) assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!" 
print("GPU BLAKE3 result matches CPU result!") @@ -51,7 +51,6 @@ torch.cuda.synchronize() t1 = time.perf_counter() - t1 = time.perf_counter() elapsed = t1 - t0 print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds") print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s") diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu index ee5dd16..2911af3 100644 --- a/csrc/blake3_sm70_sm80.cu +++ b/csrc/blake3_sm70_sm80.cu @@ -1,5 +1,5 @@ -#include "cute/numeric/int.hpp" +#include #include #include #include @@ -29,6 +29,17 @@ } \ } while(0) +#define G(a,b,c,d, x, y) \ + do { \ + (a) = (a) + (b) + (x); \ + (d) = rotr32((d) ^ (a),16); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c),12); \ + (a) = (a) + (b) + (y); \ + (d) = rotr32((d) ^ (a), 8); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c), 7); \ + } while (0) using namespace cute; @@ -106,42 +117,70 @@ __device__ void print_cv(uint32_t cv[8]) { } } +__constant__ __device__ int B3_PERMUTE[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8 +}; + +// the actually right compress 7r in a single lane function __host__ __device__ void blake3_compress_words_7r( const uint32_t block_words[16], // 64B -> shared memory const uint32_t cv[8], // 8×u32 -> shared memory uint64_t chunk_counter, // 64-bit uint32_t block_len, // [0..64] uint32_t flags, // CHUNK_START/END/PARENT/ROOT… - uint32_t out_state[16]) // 返回 16×u32 状态向量(按规范) + uint32_t out_state[16]) // output { - // TODO: 根据 BLAKE3(基于 BLAKE2s)的 G/round 实现7轮 - // 这里先给占位:将 IV+cv 混合到 out_state,真实实现请替换 -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[i] = cv[i]; -#pragma unroll - for (int i = 0; i < 8; ++i) - out_state[8+i] = BLAKE3_IV[i]; + // 1) initialize v + uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; + uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; - out_state[12] ^= (uint32_t)chunk_counter; - out_state[13] ^= (uint32_t)(chunk_counter >> 32); - out_state[14] ^= block_len; - out_state[15] ^= flags; + uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3]; + uint32_t v12=BLAKE3_IV[4], v13=BLAKE3_IV[5], v14=BLAKE3_IV[6], v15=BLAKE3_IV[7]; - // so far, the block_words are still pointers. 
- // now we load it into kernel, as pointed out by ncu profile - uint32_t block_reg_1[4]; + // injection + v12 ^= (uint32_t)chunk_counter; + v13 ^= (uint32_t)(chunk_counter >> 32); + v14 ^= block_len; + v15 ^= flags; -#pragma unroll - for (int i = 0; i < 16; i += 4) { // the gap is 4 - // load_u128_u32x4(block_words + i, block_reg_1); - out_state[i] ^= block_words[i]; - // 做一点点搅动(占位) - out_state[i] = rotr32(out_state[i] + 0x9E3779B9u, (i*7)&31); + // 2) 7 轮 + int perm[16]; // 每轮的消息索引 + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = i; + + #pragma unroll + for (int r=0; r < 7; ++r) { + // col-step + G(v0, v4, v8, v12, block_words[perm[0]], block_words[perm[1]]); + G(v1, v5, v9, v13, block_words[perm[2]], block_words[perm[3]]); + G(v2, v6, v10,v14, block_words[perm[4]], block_words[perm[5]]); + G(v3, v7, v11,v15, block_words[perm[6]], block_words[perm[7]]); + + // diag-step + G(v0, v5, v10,v15, block_words[perm[8]], block_words[perm[9]]); + G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]); + G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]); + G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]); + + // perm = perm ∘ PERMUTE + int np[16]; + #pragma unroll + for (int i = 0; i < 16; ++i) + np[i] = perm[B3_PERMUTE[i]]; + #pragma unroll + for (int i = 0; i < 16; ++i) + perm[i] = np[i]; } + + // 3) write to out state + out_state[ 0]=v0; out_state[ 1]=v1; out_state[ 2]=v2; out_state[ 3]=v3; + out_state[ 4]=v4; out_state[ 5]=v5; out_state[ 6]=v6; out_state[ 7]=v7; + out_state[ 8]=v8; out_state[ 9]=v9; out_state[10]=v10; out_state[11]=v11; + out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15; } -// 从 out_state 派生新的 CV(按规范应取 state[0..7] ^ state[8..15]) +// from out_state yields CV __host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ #pragma unroll for (int i = 0; i < 8; ++i) @@ -429,7 +468,7 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3 msg[8+i] = R[i]; } uint32_t st[16]; - blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, (1u<<2), st); + blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st); blake3_state_to_cv(st, out_cv); } @@ -759,7 +798,8 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input, } #if defined(DBG_KERNEL) && DBG_KERNEL - printf("================================ Finishing all in big kernel! ================================\n"); + if (blockIdx.x == 0 && threadIdx.x == 0) + printf("================================ Finishing all in big kernel! 
================================\n");
 #endif
 
 } // blake3_block_reduce_kernel
@@ -778,66 +818,72 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
 }
 
 // ============ Tiny kernel ============
-// In the big kernel, each block consumes 64 KiB
-// For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / 64 roots = 16384 roots
-// And this tiny kernel is designed to process these 16384 roots
-template <int NUM_THREADS, int TILE_CVS, int PAD>
+// In the big kernel, each block consumes 64 or 32 KiB [CHUNKS_PER_BLOCK].
+// For a 1 GiB corpus it produces 1 x 1024 x 1024 / CHUNKS_PER_BLOCK = 16384 or 32768 roots,
+// and this tiny kernel is designed to process those 16384 or 32768 roots.
+// One chain value takes u32 x 8 = 32 B, so
+// 32 KB of shared memory holds 1K chain values and
+// 16 KB of shmem holds 512 chain values.
+template <int NUM_THREADS, int TILE_CVS, int PAD>
 __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv32,
                                               uint32_t* __restrict__ out_cv32,
                                               int N)
 {
-    extern __shared__ __align__(16) uint32_t smem[]; // dynamic SMEM; needs >= TILE_CVS*8*4 bytes
-    // viewed as 2D: [TILE_CVS][8+PAD]
+    extern __shared__ __align__(16) uint32_t smem[]; // dyn SMEM >= TILE_CVS*32B
+    // regarded as a 2D dyn shmem: [TILE_CVS][8+PAD]
     uint32_t* cv_tile = smem;
 
     const int tid = threadIdx.x;
-    const int warp_id = tid / WARP_SIZE; // 0..15
+    constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+    const int warp_id = tid / WARP_SIZE; // 0..NUM_WARPS-1 (8 or 16 warps)
     const int lane_id = tid % WARP_SIZE; // 0..31
 
-    // start of the slice this block is responsible for
+    // the start of this block;
+    // each block processes TILE_CVS roots.
     const int tile_start = blockIdx.x * TILE_CVS;
     if (tile_start >= N) return;
-    // when N == 8, this is just 8
-    const int tile_n = min(TILE_CVS, N - tile_start); // actual CV count of this slice (<=2048)
+    const int tile_n = min(TILE_CVS, N - tile_start); // actual CV count of this block
 
-    // ---------------- Stage 1: coalesced loading into SMEM ----------------
-    // each thread copies several CVs: i = tid, tid+blockDim, ...
+    // ---------------- Stage 1: coalesced loading to SMEM ----------------
+    // 8 KB per pass, [tile_n/NUM_THREADS] passes in total.
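+    // (Illustrative walk-through, assuming the NUM_THREADS = 256,
+    //  TILE_CVS = 1024, PAD = 0 instantiation used by the host launcher
+    //  below; not part of the original patch.)
+    // Thread `tid` copies CVs i = tid, tid+256, tid+512, tid+768 (4 x 32 B).
+    // Lane i's two uint4 reads sit at byte offsets 32*i and 32*i + 16, so a
+    // warp's g4[0] and g4[1] accesses interleave to cover one contiguous
+    // 1 KiB span of in_cv32, i.e. fully utilized 128 B sectors.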
+    for (int i = tid; i < tile_n; i += NUM_THREADS) { // note: i starts at tid, not 0
         const uint32_t* g = in_cv32 + (size_t)(tile_start + i) * 8;
         uint32_t* s = cv_tile + (size_t)i * (8 + PAD);
 
-        // two 16 B loads
+        // 16 B x 2
         const uint4* g4 = reinterpret_cast<const uint4*>(g);
         uint4* s4 = reinterpret_cast<uint4*>(s);
         // s4[0] = g4[0];
         // s4[1] = g4[1];
         // in case the address is not aligned
-        uint4 v0 = g4[0];
-        uint4 v1 = g4[1];
+        uint4 v0 = g4[0]; // each thread loads 16 B, 256 x 16 = 4 KB
+        uint4 v1 = g4[1]; // each thread loads 16 B, 256 x 16 = 4 KB
         s[0] = v0.x; s[1] = v0.y; s[2] = v0.z; s[3] = v0.w;
         s[4] = v1.x; s[5] = v1.y; s[6] = v1.z; s[7] = v1.w;
     }
-    // no need to zero the tail when tile_n < TILE_CVS; later stages respect the valid range
     __syncthreads();
 
-    // ---------------- Stage 2: per-thread 4→1 (keep adjacent pairing) ----------------
-    // there are reduced_n0 = ceil(tile_n / 4) lane-roots in total
-    const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // equivalent to (tile_n+3)/4
-    uint32_t lane_cv[8]; // the lane-root this thread outputs
+    // ---------------- Stage 2: each thread merges 4 → 1 (keeping neighbor order) ----------------
+    // reduced_n0 = ceil(tile_n / 4) lane-roots
+    const int reduced_n0 = (tile_n + 3) >> 1 >> 1; // == (tile_n + 3) / 4
+    uint32_t lane_cv[8]; // this thread's 4->1 root
     bool lane_valid = false;
 
-    // start index of this thread's 4 inputs
+    // start index
     int base4 = tid << 2; // tid*4
     if (base4 < tile_n) {
-        // read up to 4 adjacent CVs: idx = base4 + 0,1,2,3
+        // up to 4 neighboring CVs: idx = base4 + 0,1,2,3
         uint32_t a[8], b[8], c[8], d[8];
+
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
         load_cv_g2r(s0, a);
 
         int remain = tile_n - base4;
+        // Yazhu: the remain branches below add extra branch-prediction overhead
+
         if (remain >= 2) {
             const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
             load_cv_g2r(s1, b);
@@ -851,16 +897,16 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
             load_cv_g2r(s3, d);
         }
 
-        // two levels of adjacent pairing (the odd one is promoted)
+        // merge the neighbors
         if (remain == 1) {
             #pragma unroll
             for (int j = 0; j < 8; ++j)
                 lane_cv[j] = a[j];
         } else if (remain == 2) {
-            blake3_parent_cv(a, b, lane_cv);
+            blake3_parent_cv(a, b, lane_cv); // write to lane_cv directly
         } else if (remain == 3) {
             uint32_t p01[8];
-            blake3_parent_cv(a, b, p01);
+            blake3_parent_cv(a, b, p01); // one buffer
             blake3_parent_cv(p01, c, lane_cv); // (0,1)->p01,(p01,c)->lane_cv
         } else { // remain >= 4
             uint32_t p01[8], p23[8];
@@ -871,28 +917,25 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         lane_valid = true;
     }
 
-    // ---------------- Stage 3: 32→1 adjacent pairing within a warp ----------------
-    // each warp owns one contiguous segment: warp_base = warp_id*32
+    // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ----------------
     const int warp_base = warp_id * WARP_SIZE;
-    const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // valid count in this warp's segment
+    const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number
 
-    // keep lane_cv in registers for the reduction; mask out invalid lanes
-    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // existence check only
+    // this will introduce extra branch-prediction overhead
+    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4));
     int cur_n = cur_n_w;
 
-    // mark out-of-segment threads invalid (avoids out-of-bounds reads)
     bool active_lane = (lane_id < cur_n_w);
-    // zero the values of invalid lanes (they are never used)
     if (!active_lane) {
         #pragma unroll
         for (int j = 0; j < 8; ++j)
             lane_cv[j] = 0u;
     }
 
-    // pair level by level: 1,2,4,8,16 - warp-reduce
+    // step = 1,2,4,8,16 - warp-reduce
     for (int step = 1; step < WARP_SIZE; step <<= 1) {
-        // fetch the right neighbor
+        // right-neighbor
         uint32_t nbr[8];
         #pragma unroll
         for (int j = 0; j < 8; ++j)
@@ -911,43 +954,62 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         // __syncwarp();
     }
 
-    // this stage's result sits in lane0; write the 16 warp-roots into the first 16 SMEM rows
-    __shared__ uint32_t warp_roots[WARP_SIZE/2][8]; // 16×8
+    // lane0: the NUM_WARPS warp-roots are written to SMEM
+    // e.g. if this block has 8 warps, there will be 8 warp-roots
+    __shared__ uint32_t warp_roots[NUM_WARPS][8]; // NUM_WARPS × 8
     if (lane_id == 0 && cur_n_w > 0) {
         #pragma unroll
-        for (int j=0;j<8;++j)
+        for (int j = 0; j < 8; ++j)
             warp_roots[warp_id][j] = lane_cv[j];
     }
     __syncthreads();
 
-    // ---------------- Stage 4: 16→1 adjacent pairing within the CTA ----------------
-    // valid warp count: ceil(reduced_n0 / 32)
-    int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..16
+    // ---------------- Stage 4: CTA's NUM_WARPS → 1 block reduce ----------------
+    int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..NUM_WARPS
     if (valid_warps == 0) return;
-    // lane 0 of each warp does the computation
-    // lane0 computes while the other lanes idle
-    for (int stride = (valid_warps >> 1); stride >= 1; stride >>= 1) {
-        if (warp_id < stride && lane_id == 0) {
-            uint32_t p[8];
-            blake3_parent_cv(&warp_roots[2*warp_id][0],
-                             &warp_roots[2*warp_id + 1][0], p);
-            #pragma unroll
-            for (int j = 0; j < 8; ++j)
-                warp_roots[warp_id][j] = p[j];
+
+    // 16 lanes compute each merge together
+    const int sub = lane_id >> 4;  // 0/1
+    const int li  = lane_id & 15;  // 0..15
+    const unsigned full    = __activemask();
+    const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full;
+    cur_n = NUM_WARPS;
+
+    while (cur_n > 1) {
+        const int pairs   = cur_n >> 1;        // pair count at this level
+        const int half_id = warp_id * 2 + sub; // half-warp index
+
+        if (half_id < pairs) {
+            const int left_idx  = (half_id << 1);
+            const int right_idx = left_idx + 1;
+
+            const uint32_t* L = &warp_roots[left_idx][0];
+            const uint32_t* R = &warp_roots[right_idx][0];
+
+            blake3_parent_cv_simd16(L, R, &warp_roots[left_idx][0], submask);
         }
+
         __syncthreads();
-        // the odd one is promoted
-        if ((valid_warps & 1) && warp_id==0 && lane_id==0) {
-            #pragma unroll
-            for (int j=0;j<8;++j)
-                warp_roots[stride][j] = warp_roots[valid_warps-1][j];
+
+        // odd count: promote the last root
+        if (cur_n & 1) {
+            if (warp_id == 0 && sub == 0 && li < 8) {
+                warp_roots[pairs][li] = warp_roots[cur_n - 1][li];
+            }
         }
-        __syncthreads();
-        valid_warps = (valid_warps + 1) >> 1;
+
+        __syncthreads(); // next level
+        cur_n = pairs + (cur_n & 1);
     }
+#if defined(DBG_KERNEL) && DBG_KERNEL
+    if (tid == 0 && blockIdx.x == 0) {
+        printf("Block %d root CV for tiny kernel:", blockIdx.x);
+        print_cv(warp_roots[0]);
    }
+#endif
 
-    // write back this block's root
+    // ---------------- Stage 5: write to output ----------------
     if (threadIdx.x == 0) {
         uint32_t* out = out_cv32 + (size_t)blockIdx.x * 8;
         #pragma unroll
@@ -1000,17 +1062,6 @@
     int optin = 0, deflt = 0;
     cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
     cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
-
-    const int dyn_smem = 64 * 1024; // 64 KiB
-
-    // raise the kernels' dynamic-shmem caps ahead of launch
-    CUDA_CHECK(cudaFuncSetAttribute(
-        blake3_cv_block_reduce_kernel<512, 2048, 0>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
-    CUDA_CHECK(cudaFuncSetAttribute(
-        blake3_cv_block_reduce_kernel<32, 2048, 0>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
-
     constexpr int CHUNKS_PER_BLOCK = 32; // 16 * 32 = 512
     constexpr int CHUNK_SIZE = 1024;     // 1 KiB
@@ -1074,19 +1125,26 @@
     }
 
     // the first round of tiny kernel
-    // 1) 16384 outputs reduce -> 8
+    // 1) 16384 or 32768 outputs reduce -> 16 or 32
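+    // (Worked example, illustrative: with CHUNKS_PER_BLOCK = 32 on a 1 GiB
+    //  input there are 1 Mi chunks / 32 = 32768 block CVs; round 1 reduces
+    //  them by TILE = 1024 to 32768/1024 = 32 mid CVs, and round 2 folds
+    //  those 32 down to the single root CV.)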
+    uint32_t* d_mid_out = nullptr; // grid × 8 u32
     {
-        const int N = 16384; // total number
-        const int TILE = 2048;
-        const int grid = (N + TILE - 1) / TILE; // = 8
-        const int block = 512;
-        const size_t smem_bytes = (size_t)(TILE) * 8u * sizeof(uint32_t); // 2048×8×4 = 64 KiB
-
-        cudaMalloc(&d_mid_out, (size_t)8 * 8u * sizeof(uint32_t));
-
-        blake3_cv_block_reduce_kernel<512, 2048, 0>
-            <<<grid, block, smem_bytes, stream>>>(d_blockCV /*in: 16384 × 8 u32*/,
+        const int dyn_smem = 32 * 1024; // 32 KiB
+        const int N = num_blocks; // total root-cv count
+        constexpr int TILE = 1024;
+        const int grid = (N + TILE - 1) / TILE; // 16384/1024 = 16, or 32768/1024 = 32
+        constexpr int NUM_THREADS = 256;
+        const size_t smem_bytes = (size_t)(TILE) * (8u + pad_cv) * sizeof(uint32_t); // 1024 x 32 B = 32 KiB
+
+        // raise the kernel's dynamic-shmem cap before launch
+        CUDA_CHECK(cudaFuncSetAttribute(
+            blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, pad_cv>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+
+        cudaMalloc(&d_mid_out, (size_t)grid * 8u * sizeof(uint32_t));
+
+        blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, pad_cv>
+            <<<grid, NUM_THREADS, smem_bytes, stream>>>(d_blockCV /*in: num_blocks × 8 u32*/,
                                                   d_mid_out /*out: grid × 8 u32*/, N);
         CUDA_CHECK(cudaGetLastError());
         CUDA_CHECK(cudaDeviceSynchronize());
@@ -1095,16 +1153,22 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
     // second round
     uint32_t* d_root_cv = nullptr;
     {
-        const int N = 8;
-        const int TILE = 2048; // anything >= N works
-        const int grid = 1;
-        const int block = 32; // 32 threads are enough
+        const int dyn_smem = 1024; // 1 KiB is enough
+        constexpr int TILE = 1024; // any value >= N works
+        const int N = (num_blocks + TILE - 1) / TILE; // 16 or 32
+        constexpr int grid = 1;
+        constexpr int NUM_THREADS = 32; // 32 threads are enough
         const size_t smem_bytes = (size_t)(N) * 8u * sizeof(uint32_t); // N x 32 B
 
+        // raise the kernel's dynamic-shmem cap before launch
+        CUDA_CHECK(cudaFuncSetAttribute(
+            blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, 0>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize, dyn_smem));
+
         cudaMalloc(&d_root_cv, (size_t)1 * 8u * sizeof(uint32_t));
 
-        blake3_cv_block_reduce_kernel<32, 2048, 0>
-            <<<grid, block, smem_bytes, stream>>>(d_mid_out /*in: 8×8*/,
+        blake3_cv_block_reduce_kernel<NUM_THREADS, TILE, 0>
+            <<<grid, NUM_THREADS, smem_bytes, stream>>>(d_mid_out /*in: N × 8 u32*/,
                                                d_root_cv /*out: 1×8*/, N);
         CUDA_CHECK(cudaGetLastError());
         CUDA_CHECK(cudaDeviceSynchronize());

From cd2a2386314d4868b1deb1972912b901a873a118 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Wed, 8 Oct 2025 17:30:44 +0800
Subject: [PATCH 14/20] debug whole gpu kernel 1

---
 csrc/blake3_sm70_sm80.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 2911af3..81addad 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -481,7 +481,7 @@ template 32 KB const int PAD_CHUNK=16, const int PAD_CV=0> // pad shared memory
-__global__ void blake3_block_reduce_kernel(uint32_t* d_input,
+__global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
                                            uint32_t* block_cvs,
                                            int chunk_len_bytes,
                                            uint64_t base_chunk_counter,
@@ -596,6 +596,7 @@ __global__ void blake3_block_reduce_kernel(uint32_t* d_input,
                 *reinterpret_cast<uint128_t*>(raw_pointer_cast(&tCs(i))) = uint128_t(0, 0);
             }
         }
+        __syncwarp(); // inner-warp sync
     }
 
     __syncthreads(); // sync all warps
@@ -1080,7 +1081,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
         cudaFuncAttributePreferredSharedMemoryCarveout, 100));
 
     uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
-    uint32_t* d_words = reinterpret_cast<uint32_t*>(d_bytes); // alias
+    const uint32_t* d_words = reinterpret_cast<const uint32_t*>(d_bytes); // alias
 
     uint32_t* d_blockCV = nullptr; // num_blocks × 8 u32
     // here we cut the largest bottleneck, do not allocate gpu memory here, do it in
pytorch. From 5d617e044dbf090e4067ad2daea79663da88a456 Mon Sep 17 00:00:00 2001 From: l1cacheDell Date: Wed, 8 Oct 2025 23:07:19 +0800 Subject: [PATCH 15/20] reconstruct proj --- benchmark/test_gpu.py | 5 +- csrc/blake3_sm70_sm80.cu | 435 +-------------------------------------- csrc/utils.cuh | 433 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 440 insertions(+), 433 deletions(-) create mode 100644 csrc/utils.cuh diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py index f0696b2..50a7422 100644 --- a/benchmark/test_gpu.py +++ b/benchmark/test_gpu.py @@ -41,13 +41,16 @@ # 2) 预热,触发 JIT/驱动初始化,避免首轮偏慢 for _ in range(2): fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + torch.cuda.synchronize() + print("\n") torch.cuda.synchronize() # 3) 正式计时:这测的是“端到端吞吐”(包含 H2D) - repeat = 5 # 1GiB × 5 已经很重,按机器调整 + repeat = 2 # 1GiB × 5 已经很重,按机器调整 t0 = time.perf_counter() for _ in range(repeat): cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream) + print("\n") torch.cuda.synchronize() t1 = time.perf_counter() diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu index 81addad..9365f7d 100644 --- a/csrc/blake3_sm70_sm80.cu +++ b/csrc/blake3_sm70_sm80.cu @@ -17,6 +17,8 @@ #include #include +#include "utils.cuh" + #define WARP_SIZE 32 #define LDST128BITS(value) (reinterpret_cast(&(value))[0]) @@ -29,17 +31,7 @@ } \ } while(0) -#define G(a,b,c,d, x, y) \ - do { \ - (a) = (a) + (b) + (x); \ - (d) = rotr32((d) ^ (a),16); \ - (c) = (c) + (d); \ - (b) = rotr32((b) ^ (c),12); \ - (a) = (a) + (b) + (y); \ - (d) = rotr32((d) ^ (a), 8); \ - (c) = (c) + (d); \ - (b) = rotr32((b) ^ (c), 7); \ - } while (0) + using namespace cute; @@ -53,426 +45,6 @@ using vec_t = cute::uint128_t; // one time loading 16 B using Atom = cute::Copy_Atom, vec_t>; #endif -__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { - 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, - 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u -}; - -enum : uint32_t { - FLAG_CHUNK_START = 1u << 0, - FLAG_CHUNK_END = 1u << 1, - FLAG_PARENT = 1u << 2, - FLAG_ROOT = 1u << 3, - FLAG_KEYED_HASH = 1u << 4, - FLAG_DERIVE_KEY_CONTEXT = 1u << 5, - FLAG_DERIVE_KEY_MATERIAL= 1u << 6, -}; - -__device__ __noinline__ -uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { - uint32_t f = 0; - f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; - f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; - if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root - return f; -} - -__device__ __forceinline__ -uint32_t blake3_parent_flags(bool is_root_parent) { - return FLAG_PARENT | (is_root_parent ? 
FLAG_ROOT : 0); -} - -// ---- 小工具 ---- -__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { -#if defined(__CUDA_ARCH__) - return __funnelshift_r(x, x, n); -#else - return (x >> n) | (x << (32 - n)); // host 路径 -#endif -} - -__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { -#if defined(__CUDA_ARCH__) - const uint4 v = *reinterpret_cast(src); - dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; -#else - std::memcpy(dst, src, 16); -#endif -} - -__device__ void print_cv(uint32_t cv[8]) { - if (blockIdx.x == 0 && threadIdx.x == 0) { - auto get_byte = [&](int i) { - int w = i >> 2; // 第 i 个字节来自第 w 个 u32 - int off = (i & 3) * 8; // 在该 u32 中的偏移 - return (unsigned)((cv[w] >> off) & 0xFFu); - }; - - printf("block %d root CV (u32, little-endian words):", blockIdx.x); - for (int i = 0; i < 32; ++i) { - printf("%02x", get_byte(i)); - if ((i & 3) == 3) printf(" "); // 每 4 字节空格 - } - printf("\n"); - } -} - -__constant__ __device__ int B3_PERMUTE[16] = { - 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8 -}; - -// the actually right compress 7r in a single lane function -__host__ __device__ void blake3_compress_words_7r( - const uint32_t block_words[16], // 64B -> shared memory - const uint32_t cv[8], // 8×u32 -> shared memory - uint64_t chunk_counter, // 64-bit - uint32_t block_len, // [0..64] - uint32_t flags, // CHUNK_START/END/PARENT/ROOT… - uint32_t out_state[16]) // output -{ - // 1) initialize v - uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3]; - uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7]; - - uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3]; - uint32_t v12=BLAKE3_IV[4], v13=BLAKE3_IV[5], v14=BLAKE3_IV[6], v15=BLAKE3_IV[7]; - - // injection - v12 ^= (uint32_t)chunk_counter; - v13 ^= (uint32_t)(chunk_counter >> 32); - v14 ^= block_len; - v15 ^= flags; - - // 2) 7 轮 - int perm[16]; // 每轮的消息索引 - #pragma unroll - for (int i = 0; i < 16; ++i) - perm[i] = i; - - #pragma unroll - for (int r=0; r < 7; ++r) { - // col-step - G(v0, v4, v8, v12, block_words[perm[0]], block_words[perm[1]]); - G(v1, v5, v9, v13, block_words[perm[2]], block_words[perm[3]]); - G(v2, v6, v10,v14, block_words[perm[4]], block_words[perm[5]]); - G(v3, v7, v11,v15, block_words[perm[6]], block_words[perm[7]]); - - // diag-step - G(v0, v5, v10,v15, block_words[perm[8]], block_words[perm[9]]); - G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]); - G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]); - G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]); - - // perm = perm ∘ PERMUTE - int np[16]; - #pragma unroll - for (int i = 0; i < 16; ++i) - np[i] = perm[B3_PERMUTE[i]]; - #pragma unroll - for (int i = 0; i < 16; ++i) - perm[i] = np[i]; - } - - // 3) write to out state - out_state[ 0]=v0; out_state[ 1]=v1; out_state[ 2]=v2; out_state[ 3]=v3; - out_state[ 4]=v4; out_state[ 5]=v5; out_state[ 6]=v6; out_state[ 7]=v7; - out_state[ 8]=v8; out_state[ 9]=v9; out_state[10]=v10; out_state[11]=v11; - out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15; -} - -// from out_state yields CV -__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){ -#pragma unroll - for (int i = 0; i < 8; ++i) - out_cv[i] = st[i] ^ st[8+i]; -} - -// swap-table -// BLAKE3 message schedule: rows are P^r, r=0..6. -// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. 
(see §2.2) -// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf -__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { - // r = 0: identity - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - // r = 1: P - { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, - // r = 2: P∘P - { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, - // r = 3 - { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, - // r = 4 - { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, - // r = 5 - { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, - // r = 6 - { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, -}; - -// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k -__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { - int src = B3_MSG_SCHEDULE[round][k]; - return __shfl_sync(mask16, m_lane, src, 16); -} - -__device__ __noinline__ -uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, - uint32_t mx, uint32_t my, int role) -{ - // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 - uint32_t a = v_self, b = v_b, c = v_c, d = v_d; - - // a = a + b + mx; - // d ^= a; - // d >>>= 16 - a = a + b + mx; - d ^= a; - d = rotr32(d, 16); - - // c = c + d; - // b ^= c; - // b >>>= 12 - c = c + d; - b ^= c; - b = rotr32(b, 12); - - // a = a + b + my; - // d ^= a; - // d >>>= 8 - a = a + b + my; - d ^= a; - d = rotr32(d, 8); - - // c = c + d; - // b ^= c; - // b >>>= 7 - c = c + d; - b ^= c; - b = rotr32(b, 7); - - // role choice: - switch (role) { - case 0: return a; - case 1: return b; - case 2: return c; - default: return d; - } -} - -// notice that, this function will proceed 2 chunks, each time. -// - chunk_words_row: current chunk -// - out_cv: written by lane 0, or lane 16 -__device__ __noinline__ -void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory - // so the chunks_row += 2 as gap - int chunk_len_bytes, - uint64_t chunk_counter, - uint32_t out_cv[8], - unsigned mask16) { - // computing index - int lane = threadIdx.x & 31; // lane_id: 0-31 - int sub = lane >> 4; // 0/1 - int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 - int role = li & 3; // a/b/c/d role - int base = (sub << 4); // 0 or 16 the absolute base - - const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) - - int warp_id = threadIdx.x / WARP_SIZE; - - // initialize - uint32_t cv_word = 0; - if (li < 8) cv_word = BLAKE3_IV[li]; - - // process all blocks - // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks - // each block has 64B -> 16 x u32 - for (int b = 0; b < nblocks; ++b) { - // each lane holds one u32, - // 16 lane will hold 16 x 4 = 64 B -> it's block - // the another 16 lane will hold opposite 64 B - const uint32_t m_lane = chunk_words_row[b * 16 + li]; - - // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags - // 先把“自己的那个索引”的初值准备好: - uint32_t v = (li < 8) - ? cv_word // v[i](i<8) - : BLAKE3_IV[li - 8]; // v[8..15] ← IV - - // 计数器/长度/标志(按 BLAKE3 规范) - const uint32_t t0 = (uint32_t)chunk_counter; - const uint32_t t1 = (uint32_t)(chunk_counter >> 32); - const int remain = chunk_len_bytes - (b << 6); - const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; - - const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); - - // 只在 12..15 四个索引上异或相应域(不分支,用谓词掩码) - v ^= (li == 12) ? t0 : 0u; - v ^= (li == 13) ? 
t1 : 0u; - v ^= (li == 14) ? block_len: 0u; - v ^= (li == 15) ? flags : 0u; - - // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) - int q = (li & 3); - int rq = (li >> 2); - int li_diag = (rq << 2) | ((q + rq) & 3); - int li_undo = (rq << 2) | ((q - rq) & 3); - int gi_col = q; // 0..3 - int gi_diag = (li_diag & 3); // 0..3 - - // ===== 7 rounds ===== - #pragma unroll 4 - for (int r = 0; r < 7; ++r) { - // inside this loop, each lane will do one job - // 16 lane will execute 16 x 2 operations - // in sequential-programming, will do 8 operation - - // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- - { - // 取同 quartet 的 b/c/d(基于当前 v) - uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); - - // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) - - uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); - - v = G_update_role(v, vb, vc, vd, mx, my, role); - } - - // ---- 对角步 ---- - { - // 在“对角置换域”取到当前 v 值 - uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); - - // 在该域内做“列步”同样的四邻取值 - uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); - - // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) - - uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); - - uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); - - // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) - - // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); - v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); - } - } // 7 rounds end - - // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) - uint32_t vip8_all = __shfl_sync(mask16, v, li ^ 8, 16); - if (li < 8) { - cv_word = v ^ vip8_all; - } - - // 下一块继续(本函数内 16 个 block 串行) - } - - // 由 lane0 / lane16 收集 8×u32 输出 - #pragma unroll - for (int j = 0; j < 8; ++j) { - uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 - if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 - } -} - -__device__ __noinline__ -void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32 - const uint32_t* __restrict__ R, // 8×u32 - uint32_t* __restrict__ out_cv, // 8×u32 - unsigned mask16) // half-warp masks for 16 lanes -{ - const int lane = threadIdx.x & 31; - const int li = lane & 15; // 0..15 half the warp - const int role = li & 3; - - // messages: the front 8 from L, and the latter 8 from R - const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8]; - - // v initialize - uint32_t v = (li < 8) ? BLAKE3_IV[li] : BLAKE3_IV[li - 8]; - - const uint32_t t0 = 0u; - const uint32_t t1 = 0u; - const uint32_t block_len = 64u; - const uint32_t flags = FLAG_PARENT; - - v ^= (li == 12) ? t0 : 0u; - v ^= (li == 13) ? t1 : 0u; - v ^= (li == 14) ? block_len : 0u; - v ^= (li == 15) ? 
flags : 0u; - - // 与 leaf 相同的“列/对角”两步、共 7 轮 - int q = (li & 3); - int rq = (li >> 2); - int li_diag = (rq << 2) | ((q + rq) & 3); - int li_undo = (rq << 2) | ((q - rq) & 3); - int gi_col = q; - int gi_diag = (li_diag & 3); - - #pragma unroll 4 - for (int r = 0; r < 7; ++r) { - // 列步 - { - uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); - - uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); - - v = G_update_role(v, vb, vc, vd, mx, my, role); - } - // 对角步 - { - uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); - uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); - uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); - uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); - - uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); - uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); - - uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); - v = __shfl_sync(mask16, v_diag_new, li_undo, 16); - } - } - - // state -> CV:cv[i] = v[i] ^ v[i+8] - uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); - uint32_t cv_word = (li < 8) ? (v ^ vip8) : 0; - - // 半 warp 汇聚到 out_cv[0..7](仅 li==0 的 4×收集也可以) - #pragma unroll - for (int j = 0; j < 8; ++j) { - uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); - if (li == 0) out_cv[j] = wj; - } -} - -__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){ - uint32_t msg[16]; -#pragma unroll - for (int i = 0; i < 8; ++i) { - msg[i] = L[i]; - } -#pragma unroll - for (int i = 0; i < 8; ++i) { - msg[8+i] = R[i]; - } - uint32_t st[16]; - blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st); - blake3_state_to_cv(st, out_cv); -} - - // ============ Big kernel: 16 WARPS in total ============ // grid: (chunks / 64), thread: (512,) @@ -1023,7 +595,6 @@ inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out3 const uint32_t zero_block[16] = {0}; uint32_t st[16]; blake3_compress_words_7r(zero_block, root_cv, 0ull, 64u, FLAG_ROOT, st); - // 写出前 32 字节(state[0..7],小端) for (int i = 0; i < 8; ++i) { uint32_t w = st[i]; out32[4*i+0] = (uint8_t)( w & 0xFF); diff --git a/csrc/utils.cuh b/csrc/utils.cuh new file mode 100644 index 0000000..fabfe55 --- /dev/null +++ b/csrc/utils.cuh @@ -0,0 +1,433 @@ +#include + +#define WARP_SIZE 32 +#define G(a,b,c,d, x, y) \ + do { \ + (a) = (a) + (b) + (x); \ + (d) = rotr32((d) ^ (a),16); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c),12); \ + (a) = (a) + (b) + (y); \ + (d) = rotr32((d) ^ (a), 8); \ + (c) = (c) + (d); \ + (b) = rotr32((b) ^ (c), 7); \ + } while (0) + + +__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = { + 0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au, + 0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u +}; + +enum : uint32_t { + FLAG_CHUNK_START = 1u << 0, + FLAG_CHUNK_END = 1u << 1, + FLAG_PARENT = 1u << 2, + FLAG_ROOT = 1u << 3, + FLAG_KEYED_HASH = 1u << 4, + FLAG_DERIVE_KEY_CONTEXT = 1u << 5, + FLAG_DERIVE_KEY_MATERIAL= 1u << 6, +}; + +__device__ __noinline__ +uint32_t blake3_leaf_flags(int block_idx_in_chunk, int nblocks_in_chunk, bool is_root_chunk = false) { + uint32_t f = 0; + f |= (uint32_t)-(block_idx_in_chunk==0) & FLAG_CHUNK_START; + f |= (uint32_t)-(block_idx_in_chunk==nblocks_in_chunk-1) & FLAG_CHUNK_END; + if (is_root_chunk) f |= FLAG_ROOT; // only this block in msg, or this is root + return f; +} + +__device__ 
__forceinline__ +uint32_t blake3_parent_flags(bool is_root_parent) { + return FLAG_PARENT | (is_root_parent ? FLAG_ROOT : 0); +} + +// ---- 小工具 ---- +__host__ __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { +#if defined(__CUDA_ARCH__) + return __funnelshift_r(x, x, n); +#else + return (x >> n) | (x << (32 - n)); // host 路径 +#endif +} + +__host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t dst[4]) { +#if defined(__CUDA_ARCH__) + const uint4 v = *reinterpret_cast(src); + dst[0] = v.x; dst[1] = v.y; dst[2] = v.z; dst[3] = v.w; +#else + std::memcpy(dst, src, 16); +#endif +} + +__device__ void print_cv(uint32_t cv[8]) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + auto get_byte = [&](int i) { + int w = i >> 2; // 第 i 个字节来自第 w 个 u32 + int off = (i & 3) * 8; // 在该 u32 中的偏移 + return (unsigned)((cv[w] >> off) & 0xFFu); + }; + + printf("block %d root CV (u32, little-endian words):", blockIdx.x); + for (int i = 0; i < 32; ++i) { + printf("%02x", get_byte(i)); + if ((i & 3) == 3) printf(" "); // 每 4 字节空格 + } + printf("\n"); + } +} + +__constant__ __device__ int B3_PERMUTE[16] = { + 2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8 +}; + +// swap-table +// BLAKE3 message schedule: rows are P^r, r=0..6. +// Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2) +// https://www.cse.unsw.edu.au/~cs4601/refs/papers/blake3.pdf +__device__ __constant__ uint8_t B3_MSG_SCHEDULE[7][16] = { + // r = 0: identity + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + // r = 1: P + { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, + // r = 2: P∘P + { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, + // r = 3 + { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 }, + // r = 4 + { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 }, + // r = 5 + { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 }, + // r = 6 + { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 }, +}; + +// get the "r" round, "k" message, it is broadcasted from m[li] lane. li = k +__device__ __forceinline__ uint32_t msg_rk(uint32_t m_lane, int round, int k, unsigned mask16) { + int src = B3_MSG_SCHEDULE[round][k]; + return __shfl_sync(mask16, m_lane, src, 16); +} + +__device__ __noinline__ +uint32_t G_update_role(uint32_t v_self, uint32_t v_b, uint32_t v_c, uint32_t v_d, + uint32_t mx, uint32_t my, int role) +{ + // 按 BLAKE2s 32-bit 的 G 序列算出 a',b',c',d',最后返回“当前 role”的那个值 + uint32_t a = v_self, b = v_b, c = v_c, d = v_d; + + // a = a + b + mx; + // d ^= a; + // d >>>= 16 + a = a + b + mx; + d ^= a; + d = rotr32(d, 16); + + // c = c + d; + // b ^= c; + // b >>>= 12 + c = c + d; + b ^= c; + b = rotr32(b, 12); + + // a = a + b + my; + // d ^= a; + // d >>>= 8 + a = a + b + my; + d ^= a; + d = rotr32(d, 8); + + // c = c + d; + // b ^= c; + // b >>>= 7 + c = c + d; + b ^= c; + b = rotr32(b, 7); + + // role choice: + switch (role) { + case 0: return a; + case 1: return b; + case 2: return c; + default: return d; + } +} + + +// =============== Leaf 16-lane compressing =============== +// notice that, this function will proceed 2 chunks, each time. 
+// - chunk_words_row: current chunk +// - out_cv: written by lane 0, or lane 16 +__device__ __noinline__ +void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row, // 256×u32 -> 1024 Bytes, from shared memory + // so the chunks_row += 2 as gap + int chunk_len_bytes, + uint64_t chunk_counter, + uint32_t out_cv[8], + unsigned mask16) { + // computing index + int lane = threadIdx.x & 31; // lane_id: 0-31 + int sub = lane >> 4; // 0/1 + int li = lane & 15; // 0..15, abstract lane id. for example, lane 16 will be li=0 + int role = li & 3; // a/b/c/d role + int base = (sub << 4); // 0 or 16 the absolute base + + const int nblocks = (chunk_len_bytes + 63) >> 6; // ceil(chunk_len/64) + + int warp_id = threadIdx.x / WARP_SIZE; + + // initialize + uint32_t cv_word = 0; + if (li < 8) cv_word = BLAKE3_IV[li]; + + // process all blocks + // in this situation, 1024 bytes will have 1024 / 64 = 16 blocks + // each block has 64B -> 16 x u32 + for (int b = 0; b < nblocks; ++b) { + // each lane holds one u32, + // 16 lane will hold 16 x 4 = 64 B -> it's block + // the another 16 lane will hold opposite 64 B + const uint32_t m_lane = chunk_words_row[b * 16 + li]; + + // 初始化 v:v[0..7]=cv, v[8..11]=IV,v[12..15]^=t/len/flags + // 先把“自己的那个索引”的初值准备好: + + // 计数器/长度/标志(按 BLAKE3 规范) + const uint32_t t0 = (uint32_t)chunk_counter; + const uint32_t t1 = (uint32_t)(chunk_counter >> 32); + const int remain = chunk_len_bytes - (b << 6); + const uint32_t block_len = (remain >= 64) ? 64u : (uint32_t)remain; + + const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false); + uint32_t v = + (li < 8) ? cv_word : + (li < 12) ? BLAKE3_IV[(li - 8) & 3] : + (li == 12) ? t0 : + (li == 13) ? t1 : + (li == 14) ? block_len : flags; + + // 把索引写成 li = q + 4*rq;对角步先做置换 li' = (rq<<2) | ((q+rq)&3) + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; // 0..3 + int gi_diag = (li_diag & 3); // 0..3 + + // ===== 7 rounds ===== + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // inside this loop, each lane will do one job + // 16 lane will execute 16 x 2 operations + // in sequential-programming, will do 8 operation + + // ---- 列步(quartet: {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15})---- + { + // 取同 quartet 的 b/c/d(基于当前 v) + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + // 本 quartet 的 i ∈ {0,1,2,3},列步用 msg 索引 0..7(两两为一对) + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + + // ---- 对角步 ---- + { + // 在“对角置换域”取到当前 v 值 + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + + // 在该域内做“列步”同样的四邻取值 + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + // 对角步的 4 组 G 使用本轮消息对的后半(索引 8..15) + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + + // 反置换回原位:li_undo = (rq<<2) | ((q - rq) & 3) + + // v = __shfl_sync(mask16, v_diag_new, base + li_undo, 16); + v = __shfl_sync(mask16, v_diag_new, /*relative*/ li_undo, 16); + } + } // 7 rounds end + + // 派生新的 CV:cv[i] = v[i] ^ v[i+8](仅 li=0..7 生效) + uint32_t vip8_all = 
__shfl_sync(mask16, v, li ^ 8, 16); + if (li < 8) { + cv_word = v ^ vip8_all; + } + + // 下一块继续(本函数内 16 个 block 串行) + } + + // 由 lane0 / lane16 收集 8×u32 输出 + #pragma unroll + for (int j = 0; j < 8; ++j) { + uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // 16 lane 全调用 + if (li == 0) out_cv[j] = wj; // 仅 lane0 落盘 + } +} + +// =============== Parent 16-lane compressing =============== +__device__ __noinline__ +void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32 + const uint32_t* __restrict__ R, // 8×u32 + uint32_t* __restrict__ out_cv, // 8×u32 + unsigned mask16) // half-warp masks for 16 lanes +{ + const int lane = threadIdx.x & 31; + const int li = lane & 15; // 0..15 half the warp + const int role = li & 3; + + // messages: the front 8 from L, and the latter 8 from R + const uint32_t m_lane = (li < 8) ? L[li] : R[li - 8]; + + // v initialize + + const uint32_t t0 = 0u; + const uint32_t t1 = 0u; + const uint32_t block_len = 64u; + const uint32_t flags = FLAG_PARENT; + + uint32_t iv_val = BLAKE3_IV[li & 7]; + + uint32_t v = + (li < 12) ? iv_val : + (li == 12) ? t0 : + (li == 13) ? t1 : + (li == 14) ? block_len : flags; + + // 与 leaf 相同的“列/对角”两步、共 7 轮 + int q = (li & 3); + int rq = (li >> 2); + int li_diag = (rq << 2) | ((q + rq) & 3); + int li_undo = (rq << 2) | ((q - rq) & 3); + int gi_col = q; + int gi_diag = (li_diag & 3); + + #pragma unroll 4 + for (int r = 0; r < 7; ++r) { + // 列步 + { + uint32_t vb = __shfl_xor_sync(mask16, v, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 2*gi_col + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 2*gi_col + 1, mask16); + + v = G_update_role(v, vb, vc, vd, mx, my, role); + } + // 对角步 + { + uint32_t v_diag = __shfl_sync(mask16, v, li_diag, 16); + uint32_t vb = __shfl_xor_sync(mask16, v_diag, 4, 16); + uint32_t vc = __shfl_xor_sync(mask16, v_diag, 8, 16); + uint32_t vd = __shfl_xor_sync(mask16, v_diag, 12, 16); + + uint32_t mx = msg_rk(m_lane, r, 8 + 2*gi_diag + 0, mask16); + uint32_t my = msg_rk(m_lane, r, 8 + 2*gi_diag + 1, mask16); + + uint32_t v_diag_new = G_update_role(v_diag, vb, vc, vd, mx, my, role); + v = __shfl_sync(mask16, v_diag_new, li_undo, 16); + } + } + + // state -> CV:cv[i] = v[i] ^ v[i+8] + uint32_t vip8 = __shfl_sync(mask16, v, li ^ 8, 16); + uint32_t cv_word = (li < 8) ? 
+
+// =============== Parent 1-lane compressing ===============
+// the reference (known-correct) 7-round compression, executed in a single lane
+__host__ __device__ void blake3_compress_words_7r(
+    const uint32_t block_words[16], // 64 B -> shared memory
+    const uint32_t cv[8],           // 8×u32 -> shared memory
+    uint64_t chunk_counter,         // 64-bit
+    uint32_t block_len,             // [0..64]
+    uint32_t flags,                 // CHUNK_START/END/PARENT/ROOT…
+    uint32_t out_state[16])         // output
+{
+    // 1) initialize v
+    uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3];
+    uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7];
+
+    uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3];
+
+    // injection
+    uint32_t v12=(uint32_t)chunk_counter, v13=(uint32_t)(chunk_counter >> 32), v14=block_len, v15=flags;
+
+    // 2) 7 rounds
+    int perm[16]; // message indices for the current round
+    #pragma unroll
+    for (int i = 0; i < 16; ++i)
+        perm[i] = i;
+
+    #pragma unroll
+    for (int r=0; r < 7; ++r) {
+        // col-step
+        G(v0, v4, v8, v12, block_words[perm[0]],  block_words[perm[1]]);
+        G(v1, v5, v9, v13, block_words[perm[2]],  block_words[perm[3]]);
+        G(v2, v6, v10,v14, block_words[perm[4]],  block_words[perm[5]]);
+        G(v3, v7, v11,v15, block_words[perm[6]],  block_words[perm[7]]);
+
+        // diag-step
+        G(v0, v5, v10,v15, block_words[perm[8]],  block_words[perm[9]]);
+        G(v1, v6, v11,v12, block_words[perm[10]], block_words[perm[11]]);
+        G(v2, v7, v8, v13, block_words[perm[12]], block_words[perm[13]]);
+        G(v3, v4, v9, v14, block_words[perm[14]], block_words[perm[15]]);
+
+        // perm = perm ∘ PERMUTE
+        int np[16];
+        #pragma unroll
+        for (int i = 0; i < 16; ++i)
+            np[i] = perm[B3_PERMUTE[i]];
+        #pragma unroll
+        for (int i = 0; i < 16; ++i)
+            perm[i] = np[i];
+    }
+
+    // 3) write the state out
+    out_state[ 0]=v0;  out_state[ 1]=v1;  out_state[ 2]=v2;  out_state[ 3]=v3;
+    out_state[ 4]=v4;  out_state[ 5]=v5;  out_state[ 6]=v6;  out_state[ 7]=v7;
+    out_state[ 8]=v8;  out_state[ 9]=v9;  out_state[10]=v10; out_state[11]=v11;
+    out_state[12]=v12; out_state[13]=v13; out_state[14]=v14; out_state[15]=v15;
+}
+
+// derive the CV from out_state
+__host__ __device__ __forceinline__ void blake3_state_to_cv(const uint32_t st[16], uint32_t out_cv[8]){
+#pragma unroll
+    for (int i = 0; i < 8; ++i)
+        out_cv[i] = st[i] ^ st[8+i];
+}
+
+__device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint32_t out_cv[8]){
+    uint32_t msg[16];
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[i] = L[i];
+    }
+#pragma unroll
+    for (int i = 0; i < 8; ++i) {
+        msg[8+i] = R[i];
+    }
+    uint32_t st[16];
+    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st);
+    blake3_state_to_cv(st, out_cv);
+}
\ No newline at end of file
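Before the debugging patches below, it is worth seeing how these single-lane primitives compose into one leaf CV. A host-side sketch (hypothetical helper, not from the repo; the CHUNK_START=1 / CHUNK_END=2 flag bits are the BLAKE3 spec values, and it uses the host-readable IV table that PATCH 18 later introduces as BLAKE3_IV_HOST, since reading a __constant__ symbol from host code is not valid):

    // leaf_cv_reference: scalar chunk hashing with the primitives above,
    // handy for cross-checking the 16-lane kernel chunk by chunk
    static void leaf_cv_reference(const uint32_t chunk_words[256], // 1024 B as LE words
                                  uint64_t chunk_counter,
                                  uint32_t out_cv[8]) {
        const uint32_t CHUNK_START = 1u, CHUNK_END = 2u;  // spec flag bits
        uint32_t cv[8], st[16];
        for (int i = 0; i < 8; ++i) cv[i] = BLAKE3_IV_HOST[i];
        for (int b = 0; b < 16; ++b) {                    // 16 × 64 B blocks, serial
            uint32_t flags = (b == 0 ? CHUNK_START : 0u) | (b == 15 ? CHUNK_END : 0u);
            blake3_compress_words_7r(&chunk_words[b * 16], cv,
                                     chunk_counter, 64u, flags, st);
            blake3_state_to_cv(st, cv);                   // chain into the next block
        }
        for (int i = 0; i < 8; ++i) out_cv[i] = cv[i];
    }

Note this produces a non-root chunk CV; a real digest additionally needs the ROOT flag OR'd into the final compression, which the kernels in the following patches are still chasing.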
From 8642165c516d5e4d8ab1651294a00d2d7e27ce2e Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Thu, 9 Oct 2025 14:57:07 +0800
Subject: [PATCH 16/20] debug tiny kernel

---
 benchmark/test_gpu.py    |  8 ++--
 csrc/blake3_sm70_sm80.cu | 89 +++++++++++++++++++++++++++-------------
 csrc/utils.cuh           |  2 +-
 3 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 50a7422..6e10d6d 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -42,15 +42,17 @@
     for _ in range(2):
         fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
         torch.cuda.synchronize()
-        print("\n")
+        print("1\n")
     torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 2  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    cv_hex = None
     t0 = time.perf_counter()
     for _ in range(repeat):
         cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-        print("\n")
+        torch.cuda.synchronize()
+        print("2\n")
     torch.cuda.synchronize()
     t1 = time.perf_counter()
 
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 9365f7d..0fc397e 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -231,7 +231,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
     // chunk_local identifies the left or right chunk, so do not worry.
     const uint32_t* row = valid_sub ? &chunk_smem[chunk_local][0] : &chunk_smem[0][0];
-    uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr;
+    uint32_t* out = valid_sub ? &tmp_cv[chunk_local][0] : nullptr; // FIXME: something is wrong here!!!
 
 #if defined(DBG_KERNEL) && DBG_KERNEL
     if (!valid_sub) {
@@ -245,7 +245,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
                                  out,
                                  submask);
 
-    __syncwarp(full);   // make sure both half-warps have written into `tmp_cv`
+    __syncwarp();   // make sure both half-warps have written into `tmp_cv`
 
 // #if defined(DBG_KERNEL) && DBG_KERNEL
 //     if (blockIdx.x == 0 && threadIdx.x == 0) printf("The simd16-lane res: \n");
@@ -281,7 +281,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
         print_cv(cv_smem[pair_idx]);
 #endif
 
-        __syncwarp(full);   // NOTICE: this is necessary!
+        __syncwarp();   // NOTICE: this is necessary!
     };  // do_big_pass
 
     // big-pass 1: computing [CHUNK_PER_BLOCK / 2] chunks
@@ -390,6 +390,20 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
 }
 
+__device__ __forceinline__
+const void* to_smem_ptr(const void* p){
+    return (const void*)__cvta_generic_to_shared(p);
+}
+
+__device__ __forceinline__
+void load_cv_s2r_vec(const uint32_t* __restrict__ s, uint32_t dst[8]) {
+    const uint4* s4 = reinterpret_cast<const uint4*>(to_smem_ptr(s));   // compiles to ld.shared.v4.u32
+    uint4 v0 = s4[0];
+    uint4 v1 = s4[1];
+    dst[0]=v0.x; dst[1]=v0.y; dst[2]=v0.z; dst[3]=v0.w;
+    dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w;
+}
+
 // ============ Tiny kernel ============
 // In big kernel, it will consume 64 or 32 KiB each block [CHUNKS_PER_BLOCK]
 // For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / CHUNKS_PER_BLOCK root = 16384 or 32768 roots
@@ -438,6 +452,13 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     }
     __syncthreads();
 
+#if defined(DBG_KERNEL) && DBG_KERNEL
+    if (tid == 0 && blockIdx.x == 0) {
+        printf("Block %d root CV for tiny kernel entry:", blockIdx.x);
+        print_cv(smem);
+    }
+#endif
+
     // ---------------- Stage 2: each lane merge 4 → 1 (keep the neighbor order) ----------------
     // reduced_n0 = ceil(tile_n / 4) lane-root
     const int reduced_n0 = (tile_n + 3) >> 1 >> 1;
@@ -445,13 +466,15 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     bool lane_valid = false;
 
     // start index
-    int base4 = tid << 2; // tid*4
+    int base4 = tid << 2; // tid*4, 0..8
 
     if (base4 < tile_n) {
         // 4 neighboring CVs: idx = base4 + 0,1,2,3
         uint32_t a[8], b[8], c[8], d[8];
 
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
-        load_cv_g2r(s0, a);
+        printf("The base 1: %d\n", base4);
+        // TODO: what's wrong?
+ load_cv_s2r_vec(s0, a); int remain = tile_n - base4; @@ -459,15 +482,18 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv if (remain >= 2) { const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD); - load_cv_g2r(s1, b); + if (base4 >= 1020) printf("The base 2: %d\n", base4); + load_cv_s2r_vec(s1, b); } if (remain >= 3) { const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD); - load_cv_g2r(s2, c); + if (base4 >= 1020) printf("The base 3: %d\n", base4); + load_cv_s2r_vec(s2, c); } if (remain >= 4) { const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD); - load_cv_g2r(s3, d); + if (base4 >= 1020) printf("The base 4: %d\n", base4); + load_cv_s2r_vec(s3, d); } // merge the neighbor @@ -490,15 +516,22 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv lane_valid = true; } +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("The lane-reduce-4 output:"); + print_cv(lane_cv); + } +#endif + // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ---------------- const int warp_base = warp_id * WARP_SIZE; const int cur_n_w = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number // this will introduce extra branch-prediction overhead - unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); + unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // This may not work int cur_n = cur_n_w; - bool active_lane = (lane_id < cur_n_w); + bool active_lane = (lane_id < cur_n); if (!active_lane) { #pragma unroll @@ -507,12 +540,12 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv } // step = 1,2,4,8,16 - warp-reduce - for (int step = 1; step < WARP_SIZE; step <<= 1) { + for (int step = 1; step < cur_n_w; step <<= 1) { // right-neighbor uint32_t nbr[8]; #pragma unroll for (int j = 0; j < 8; ++j) - nbr[j] = __shfl_down_sync(0xFFFFFFFFu, lane_cv[j], step); + nbr[j] = __shfl_down_sync(mask, lane_cv[j], step); const bool do_pair = active_lane && @@ -537,6 +570,13 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv } __syncthreads(); +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("The warp-reduce output:"); + print_cv(warp_roots[0]); + } +#endif + // ---------------- Stage 4: CTA's NUM_WARPS → 1 block reduce ---------------- int valid_warps = (reduced_n0 + WARP_SIZE - 1) / WARP_SIZE; // 0..NUM_WARPS if (valid_warps == 0) return; @@ -547,11 +587,11 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv const int li = lane_id & 15; // 0..15 const unsigned full = __activemask(); const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full; + const int half_id = warp_id * 2 + sub; // half-warp index cur_n = NUM_WARPS; while (cur_n > 1) { const int pairs = cur_n >> 1; // the pair count. - const int half_id = warp_id * 2 + sub; // half-warp index if (half_id < pairs) { const int left_idx = (half_id << 1); @@ -589,6 +629,13 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv for (int j = 0; j < 8; ++j) out[j] = warp_roots[0][j]; } + +#if defined(DBG_KERNEL) && DBG_KERNEL + if (blockIdx.x == 0 && threadIdx.x == 0) { + printf("================================ Finishing all in tiny kernel! 
================================\n");
+    }
+#endif
+
 }
 
 inline void blake3_digest32_from_root_cv(const uint32_t root_cv[8], uint8_t out32[32]) {
@@ -615,22 +662,6 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
         std::abort();
     }
 
-    // int dev = -1;
-    // cudaGetDevice(&dev);
-    // printf("[dbg] my runtime current device = %d\n", dev);
 
-    // cudaPointerAttributes attr{};
-    // auto st = cudaPointerGetAttributes(&attr, d_data);
-    // printf("[dbg] getAttr=%d, type=%d (0=host,1=device,2=managed), device=%d\n",
-    //        (int)st, (int)attr.type, attr.device);
 
-    // cudaPointerAttributes attr{};
-    // CUDA_CHECK(cudaPointerGetAttributes(&attr, d_data));
-    // if (attr.type != cudaMemoryTypeDevice) {
-    //     fprintf(stderr, "d_data is not device memory!\n");
-    //     std::abort();
-    // }
-
     int optin = 0, deflt = 0;
     cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
     cudaDeviceGetAttribute(&deflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
index fabfe55..3c1739b 100644
--- a/csrc/utils.cuh
+++ b/csrc/utils.cuh
@@ -273,7 +273,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
 #pragma unroll
     for (int j = 0; j < 8; ++j) {
         uint32_t wj = __shfl_sync(mask16, cv_word, j, 16); // all 16 lanes must participate
-        if (li == 0) out_cv[j] = wj;           // only lane 0 writes back
+        if (li == 0 && out_cv) out_cv[j] = wj; // only lane 0 writes back
     }
 }
 
From e8afb0664d66f10907f2e2f8c53a533d68493734 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Thu, 9 Oct 2025 15:06:47 +0800
Subject: [PATCH 17/20] finish part-debugging

---
 benchmark/test_gpu.py    |  6 +-----
 csrc/blake3_sm70_sm80.cu | 17 ++++-------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 6e10d6d..3052a80 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -41,18 +41,14 @@
 
     # 2) Warm up: trigger JIT/driver init so the first runs are not skewed
    for _ in range(2):
         fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-        torch.cuda.synchronize()
-        print("1\n")
     torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 5  # 1 GiB × 5 is already heavy; tune per machine
     cv_hex = None
     t0 = time.perf_counter()
     for _ in range(repeat):
         cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
-        torch.cuda.synchronize()
-        print("2\n")
     torch.cuda.synchronize()
     t1 = time.perf_counter()
 
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index 0fc397e..b782f90 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -390,16 +390,12 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     g4[1] = make_uint4(r[4],r[5],r[6],r[7]);
 }
 
-__device__ __forceinline__
-const void* to_smem_ptr(const void* p){
-    return (const void*)__cvta_generic_to_shared(p);
-}
-
 __device__ __forceinline__
 void load_cv_s2r_vec(const uint32_t* __restrict__ s, uint32_t dst[8]) {
-    const uint4* s4 = reinterpret_cast<const uint4*>(to_smem_ptr(s));   // compiles to ld.shared.v4.u32
-    uint4 v0 = s4[0];
-    uint4 v1 = s4[1];
+    // ensure 16 B alignment: cv_tile must be __align__(16), and the stride (8+PAD)*4 must also be a multiple of 16
+    const uint4* p = reinterpret_cast<const uint4*>(s);
+    uint4 v0 = p[0];
+    uint4 v1 = p[1];
     dst[0]=v0.x; dst[1]=v0.y; dst[2]=v0.z; dst[3]=v0.w;
     dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w;
 }
@@ -472,8 +468,6 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         uint32_t a[8], b[8], c[8], d[8];
 
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
-        printf("The base 1: %d\n", base4);
-        // TODO: what's wrong?
         load_cv_s2r_vec(s0, a);
 
         int remain = tile_n - base4;
 
@@ -482,17 +476,14 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
 
         if (remain >= 2) {
             const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
-            if (base4 >= 1020) printf("The base 2: %d\n", base4);
             load_cv_s2r_vec(s1, b);
         }
         if (remain >= 3) {
             const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD);
-            if (base4 >= 1020) printf("The base 3: %d\n", base4);
             load_cv_s2r_vec(s2, c);
         }
         if (remain >= 4) {
             const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD);
-            if (base4 >= 1020) printf("The base 4: %d\n", base4);
             load_cv_s2r_vec(s3, d);
         }
 
From 0d1485a5ac3478355998ac845660a55926fa2599 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Fri, 10 Oct 2025 17:07:21 +0800
Subject: [PATCH 18/20] debug big kernel tail problem

---
 benchmark/perf.txt       |  44 ++----------
 benchmark/test_gpu.py    |  20 ++++---
 csrc/blake3_sm70_sm80.cu | 110 ++++++++++++++++++++++++++++---------
 csrc/utils.cuh           |  52 ++++++++++++------
 4 files changed, 138 insertions(+), 88 deletions(-)

diff --git a/benchmark/perf.txt b/benchmark/perf.txt
index d48f165..0058b2e 100644
--- a/benchmark/perf.txt
+++ b/benchmark/perf.txt
@@ -1,43 +1,5 @@
 10.7
 V100: 54819.25 MiB/s
-    RTX 4090: 150585.83 MiB/s
+    RTX 4090: 150585.83 MiB/s (False)
 
-(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py
-Stage 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967
-Stage 2 - pass 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b
-Stage 2 - pass 2 finish processing, pass 1: 16
-block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096
-================================ Finishing all in big kernel! ================================
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552
-GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552
-std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b
-std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236
-Traceback (most recent call last):
-  File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in <module>
-    assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!"
-           ^^^^^^^^^^^^^^^^^
-AssertionError: GPU BLAKE3 result does not match CPU result!
-(ceg5206) yazhu@DESKTOP-37AIPE6:~/workspace/cpp_code/ceg5206/grp_proj$ python benchmark/test_gpu.py
-Stage 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):68f4f634 6a351c13 83eef7bf 6194f235 c3e40515 5a2bff65 9ff142fa 54c43967
-Stage 2 - pass 1 finish processing
-The 1st chunk merged res: 
-block 0 root CV (u32, little-endian words):930c7381 17018a62 722c6e90 4b5bfaa7 09e9e8f9 e41e7ecf 75386773 7c727b3b
-Stage 2 - pass 2 finish processing, pass 1: 16
-block 0 root CV (u32, little-endian words):791e0e1a 05a01828 bc7cfa9f 3274a8ae 50feb0e3 c6113c92 2aaca74f 272f3096
-================================ Finishing all in big kernel! ================================
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):369139fa 28a9fc9c ab9149a8 e3f60323 9c7504ac b5088ac8 4c951d3f 3ca90f3a
-Block 0 root CV for tiny kernel:block 0 root CV (u32, little-endian words):62a88205 53625a4a b538365b c131e095 935d3504 989b4852 f5ed3f51 6a53b552
-GPU BLAKE3 Result: 62a8820553625a4ab538365bc131e095935d3504989b4852f5ed3f516a53b552
-std BLAKE3 Expected: d490c8f71546ae9909e5c6c1fd23264e268c148ea83a140d517b249e6a6a035b
-std BLAKE3 1KB: f7314bcd4f08b945da46890d4abcbe9bd78905369461379ed5ab893eaccff236
-Traceback (most recent call last):
-  File "/home/yazhu/workspace/cpp_code/ceg5206/grp_proj/benchmark/test_gpu.py", line 37, in <module>
-    assert cv_hex == std_hex, "GPU BLAKE3 result does not match CPU result!"
-           ^^^^^^^^^^^^^^^^^
-AssertionError: GPU BLAKE3 result does not match CPU result!
\ No newline at end of file
+
+10.9, RTX4090: 67184.92 MiB/s
+10.9 A800: 20803.11 MiB/s
\ No newline at end of file
diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 3052a80..3e391e9 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -40,21 +40,29 @@
 if check_perf:
     # 2) Warm up: trigger JIT/driver init so the first runs are not skewed
     for _ in range(2):
+        d.copy_(cpu, non_blocking=True)
+        torch.cuda.synchronize()
         fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
         torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
     cv_hex = None
-    t0 = time.perf_counter()
+    total_time = 1e-8
     for _ in range(repeat):
+        d.copy_(cpu, non_blocking=True)
+        torch.cuda.synchronize()
+
+        t0 = time.perf_counter()
         cv_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+        t1 = time.perf_counter()
+
+        elapsed = t1 - t0
+        total_time += elapsed
         torch.cuda.synchronize()
-    t1 = time.perf_counter()
 
-    elapsed = t1 - t0
-    print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {elapsed:.3f} seconds")
-    print(f"Throughput: {repeat * d.numel() / elapsed / (1024**2):.2f} MiB/s")
+    print(f"Elapsed time for {repeat}x BLAKE3 (GPU SM70): {total_time:.3f} seconds")
+    print(f"Throughput: {repeat * d.numel() / total_time / (1024**2):.2f} MiB/s")
     print("root CV (hex) =", cv_hex)
     # print(f"std BLAKE3 Expected: {std_hex}")
\ No newline at end of file
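The Python change above pulls the H2D copy out of the measured window. For reference, the same split expressed in plain CUDA C++ (a sketch; `blake3_kernel` and the buffer/launch names are placeholders, not symbols from this repo) — the events bracket only the GPU work, while a wall-clock window would also include the copy:

    void time_kernel_only(const uint8_t* h_buf, uint8_t* d_buf, size_t n_bytes,
                          uint32_t* d_out, cudaStream_t stream,
                          dim3 grid, dim3 block) {
        cudaEvent_t beg, end;
        cudaEventCreate(&beg);
        cudaEventCreate(&end);

        // H2D copy deliberately sits outside the event window
        cudaMemcpyAsync(d_buf, h_buf, n_bytes, cudaMemcpyHostToDevice, stream);

        cudaEventRecord(beg, stream);
        blake3_kernel<<<grid, block, 0, stream>>>(d_buf, n_bytes, d_out);  // hypothetical kernel
        cudaEventRecord(end, stream);
        cudaEventSynchronize(end);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, beg, end);
        printf("kernel-only: %.3f ms, %.2f MiB/s\n",
               ms, (n_bytes / double(1 << 20)) / (ms / 1e3));

        cudaEventDestroy(beg);
        cudaEventDestroy(end);
    }

This is also the split that the perf.txt numbers above conflate: the earlier 150 GiB/s-class figure was recorded with the copy outside the loop, the later 67 GiB/s figure with it inside.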
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index b782f90..f17e07f 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -1,4 +1,3 @@
-
 #include 
 #include 
 #include 
@@ -245,7 +244,7 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
                                  out,
                                  submask);
 
-    __syncwarp();   // make sure both half-warps have written into `tmp_cv`
+    __syncwarp(submask);   // make sure both half-warps have written into `tmp_cv`
@@ -362,9 +361,9 @@ __global__ void blake3_block_reduce_kernel(const uint32_t* d_input,
     // write out
     if (warp_id == 0 && lane_id == 0) {
         uint32_t* out = block_cvs + (size_t)blockIdx.x * 8;
-#if defined(DBG_KERNEL) && DBG_KERNEL
-        print_cv(cv_smem[0]);
-#endif
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+//         print_cv(cv_smem[0]);
+// #endif
         reinterpret_cast<uint4*>(out)[0] = make_uint4(cv_smem[0][0], cv_smem[0][1], cv_smem[0][2], cv_smem[0][3]);
         reinterpret_cast<uint4*>(out)[1] = make_uint4(cv_smem[0][4], cv_smem[0][5], cv_smem[0][6], cv_smem[0][7]);
     }
@@ -390,6 +399,27 @@ __device__ __forceinline__ void store_cv_r2g(const uint32_t r[8], uint32_t* g) {
     dst[4]=v1.x; dst[5]=v1.y; dst[6]=v1.z; dst[7]=v1.w;
 }
 
+__device__ __forceinline__
+void load_cv_s2r_vec_shared(const uint32_t* __restrict__ s, uint32_t dst[8]) {
+#if __CUDA_ARCH__ >= 700
+    // convert the generic pointer explicitly to a shared-memory address, then issue two ld.shared.v4.u32
+    unsigned smem_addr = static_cast<unsigned>(__cvta_generic_to_shared(s));
+    uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
+    asm volatile(
+        "ld.shared.v4.u32 {%0,%1,%2,%3}, [%8];\n\t"
+        "ld.shared.v4.u32 {%4,%5,%6,%7}, [%8+16];\n\t"
+        : "=r"(x0),"=r"(x1),"=r"(x2),"=r"(x3),
+          "=r"(x4),"=r"(x5),"=r"(x6),"=r"(x7)
+        : "r"(smem_addr));
+    dst[0]=x0; dst[1]=x1; dst[2]=x2; dst[3]=x3;
+    dst[4]=x4; dst[5]=x5; dst[6]=x6; dst[7]=x7;
+#else
+    // fall back to scalar loads on older architectures
+    #pragma unroll
+    for (int j=0;j<8;++j) dst[j] = s[j];
+#endif
+}
+
 // ============ Tiny kernel ============
 // In big kernel, it will consume 64 or 32 KiB each block [CHUNKS_PER_BLOCK]
 // For a 1 GiB corpus, it will produce 1 x 1024 x 1024 / CHUNKS_PER_BLOCK root = 16384 or 32768 roots
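The `__syncwarp(submask)` change in the first hunk follows the CUDA rule that every lane named in the mask must call `__syncwarp` with that same mask; once the two 16-lane halves of a warp proceed independently, a full-warp mask is the wrong granularity. A standalone illustration (not from the patch; launch with a single 32-thread block):

    __global__ void half_warp_sync_demo(int* out) {
        const int lane = threadIdx.x & 31;
        const unsigned submask = (lane < 16) ? 0x0000FFFFu : 0xFFFF0000u;
        __shared__ int tmp[32];

        tmp[lane] = lane * lane;   // each 16-lane team fills its own half
        __syncwarp(submask);       // sync only within the team that wrote it
        out[lane] = tmp[lane ^ 1]; // read a teammate's value (lane^1 stays in-team)
    }

Because `lane ^ 1` never crosses the 16-lane boundary, the team-local sync is sufficient, and no lane waits on the other half of the warp.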
@@ -468,7 +488,7 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
         uint32_t a[8], b[8], c[8], d[8];
 
         const uint32_t* s0 = cv_tile + (size_t)base4 * (8 + PAD);
-        load_cv_s2r_vec(s0, a);
+        load_cv_s2r_vec_shared(s0, a);
 
         int remain = tile_n - base4;
 
@@ -476,16 +496,31 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
 
         if (remain >= 2) {
             const uint32_t* s1 = cv_tile + (size_t)(base4+1) * (8 + PAD);
-            load_cv_s2r_vec(s1, b);
+            load_cv_s2r_vec_shared(s1, b);
         }
         if (remain >= 3) {
             const uint32_t* s2 = cv_tile + (size_t)(base4+2) * (8 + PAD);
-            load_cv_s2r_vec(s2, c);
+            load_cv_s2r_vec_shared(s2, c);
         }
         if (remain >= 4) {
             const uint32_t* s3 = cv_tile + (size_t)(base4+3) * (8 + PAD);
-            load_cv_s2r_vec(s3, d);
+            load_cv_s2r_vec_shared(s3, d);
         }
 
+#if defined(DBG_KERNEL) && DBG_KERNEL
+    // confirmed: the loaded values are simply not the same
+    if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) {
+        for (int i = 0 ; i < 32; i++) {
+            if (i == lane_id) {
+                printf("The lane-merge input from lane: %d:\n", lane_id);
+                print_cv(a, lane_id);
+                print_cv(b, lane_id);
+                print_cv(c, lane_id);
+                print_cv(d, lane_id);
+            }
+        }
     }
+#endif
 
     // merge the neighbor
     if (remain == 1) {
@@ -508,53 +543,76 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     }
 
 #if defined(DBG_KERNEL) && DBG_KERNEL
-    if (blockIdx.x == 0 && threadIdx.x == 0) {
+    if (blockIdx.x == 0 && threadIdx.x == 4 && lane_valid) {
         printf("The lane-reduce-4 output:");
-        print_cv(lane_cv);
+        print_cv(lane_cv, 4);
     }
 #endif
 
     // ---------------- Stage 3: Warp-level 32→1 neighbor-shfl merge ----------------
     const int warp_base = warp_id * WARP_SIZE;
-    const int cur_n_w   = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number
+    const int n         = max(0, min(WARP_SIZE, reduced_n0 - warp_base)); // active number
 
     // this will introduce extra branch-prediction overhead
-    unsigned mask = __ballot_sync(0xFFFFFFFFu, lane_valid && (tid < reduced_n0*4)); // This may not work
-    int cur_n = cur_n_w;
+    const unsigned active_mask = __ballot_sync(0xFFFFFFFFu, lane_id < n);
 
-    bool active_lane = (lane_id < cur_n);
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// if (gridDim.x == 1 && lane_valid) {
+//     printf("The tid: %d\n", tid);
+// }
+// #endif
 
-    if (!active_lane) {
+    if (lane_id >= n) {
 #pragma unroll
         for (int j = 0; j < 8; ++j)
             lane_cv[j] = 0u;
     }
 
+    __syncwarp(active_mask);
+
     // step = 1,2,4,8,16 - warp-reduce
-    for (int step = 1; step < cur_n_w; step <<= 1) {
+    for (int step = 1; step < n; step <<= 1) {
         // right-neighbor
         uint32_t nbr[8];
 #pragma unroll
         for (int j = 0; j < 8; ++j)
-            nbr[j] = __shfl_down_sync(mask, lane_cv[j], step);
+            nbr[j] = __shfl_down_sync(active_mask, lane_cv[j], step);
+
+        __syncwarp(active_mask);   // let every lane finish fetching before moving on
 
+        // only the leading lane of each group participates:
+        // 0,2,4,6,8,...  step = 1
+        // 0,4,8,...      step = 2
+        // 0,8,..         step = 4
+        // 0,16           step = 8
+        // 0              step = 16
         const bool do_pair =
-            active_lane &&
-            ((lane_id % (step<<1)) == 0) &&
-            (lane_id + step < cur_n);
+            ((lane_id & ((step << 1) - 1)) == 0) &&   // only the first lane of each 2*step group merges
+            ((lane_id ^ step) < n);                   // and only if the right neighbor is in the active set
+
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x < 32) printf("lane %d do pair? %d\n", lane_id, do_pair);
+// #endif
+
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) {
+//     printf("left:");
+//     print_cv(lane_cv, 16);
+//     printf("right:");
+//     print_cv(nbr, 16);
+// }
+// #endif
 
         if (do_pair) {
             blake3_parent_cv(lane_cv, nbr, lane_cv);
         }
-
-        cur_n = (cur_n + 1) >> 1;
-        // __syncwarp();
+        __syncwarp(active_mask);
     }
 
     // lane 0: the NUM_WARPS warp-roots are written to SMEM
     // e.g. if this block has 8 warps, there will be 8 warp-roots
     __shared__ uint32_t warp_roots[NUM_WARPS][8]; // NUM_WARPS × 8
-    if (lane_id == 0 && cur_n_w > 0) {
+    if (lane_id == 0 && n > 0) {
 #pragma unroll
         for (int j = 0; j < 8; ++j)
             warp_roots[warp_id][j] = lane_cv[j];
@@ -579,7 +637,7 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     const unsigned full = __activemask();
     const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full;
     const int half_id = warp_id * 2 + sub;   // half-warp index
-    cur_n = NUM_WARPS;
+    int cur_n = NUM_WARPS;
 
     while (cur_n > 1) {
         const int pairs = cur_n >> 1;   // the pair count
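The reworked `do_pair` predicate is easiest to sanity-check by simulation. A standalone host-side sketch that prints which lanes merge at each step for a tail case of n = 5 live CVs — the situation this commit ("big kernel tail problem") is fixing:

    #include <stdio.h>

    int main(void) {
        const int n = 5;   // live CVs in this warp (tail of the tile)
        for (int step = 1; step < n; step <<= 1) {
            printf("step %2d:", step);
            for (int lane = 0; lane < 32; ++lane) {
                int do_pair = ((lane & ((step << 1) - 1)) == 0) &&
                              ((lane ^ step) < n);
                if (do_pair) printf("  %d+%d", lane, lane ^ step);
            }
            printf("\n");
        }
        // prints: step 1 -> 0+1 2+3;  step 2 -> 0+2;  step 4 -> 0+4
        return 0;
    }

Lane 4's lone CV is merged only at the final step, so the result is parent(CV(0..3), CV(4)) — for the small cases tried, this neighbor-preserving schedule matches the left-heavy (largest power-of-two on the left) tree shape that BLAKE3's parent merging expects.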
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
index 3c1739b..8c88995 100644
--- a/csrc/utils.cuh
+++ b/csrc/utils.cuh
@@ -1,4 +1,5 @@
 #include 
+#pragma once
 
 #define WARP_SIZE 32
 #define G(a,b,c,d, x, y) \
@@ -13,10 +14,23 @@
     (b) = rotr32((b) ^ (c), 7); \
 } while (0)
 
+// host-side definition
+inline constexpr uint32_t BLAKE3_IV_HOST[8] = {
+    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
+    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+};
+
+__device__ __constant__ uint32_t BLAKE3_IV_DEV[8] = {
+    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
+    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+};
 
-__host__ __device__ __constant__ uint32_t BLAKE3_IV[8] = {
-    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
-    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
+inline constexpr int B3_PERMUTE_HOST[16] = {
+    2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8
+};
+
+__device__ __constant__ int B3_PERMUTE_DEV[16] = {
+    2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9, 14, 15, 8
 };
 
 enum : uint32_t {
@@ -61,8 +75,8 @@ __host__ __device__ inline void load_u128_u32x4(const uint32_t* src, uint32_t ds
 #endif
 }
 
-__device__ void print_cv(uint32_t cv[8]) {
-    if (blockIdx.x == 0 && threadIdx.x == 0) {
+__device__ void print_cv(uint32_t cv[8], int tgt_tid = 0) {
+    if (blockIdx.x == 0 && threadIdx.x == tgt_tid) {
         auto get_byte = [&](int i) {
             int w   = i >> 2;        // byte i comes from the w-th u32
             int off = (i & 3) * 8;   // its bit offset within that u32
@@ -78,10 +92,6 @@ __device__ void print_cv(uint32_t cv[8], int tgt_tid = 0) {
     }
 }
 
-__constant__ __device__ int B3_PERMUTE[16] = {
-    2, 6, 3,10, 7, 0, 4,13, 1,11,12, 5, 9,14,15, 8
-};
-
 // swap-table
 // BLAKE3 message schedule: rows are P^r, r=0..6.
 // Source: BLAKE3 spec Table 2; rows > 1 are repeated permutations P^r. (see §2.2)
@@ -178,7 +188,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
 
     // initialize
     uint32_t cv_word = 0;
-    if (li < 8) cv_word = BLAKE3_IV[li];
+    if (li < 8) cv_word = BLAKE3_IV_DEV[li];
 
     // process all blocks
    // for a full chunk, 1024 bytes give 1024 / 64 = 16 blocks
@@ -201,7 +211,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
         const uint32_t flags = blake3_leaf_flags(b, nblocks, /*is_root_chunk*/ false);
         uint32_t v =
             (li < 8)  ? cv_word :
-            (li < 12) ? BLAKE3_IV[(li - 8) & 3] :
+            (li < 12) ? BLAKE3_IV_DEV[(li - 8) & 3] :
             (li == 12) ? t0 :
             (li == 13) ? t1 :
             (li == 14) ? block_len : flags;
@@ -298,7 +308,7 @@ void blake3_parent_cv_simd16(const uint32_t* __restrict__ L, // 8×u32
     const uint32_t block_len = 64u;
     const uint32_t flags = FLAG_PARENT;
 
-    uint32_t iv_val = BLAKE3_IV[li & 7];
+    uint32_t iv_val = BLAKE3_IV_DEV[li & 7];
 
     uint32_t v =
         (li < 12) ? iv_val :
@@ -368,8 +378,15 @@ __host__ __device__ void blake3_compress_words_7r(
     uint32_t v0=cv[0], v1=cv[1], v2=cv[2], v3=cv[3];
     uint32_t v4=cv[4], v5=cv[5], v6=cv[6], v7=cv[7];
 
-    uint32_t v8 =BLAKE3_IV[0], v9 =BLAKE3_IV[1], v10=BLAKE3_IV[2], v11=BLAKE3_IV[3];
+#if defined(__CUDA_ARCH__)
+    // device-side call
+    const uint32_t* IV = BLAKE3_IV_DEV;
+#else
+    // host-side call
+    const uint32_t* IV = BLAKE3_IV_HOST;
+#endif
 
+    uint32_t v8 =IV[0], v9 =IV[1], v10=IV[2], v11=IV[3];
     // injection
     uint32_t v12=(uint32_t)chunk_counter, v13=(uint32_t)(chunk_counter >> 32), v14=block_len, v15=flags;
 
@@ -395,9 +412,14 @@ __host__ __device__ void blake3_compress_words_7r(
 
         // perm = perm ∘ PERMUTE
         int np[16];
+#if defined (__CUDA_ARCH__)
+        const int* PERM_T = B3_PERMUTE_DEV;
+#else
+        const int* PERM_T = B3_PERMUTE_HOST;
+#endif
 #pragma unroll
         for (int i = 0; i < 16; ++i)
-            np[i] = perm[B3_PERMUTE[i]];
+            np[i] = perm[PERM_T[i]];
 #pragma unroll
         for (int i = 0; i < 16; ++i)
             perm[i] = np[i];
@@ -428,6 +450,6 @@ __device__ void blake3_parent_cv(const uint32_t L[8], const uint32_t R[8], uint3
         msg[8+i] = R[i];
     }
     uint32_t st[16];
-    blake3_compress_words_7r(msg, BLAKE3_IV, 0ull, 64u, FLAG_PARENT, st);
+    blake3_compress_words_7r(msg, BLAKE3_IV_DEV, 0ull, 64u, FLAG_PARENT, st);
     blake3_state_to_cv(st, out_cv);
 }
\ No newline at end of file
From 520c60ce195df802cf13293ceb653f3137de44f2 Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Sat, 11 Oct 2025 10:34:21 +0800
Subject: [PATCH 19/20] finish debug

---
 benchmark/test_gpu.py    |  5 ++--
 csrc/blake3_sm70_sm80.cu | 49 ++++++++++++++++++++--------------------
 csrc/utils.cuh           |  2 +-
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 3e391e9..902386c 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -40,9 +40,10 @@ if check_perf:
 
     # 2) Warm up: trigger JIT/driver init so the first runs are not skewed
     for _ in range(2):
-        d.copy_(cpu, non_blocking=True)
+        # d.copy_(cpu, non_blocking=True)
         torch.cuda.synchronize()
-        fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+        output = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), d.numel(), stream)
+        print("warmup CV (hex) =", output)
     torch.cuda.synchronize()
diff --git a/csrc/blake3_sm70_sm80.cu b/csrc/blake3_sm70_sm80.cu
index f17e07f..b835c15 100644
--- a/csrc/blake3_sm70_sm80.cu
+++ b/csrc/blake3_sm70_sm80.cu
@@ -507,20 +507,20 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
             load_cv_s2r_vec_shared(s3, d);
         }
 
-#if defined(DBG_KERNEL) && DBG_KERNEL
-    // confirmed: the loaded values are simply not the same
-    if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) {
-        for (int i = 0 ; i < 32; i++) {
-            if (i == lane_id) {
-                printf("The lane-merge input from lane: %d:\n", lane_id);
-                print_cv(a, lane_id);
-                print_cv(b, lane_id);
-                print_cv(c, lane_id);
-                print_cv(d, lane_id);
-            }
-        }
-    }
-#endif
+// #if defined(DBG_KERNEL) && DBG_KERNEL
+// // confirmed: the loaded values are simply not the same
+// if (blockIdx.x == 0 && threadIdx.x < 28 && threadIdx.x > 10) {
+//     for (int i = 0 ; i < 32; i++) {
+//         if (i == lane_id) {
+//             printf("The lane-merge input from lane: %d:\n", lane_id);
+//             print_cv(a, lane_id);
+//             print_cv(b, lane_id);
+//             print_cv(c, lane_id);
+//             print_cv(d, lane_id);
+//         }
+//     }
+// }
+// #endif
@@ -595,16 +595,16 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
 // #endif
 
 // #if defined(DBG_KERNEL) && DBG_KERNEL
-// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) {
-//     printf("left:");
-//     print_cv(lane_cv, 16);
-//     printf("right:");
-//     print_cv(nbr, 16);
-// }
+// if (gridDim.x != 1 && blockIdx.x == 0 && threadIdx.x == 16) {
+//     printf("left on step %d:", step);
+//     print_cv(lane_cv, 16);
+//     printf("right on step %d:", step);
+//     print_cv(nbr, 16);
+// }
 // #endif
 
         if (do_pair) {
-            blake3_parent_cv(lane_cv, nbr, lane_cv);
+            blake3_parent_cv(lane_cv, nbr, lane_cv);    // does not require shared memory
         }
         __syncwarp(active_mask);
     }
@@ -637,7 +637,7 @@ __global__ void blake3_cv_block_reduce_kernel(const uint32_t* __restrict__ in_cv
     const unsigned full = __activemask();
     const unsigned submask = ((sub==0) ? 0x0000FFFFu : 0xFFFF0000u) & full;
     const int half_id = warp_id * 2 + sub;   // half-warp index
-    int cur_n = NUM_WARPS;
+    int cur_n = valid_warps;
 
     while (cur_n > 1) {
         const int pairs = cur_n >> 1;   // the pair count
@@ -731,8 +731,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
         blake3_block_reduce_kernel,
         cudaFuncAttributePreferredSharedMemoryCarveout,
         100));
 
-    uint8_t* d_bytes = const_cast<uint8_t*>(d_data);
-    const uint32_t* d_words = reinterpret_cast<const uint32_t*>(d_bytes);;   // alias
+    const auto* d_words = reinterpret_cast<const uint32_t*>(d_data);  // alias
 
     uint32_t* d_blockCV = nullptr;   // num_blocks × 8 u32
     // here we cut the largest bottleneck: do not allocate GPU memory here, do it in PyTorch
@@ -772,7 +771,7 @@ void blake3_block_reduce_sm70_sm80(const uint8_t* d_data,
     }
 
     CUDA_CHECK(cudaFree(d_blockCV));
-    CUDA_CHECK(cudaFree(d_bytes));
+    // CUDA_CHECK(cudaFree(d_bytes));
 
     return;
 }
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
index 8c88995..c0311b8 100644
--- a/csrc/utils.cuh
+++ b/csrc/utils.cuh
@@ -288,7 +288,7 @@ void blake3_leaf_cv_simd16_onechunk(const uint32_t* __restrict__ chunk_words_row
 }
 
 // =============== Parent 16-lane compressing ===============
-__device__ __noinline__
+__device__ __forceinline__
 void blake3_parent_cv_simd16(const uint32_t* __restrict__ L,  // 8×u32
                              const uint32_t* __restrict__ R,  // 8×u32
                              uint32_t* __restrict__ out_cv,   // 8×u32
 
From 2cc5b6504c393a7777b13ed4ab9ed9dd529d3b1f Mon Sep 17 00:00:00 2001
From: l1cacheDell
Date: Tue, 11 Nov 2025 22:20:21 +0800
Subject: [PATCH 20/20] benchmark

---
 benchmark/bench.py    | 116 ++++++++++++++++++++++++++++++++++++++++++
 benchmark/test_gpu.py |   2 +-
 2 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/bench.py
diff --git a/benchmark/bench.py b/benchmark/bench.py
new file mode 100644
index 0000000..307e2fa
--- /dev/null
+++ b/benchmark/bench.py
@@ -0,0 +1,116 @@
+# bench_blake3_sizes.py
+import time
+import math
+import torch
+import blake3
+import flashashing as fh
+
+# ===== Tunable parameters =====
+CHECK_ACCURACY  = False  # verify against CPU blake3 (fine for small sizes; slows down large runs)
+USE_PINNED_HOST = True   # use pinned host memory (recommended True for end-to-end throughput)
+WARMUP_PER_SIZE = 2      # warmup iterations per size (excluded from the statistics)
+
+# repeats per size, chosen so the measured window is not too short
+def pick_repeats(n_bytes: int) -> int:
+    if n_bytes <= 1*1024:   # 1 KB
+        return 2000
+    if n_bytes <= 16*1024:
+        return 1000
+    if n_bytes <= 64*1024:
+        return 500
+    if n_bytes <= 256*1024:
+        return 200
+    if n_bytes <= 1*1024*1024:
+        return 100
+    if n_bytes <= 4*1024*1024:
+        return 50
+    return 20   # 10.3MB
+
+# ===== Test sizes (bytes) =====
+sizes = [
+    ("1KB (1 chunk)", 1 * 1024),
+    ("8KB", 8 * 1024),
+    ("16KB", 16 * 1024),
+    ("64KB", 64 * 1024),
+    ("256KB", 256 * 1024),
+    ("1MB", 1 * 1024 * 1024),
+    ("4MB", 4 * 1024 * 1024),
+    ("10.3MB", int(11 * 1024 * 1024)),   # NB: allocated as 11 MiB despite the label
+]
+
+device = torch.device("cuda")
+stream = torch.cuda.current_stream().cuda_stream
+
+# pre-allocate the largest host/device buffers once and reuse sub-slices,
+# which keeps the comparison fair and avoids repeated allocation
+max_n = max(n for _, n in sizes)
+cpu = torch.empty(max_n, dtype=torch.uint8, pin_memory=USE_PINNED_HOST)
+cpu[:] = ord('A')
+
+d = torch.empty_like(cpu, device=device)   # device buffer sized to max_n
+torch.cuda.synchronize()
+
+# header
+print("BLAKE3 GPU benchmark across message sizes")
+print(f"Host pinned memory: {USE_PINNED_HOST}")
+print("-" * 96)
+print("{:<14} {:>10} {:>8} {:>12} {:>12} {:>12} {:>12}".format(
+    "size", "bytes", "repeat", "kern_ms", "kern_MiB/s", "e2e_ms", "e2e_MiB/s"
+))
+print("-" * 96)
+
+for label, n in sizes:
+    # warmup
+    for _ in range(WARMUP_PER_SIZE):
+        # copy n bytes into the device sub-slice
+        d[:n].copy_(cpu[:n], non_blocking=True)
+        torch.cuda.synchronize()
+        _ = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream)
+        torch.cuda.synchronize()
+
+    repeat = pick_repeats(n)
+    total_kernel_ms = 0.0
+    total_e2e_sec = 0.0
+    last_hex = None
+
+    # CUDA events for kernel-only timing
+    start_evt = torch.cuda.Event(enable_timing=True)
+    end_evt = torch.cuda.Event(enable_timing=True)
+
+    for _ in range(repeat):
+        # -------- End-to-end: H2D + kernel --------
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        d[:n].copy_(cpu[:n], non_blocking=True)
+        # kernel-only timing: the events bracket just the kernel call
+        start_evt.record()
+        last_hex = fh.blake3_gpu_sm70_sm80_hex(d.data_ptr(), n, stream)
+        end_evt.record()
+        torch.cuda.synchronize()   # wait for both H2D and kernel to finish
+        t1 = time.perf_counter()
+
+        total_e2e_sec += (t1 - t0)
+        total_kernel_ms += start_evt.elapsed_time(end_evt)
+
+    # average times
+    avg_kernel_ms = total_kernel_ms / repeat
+    avg_e2e_ms = (total_e2e_sec / repeat) * 1e3
+
+    # throughput (MiB/s; 1 MiB = 1024^2 bytes); n is an integer byte count,
+    # so the original `if n != 10.3` guard was dead code and is dropped
+    mib = n / (1024 ** 2)
+    kernel_throughput = mib / (avg_kernel_ms / 1e3) if avg_kernel_ms > 0 else float('inf')
+    e2e_throughput = mib / (avg_e2e_ms / 1e3) if avg_e2e_ms > 0 else float('inf')
+
+    print("{:<14} {:>10} {:>8} {:>12.3f} {:>12.2f} {:>12.3f} {:>12.2f}".format(
+        label, n, repeat, avg_kernel_ms, kernel_throughput, avg_e2e_ms, e2e_throughput
+    ))
+
+    if CHECK_ACCURACY:
+        # CPU cross-check (once per size)
+        std_hex = blake3.blake3(cpu[:n].numpy()).hexdigest()
+        assert last_hex == std_hex, f"Mismatch at {label}: GPU {last_hex} vs CPU {std_hex}"
+
+print("-" * 96)
+print("Notes:")
+print("  • kernel_ms / kern_MiB/s measure GPU compute time only (CUDA events), excluding H2D;")
+print("  • e2e_ms / e2e_MiB/s measure the total H2D + kernel time;")
+print("  • '10.3MB' is actually allocated as 11 * 1024 * 1024 bytes (see the sizes list).")
diff --git a/benchmark/test_gpu.py b/benchmark/test_gpu.py
index 902386c..d634d21 100644
--- a/benchmark/test_gpu.py
+++ b/benchmark/test_gpu.py
@@ -47,7 +47,7 @@
     torch.cuda.synchronize()
 
     # 3) Timed runs: measures end-to-end throughput (including H2D)
-    repeat = 0  # 1 GiB × 5 is already heavy; tune per machine
+    repeat = 10  # 1 GiB × 5 is already heavy; tune per machine
     cv_hex = None
    total_time = 1e-8
    for _ in range(repeat):