diff --git a/benchmark-linxisa/compile_all.sh b/benchmark-linxisa/compile_all.sh index 172590a..a586d36 100755 --- a/benchmark-linxisa/compile_all.sh +++ b/benchmark-linxisa/compile_all.sh @@ -4,15 +4,19 @@ # Don't use set -e as some operators may fail to compile SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -export COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/GitHub/linx-toolchain-build/output/linx_blockisa_llvm_musl/bin} +LINX_ISA_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export COMPILER_DIR=${COMPILER_DIR:-$LINX_ISA_ROOT/compiler/llvm/build-linxisa-clang/bin} REPO_ROOT=${REPO_ROOT:-$SCRIPT_DIR} echo "==========================================" echo "[LinxISA] Starting full compilation" echo "REPO_ROOT: $REPO_ROOT" +echo "COMPILER_DIR: $COMPILER_DIR" echo "==========================================" # Function to compile an operator +FAILURES=0 + compile_operator() { local operator_path=$1 local operator_name=$2 @@ -25,10 +29,15 @@ compile_operator() { if [ ! -d "$operator_path" ]; then echo "Warning: Directory not found: $operator_path" + FAILURES=$((FAILURES + 1)) return 1 fi - cd "$operator_path" + if ! cd "$operator_path"; then + echo "✗ $operator_name compilation failed: cannot enter directory" + FAILURES=$((FAILURES + 1)) + return 1 + fi if [ -f "compile.all" ]; then echo "Running compile.all with baremetal=${baremetal:-off}..." 
@@ -37,9 +46,12 @@ compile_operator() { echo "✓ $operator_name compilation completed" else echo "✗ $operator_name compilation failed" + FAILURES=$((FAILURES + 1)) + return 1 fi else echo "Warning: No compile.all found in $operator_path" + FAILURES=$((FAILURES + 1)) return 1 fi } @@ -67,3 +79,8 @@ echo "" echo "Generated ELF files:" find "$REPO_ROOT/output" -name "*.elf" -type f | wc -l echo "ELF files are located in: $REPO_ROOT/output/" + +if [ "$FAILURES" -ne 0 ]; then + echo "Compilation failures: $FAILURES" + exit 1 +fi diff --git a/benchmark-linxisa/kernels/broadcast/broadcast.hpp b/benchmark-linxisa/kernels/broadcast/broadcast.hpp index 5276b2b..75082ee 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast.hpp @@ -1,305 +1,112 @@ +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_HPP + #include -#include "template_asm.h" +#include #include -#include - - -#define DUMP_TILE(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%12lld ", (long long)DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) - -// ============================================== -// 维度规则:从后面对齐,前面自动补 1,维度=1 可广播 -// ============================================== -template -void __vec__ gen_offset( - typename tile_shape::TileDType __out__ out, - #if MAX_DIMs >= 1 - size_t in0, size_t out0, - #endif - #if MAX_DIMs >= 2 - size_t in1, size_t out1, - #endif - #if MAX_DIMs >= 3 - size_t in2, size_t out2, - #endif - #if MAX_DIMs >= 4 - size_t in3, size_t out3, - #endif - #if MAX_DIMs >= 5 - size_t in4, size_t out4, - #endif - #if MAX_DIMs >= 6 - size_t in5, size_t out5, - #endif - #if MAX_DIMs >= 7 - size_t in6, size_t 
out6, - #endif - #if MAX_DIMs >= 8 - size_t in7, size_t out7, - #endif - const size_t base, - const size_t total_elements -) { - size_t index = blkv_get_index_x(); - // if (index >= total_elements) return; - size_t idx = index + base; - - size_t out_coord[MAX_DIM] = {0}; - size_t tmp = idx; - - // ====== 输出坐标计算 ====== - for (int d = OUT_DIM - 1; d >= 0; d--) { - if (d == 0) { out_coord[d] = tmp % out0; tmp /= out0; } - #if MAX_DIMs >=2 - else if (d == 1) { out_coord[d] = tmp % out1; tmp /= out1; } - #endif - #if MAX_DIMs >=3 - else if (d == 2) { out_coord[d] = tmp % out2; tmp /= out2; } - #endif - #if MAX_DIMs >=4 - else if (d == 3) { out_coord[d] = tmp % out3; tmp /= out3; } - #endif - #if MAX_DIMs >=5 - else if (d == 4) { out_coord[d] = tmp % out4; tmp /= out4; } - #endif - #if MAX_DIMs >=6 - else if (d == 5) { out_coord[d] = tmp % out5; tmp /= out5; } - #endif - #if MAX_DIMs >=7 - else if (d == 6) { out_coord[d] = tmp % out6; tmp /= out6; } - #endif - #if MAX_DIMs >=8 - else if (d == 7) { out_coord[d] = tmp % out7; tmp /= out7; } - #endif - } - - // ====== 输入坐标计算 ====== - size_t in_coord[MAX_DIM] = {0}; - for (int i = OUT_DIM - 1; i >= 0; i--) { - int o = OUT_DIM - IN_DIM + i; - size_t in_dim; - - if (i == 0) in_dim = in0; - #if MAX_DIMs >=2 - else if (i == 1) in_dim = in1; - #endif - #if MAX_DIMs >=3 - else if (i == 2) in_dim = in2; - #endif - #if MAX_DIMs >=4 - else if (i == 3) in_dim = in3; - #endif - #if MAX_DIMs >=5 - else if (i == 4) in_dim = in4; - #endif - #if MAX_DIMs >=6 - else if (i == 5) in_dim = in5; - #endif - #if MAX_DIMs >=7 - else if (i == 6) in_dim = in6; - #endif - #if MAX_DIMs >=8 - else if (i == 7) in_dim = in7; - #endif - - if (in_dim == 1) - in_coord[i] = 0; - else - in_coord[i] = out_coord[o]; - } - - // ====== 计算偏移 ====== - size_t in_offset = 0; - size_t data_width = sizeof(dtype); - for (int i = 0; i < IN_DIM; i++) { - if (i == 0) in_offset = in_offset * in0 + in_coord[0]; - #if MAX_DIMs >=2 - else if (i == 1) in_offset = in_offset * 
in1 + in_coord[1]; - #endif - #if MAX_DIMs >=3 - else if (i == 2) in_offset = in_offset * in2 + in_coord[2]; - #endif - #if MAX_DIMs >=4 - else if (i == 3) in_offset = in_offset * in3 + in_coord[3]; - #endif - #if MAX_DIMs >=5 - else if (i == 4) in_offset = in_offset * in4 + in_coord[4]; - #endif - #if MAX_DIMs >=6 - else if (i == 5) in_offset = in_offset * in5 + in_coord[5]; - #endif - #if MAX_DIMs >=7 - else if (i == 6) in_offset = in_offset * in6 + in_coord[6]; - #endif - #if MAX_DIMs >=8 - else if (i == 7) in_offset = in_offset * in7 + in_coord[7]; - #endif - } - in_offset *= data_width; - - blkv_get_tile_ptr(out)[index] = in_offset; -} -template -void gen_offset_impl( - tile_shapeOffset &out, - const size_t *in_shape, - const size_t *out_shape, - const size_t base, - const size_t total_elements) { - static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, - "Only static shape supported"); - - #if MAX_DIMs >= 1 - size_t in_shape0 = in_shape[0]; - size_t out_shape0 = out_shape[0]; - #endif - #if MAX_DIMs >= 2 - size_t in_shape1 = in_shape[1]; - size_t out_shape1 = out_shape[1]; - #endif - #if MAX_DIMs >= 3 - size_t in_shape2 = in_shape[2]; - size_t out_shape2 = out_shape[2]; - #endif - #if MAX_DIMs >= 4 - size_t in_shape3 = in_shape[3]; - size_t out_shape3 = out_shape[3]; - #endif - #if MAX_DIMs >= 5 - size_t in_shape4 = in_shape[4]; - size_t out_shape4 = out_shape[4]; - #endif - #if MAX_DIMs >= 6 - size_t in_shape5 = in_shape[5]; - size_t out_shape5 = out_shape[5]; - #endif - #if MAX_DIMs >= 7 - size_t in_shape6 = in_shape[6]; - size_t out_shape6 = out_shape[6]; - #endif - #if MAX_DIMs >= 8 - size_t in_shape7 = in_shape[7]; - size_t out_shape7 = out_shape[7]; - #endif - - gen_offset<<>>( - out.data(), - #if MAX_DIMs >= 1 - in_shape0, out_shape0, - #endif - #if MAX_DIMs >= 2 - in_shape1, out_shape1, - #endif - #if MAX_DIMs >= 3 - in_shape2, out_shape2, - #endif - #if MAX_DIMs >= 4 - in_shape3, out_shape3, - #endif - #if MAX_DIMs >= 5 - 
in_shape4, out_shape4, - #endif - #if MAX_DIMs >= 6 - in_shape5, out_shape5, - #endif - #if MAX_DIMs >= 7 - in_shape6, out_shape6, - #endif - #if MAX_DIMs >= 8 - in_shape7, out_shape7, - #endif - base, - total_elements); -} - - - -template -void broadcast( - dtype *in_ptr, - dtype *out_ptr, - const size_t *in_shape, - const size_t *out_shape - ) { - const size_t Mb = gOM / tM; - const size_t rmd_M = gOM % tM; - - using gm_shapeIn = global_tensor>; - using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; - using tile_shapeOffset = Tile; - using tile_shapeData_rmd = Tile; - using tile_shapeOffset_rmd = Tile; - - gm_shapeIn inGm(in_ptr); - // gm_shapeOut outGm(out_ptr); - tile_shapeData outTile; - tile_shapeOffset offsetTile; - tile_shapeData_rmd outTile_rmd; - tile_shapeOffset_rmd offsetTile_rmd; - size_t base = 0; - size_t all_num = gOM; // 总元素数量 - - using itOut = global_iterator; - - // tile_shapeData inTile; - // using itIn = global_iterator; - - itOut gOIter(out_ptr); - // itIn gIIter(in_ptr); - // for test /////////////////////////////////////// - // alignas(256) static uint32_t g_dump[tM]; - // alignas(256) static dtype g_dump_outTile[tM]; - // alignas(256) static dtype g_dump_inTile[tM]; - // /////////////////////////////////////// - - size_t total_elements = tM; - for (int i = 0; i < Mb; ++i) { - auto gO = gOIter(0, i); - // auto gI = gIIter(0, i); - // printf("iter = %d\n", i); - // printf("base = %d\n", base); - // printf("total_elements = %d\n", total_elements); - // printf("in_shape[0] = %d\n", in_shape[0]); - // printf("inGm = %ld\n", inGm); - - // TCOPYIN(inTile, gI); - // DUMP_TILE("inTile", inTile, g_dump_inTile, 1, tM); - gen_offset_impl(offsetTile, in_shape, out_shape, base, total_elements); - base += total_elements; - - // DUMP_TILE("offsetTile", offsetTile, g_dump, 1, tM); - - MGATHER(outTile, inGm, offsetTile); - - // DUMP_TILE("outTile", outTile, g_dump_outTile, 1, tM); - TCOPYOUT(gO, outTile); - } - if constexpr (rmd_M) { - // 
printf("rmd_M = %d\n", rmd_M); - auto gO = gOIter(0, Mb); - total_elements = rmd_M; - gen_offset_impl(offsetTile_rmd, in_shape, out_shape, base, total_elements); - base += total_elements; - MGATHER(outTile_rmd, inGm, offsetTile_rmd); - TCOPYOUT(gO, outTile_rmd); - } - +#include + +using namespace pto; + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t *in_shape, + const size_t *out_shape) { + constexpr size_t kBCast = gOM / gIM; + constexpr size_t kElementsPerTile = 512; + constexpr size_t kFullTiles = gOM / kElementsPerTile; + constexpr size_t kTail = gOM % kElementsPerTile; + + static_assert(MAX_DIM >= IN_DIM && MAX_DIM >= OUT_DIM, + "MAX_DIM must cover input and output ranks"); + static_assert(gOM % gIM == 0, + "broadcast output element count must be a multiple of input"); + static_assert(kBCast > 0, "broadcast factor must be positive"); + (void)kTileHint; + + size_t inner = 1; + size_t in_dim = IN_DIM; + size_t out_dim = OUT_DIM; + while (in_dim > 0 && out_dim > 0 && + in_shape[in_dim - 1] == out_shape[out_dim - 1]) { + inner *= out_shape[out_dim - 1]; + --in_dim; + --out_dim; + } + + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + using DataTile = Tile; + using OffsetTile = Tile; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputIterator output_iter(out_ptr); + + auto emit_tile = [&](auto &data_tile, auto &linear, auto &batch, + auto &batch_base, auto &inner_q, auto &inner_base, + auto &inner_idx, auto &inner_bytes, auto &offset, + std::uint32_t base, auto output_addr) { + using LinearTile = std::remove_reference_t; + const auto inner_u32 = static_cast(inner); + const auto group_u32 = static_cast(kBCast * inner); + TCI(linear, base); + + TDIVS(batch, linear, group_u32); + TMULS(batch_base, batch, + static_cast(inner * sizeof(dtype))); + + TDIVS(inner_q, linear, inner_u32); + TMULS(inner_base, inner_q, inner_u32); + TSUB(inner_idx, linear, inner_base); + TMULS(inner_bytes, 
inner_idx, static_cast(sizeof(dtype))); + + TADD(offset, batch_base, inner_bytes); + MGATHER(data_tile, input_global, offset); + TSTORE(output_addr, data_tile); + }; + + for (size_t tile = 0; tile < kFullTiles; ++tile) { + DataTile data_tile; + OffsetTile linear; + OffsetTile batch; + OffsetTile batch_base; + OffsetTile inner_q; + OffsetTile inner_base; + OffsetTile inner_idx; + OffsetTile inner_bytes; + OffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, + inner_idx, inner_bytes, offset, + static_cast(tile * kElementsPerTile), + output_iter(0, static_cast(tile))); + } + + if constexpr (kTail != 0) { + using TailDataTile = Tile; + using TailOffsetTile = + Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailDataTile data_tile; + TailOffsetTile linear; + TailOffsetTile batch; + TailOffsetTile batch_base; + TailOffsetTile inner_q; + TailOffsetTile inner_base; + TailOffsetTile inner_idx; + TailOffsetTile inner_bytes; + TailOffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, + inner_idx, inner_bytes, offset, + static_cast(kFullTiles * kElementsPerTile), + tail_output_iter(0, static_cast(kFullTiles))); + } } - - +#endif // SUPERNPUBENCH_PTOISA_BROADCAST_HPP diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_019.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_019.hpp index b6916f2..1a5f750 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_019.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_019.hpp @@ -260,7 +260,7 @@ void gen_offset_impl( -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_039.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_039.hpp index b3fc5e5..fe3bc66 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_039.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_039.hpp @@ -309,7 +309,7 @@ void gen_offset_impl( 
-template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_07.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_07.hpp index 4b3b7b1..28a0649 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_07.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_07.hpp @@ -260,7 +260,7 @@ void gen_offset_impl( -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_07_simple.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_07_simple.hpp index 75e03d0..ef429b7 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_07_simple.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_07_simple.hpp @@ -32,7 +32,7 @@ void gen_offset_impl( } } -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_Hunyuan.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_Hunyuan.hpp index 80b5b1c..5ec65ef 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_Hunyuan.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_Hunyuan.hpp @@ -265,7 +265,7 @@ void gen_offset_impl( -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_mscatter.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_mscatter.hpp index f38161b..509e498 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_mscatter.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_mscatter.hpp @@ -86,7 +86,7 @@ void gen_scatter_offset_impl( // 通用MSCATTER版 Broadcast // 自动识别任意广播维度 // ============================================== -template +template void broadcast_mscatter( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_nocopyout.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_nocopyout.hpp index 5340d09..825261d 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_nocopyout.hpp +++ 
b/benchmark-linxisa/kernels/broadcast/broadcast_nocopyout.hpp @@ -228,7 +228,7 @@ void gen_offset_impl( -template +template void broadcast_nocopyout( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_nomg.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_nomg.hpp index 16c5332..391ccf9 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_nomg.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_nomg.hpp @@ -228,7 +228,7 @@ void gen_offset_impl( -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_pto.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_pto.hpp new file mode 100644 index 0000000..cfaf699 --- /dev/null +++ b/benchmark-linxisa/kernels/broadcast/broadcast_pto.hpp @@ -0,0 +1,248 @@ +// ============================================================================ +// Broadcast 算子 — PTO 一层编程模型 +// +// 以 Tile_ISA_Reference 文档为准,使用 PTO ISA 定义的 C++ Intrinsic 接口。 +// 当前编译器 (Linx-TileOP-API) 尚未完全实现 PTO ISA,本文件按 ISA 文档书写, +// 暂不保证可编译通过。 +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ 当前编译器不支持 / 不完整的指令汇总 │ +// ├──────────┬──────────────────┬──────────────────────────────────────────┤ +// │ Pto ISA │ 当前编译器状态 │ 说明 │ +// │ 指令 │ │ │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TCI │ API 有,二层实现 │ pto_tileop.hpp 有声明; │ +// │ │ │ jcore/TCI.hpp 用 __vec__ + blkv_get_* │ +// │ │ │ 实现,非真正一层 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TEXPANDS │ API 有(名不同), │ Pto ISA 名 TEXPANDS; │ +// │ │ 二层实现 │ 当前编译器名 TEXPANDSCALAR; │ +// │ │ │ jcore 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TREMS │ 完全缺失 │ pto_tileop.hpp 中无此 API; │ +// │ │ │ 当前编译器无任何标量取余实现 │ +// │ │ │ (仅有 tile-tile 的 TREM,且仅 int32) │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TDIVS 
│ API 有,二层实现 │ jcore/TDivs.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TMULS │ API 有,二层实现 │ jcore/TMuls.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TADD │ API 有,二层实现 │ jcore/TAdd.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ MGATHER │ 已支持(一层) │ template_asm.h 用 asm volatile │ +// │ │ │ (BSTART.TMA) 实现; │ +// │ │ │ 但缺少 Coalesce::Elem 模板参数支持 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TSTORE │ API 有(名不同), │ Pto ISA 名 TSTORE; │ +// │ │ 二层实现 │ 当前编译器名 TCOPYOUT; │ +// │ │ │ jcore/TCopyOut.hpp 用 __vec__ 实现 │ +// └──────────┴──────────────────┴──────────────────────────────────────────┘ +// +// PTO ISA 文档签名 (Declared in include/pto/pto_instr.hpp): +// +// TCI: +// template +// PTO_INST RecordEvent TCI(TileData &dst, T start, WaitEvents &... events); +// +// TEXPANDS: +// template +// PTO_INST RecordEvent TEXPANDS(TileData &dst, typename TileData::DType scalar, +// WaitEvents &... events); +// +// TREMS: +// template +// PTO_INST RecordEvent TREMS(TileDataDst &dst, TileDataSrc &src, +// typename TileDataSrc::DType scalar, +// TileDataTmp &tmp, WaitEvents &... events); +// +// TDIVS: +// template +// PTO_INST RecordEvent TDIVS(TileDataDst &dst, TileDataSrc &src0, +// typename TileDataSrc::DType scalar, +// WaitEvents &... events); +// +// TMULS: +// template +// PTO_INST RecordEvent TMULS(TileDataDst &dst, TileDataSrc &src0, +// typename TileDataSrc::DType scalar, +// WaitEvents &... events); +// +// TADD: +// template +// PTO_INST RecordEvent TADD(TileDataDst &dst, TileDataSrc0 &src0, +// TileDataSrc1 &src1, WaitEvents &... events); +// +// MGATHER: +// template +// PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, +// TileInd &indexes, WaitEvents &... 
events); +// +// TSTORE: +// template +// PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, +// WaitEvents &... events); +// +// ============================================================================ + +#include // Tile, GlobalTensor 等类型 (当前编译器已有) +#include // global_iterator 工具类型 (当前编译器已有) +// #include // [!] PTO ISA C++ Intrinsic — 当前编译器未提供 + +#include +#include + +// ============================================================================ +// 维度规则:从后面对齐,前面自动补 1,维度=1 可广播 +// ============================================================================ + +// ---------------------------------------------------------------------------- +// gen_offset_pto: 用 PTO ISA Tile 指令计算广播偏移 (一层编程,无 __vec__ 块) +// +// 算法 (stride 累加法,从最内维到最外维): +// 1. TCI 生成索引序列 base, base+1, ..., base+N-1 +// 2. TEXPANDS 将输出偏移 tile 初始化为 0 +// 3. 对每个输出维 d (从 OUT_DIM-1 到 0): +// TREMS coord = idx % out_shape[d] (标量取余) +// TDIVS idx = idx / out_shape[d] (标量整除) +// 若 d 对应输入维 i = d-(OUT_DIM-IN_DIM) >= 0: +// 非广播维 (in_shape[i]!=1): +// TMULS tmp = coord * stride +// TADD out += tmp +// stride *= in_shape[i] +// +// 注意: PTO ISA MGATHER 使用元素索引 (非字节偏移), +// 所以这里不再乘 sizeof(dtype)。 +// 原始 broadcast.hpp 的 __vec__ 版本需要乘 sizeof(dtype) 是因为 +// template_asm.h 里的旧 MGATHER 按字节偏移取数。 +// ---------------------------------------------------------------------------- +template +void gen_offset_pto( + tile_shapeOffset &out, + const size_t *in_shape, + const size_t *out_shape, + const size_t base, + const size_t total_elements) +{ + static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, + "Only static shape supported"); + + using off_t = typename tile_shapeOffset::DType; // uint32_t + + tile_shapeOffset idxTile; // 当前正在分解的线性索引 + tile_shapeOffset coordTile; // 当前输出维坐标 + tile_shapeOffset tmpTile; // TREMS 需要 (A2A3 要求 tmp >= 1 行, 列数 >= dst) + + // ---- Step 1: TCI 生成索引序列 [base, base+1, ..., base+N-1] ---- + // [当前编译器] TCI 在 pto_tileop.hpp 有声明,但 jcore 实现为 __vec__ (二层) + 
TCI(idxTile, (off_t)base); + + // ---- Step 2: TEXPANDS 初始化 out = 0 ---- + // [当前编译器] PTO ISA 名 TEXPANDS,当前编译器名 TEXPANDSCALAR;jcore 为 __vec__ + TEXPANDS(out, (off_t)0); + + size_t stride = 1; + + // ---- Step 3: 从最内维到最外维逐维 divmod + stride 累加 ---- + #pragma clang loop unroll(full) + for (int d = (int)OUT_DIM - 1; d >= 0; d--) { + off_t out_d = (off_t)out_shape[d]; + + // TREMS: coord = idx % out_d + // [当前编译器] 完全缺失!pto_tileop.hpp 无 TREMS API。 + // 退路: 用 TDIVS+TMULS+TSUB 三条拼出取余。 + TREMS(coordTile, idxTile, out_d, tmpTile); + + // TDIVS: idx = idx / out_d (推进到下一维) + // [当前编译器] API 有,jcore 为 __vec__ + TDIVS(idxTile, idxTile, out_d); + + // 该输出维是否对应输入维 + int i = d - (int)(OUT_DIM - IN_DIM); + if (i >= 0) { + if (in_shape[i] != 1) { // 非广播维才累加 + // TMULS: tmp = coord * stride + // [当前编译器] API 有,jcore 为 __vec__ + TMULS(tmpTile, coordTile, (off_t)stride); + // TADD: out += tmp + // [当前编译器] API 有,jcore 为 __vec__ + TADD(out, out, tmpTile); + } + stride *= in_shape[i]; // stride 更新 (广播维 ==1 不变) + } + } + + // 不再乘 sizeof(dtype): + // PTO ISA MGATHER 按"元素索引"取数 (dst[i,j] = src[idx[i,j]]), + // 而非旧 MGATHER 的字节偏移。 +} + + +// ---------------------------------------------------------------------------- +// broadcast: 接口与原 broadcast.hpp 一致 +// ---------------------------------------------------------------------------- +template +void broadcast( + dtype *in_ptr, + dtype *out_ptr, + const size_t *in_shape, + const size_t *out_shape + ) { + const size_t Mb = gOM / tM; + const size_t rmd_M = gOM % tM; + + using gm_shapeIn = global_tensor>; + using gm_shapeOut = global_tensor>; + using tile_shapeData = Tile; + using tile_shapeOffset = Tile; + using tile_shapeData_rmd = Tile; + using tile_shapeOffset_rmd= Tile; + + gm_shapeIn inGm(in_ptr); + tile_shapeData outTile; + tile_shapeOffset offsetTile; + tile_shapeData_rmd outTile_rmd; + tile_shapeOffset_rmd offsetTile_rmd; + size_t base = 0; + + using itOut = global_iterator; + itOut gOIter(out_ptr); + + size_t total_elements = tM; + for (int i 
= 0; i < Mb; ++i) { + auto gO = gOIter(0, i); + + // 计算偏移 tile (元素索引) + gen_offset_pto( + offsetTile, in_shape, out_shape, base, total_elements); + base += total_elements; + + // MGATHER: 按 offsetTile 中的元素索引从 inGm 取数 + // [当前编译器] template_asm.h 的 MGATHER 已用 asm volatile (一层), + // 但不支持 Coalesce::Elem 模板参数; + // 且旧实现按字节偏移取数,与 PTO ISA 的元素索引语义不同。 + MGATHER(outTile, inGm, offsetTile); + + // TSTORE: 将 outTile 写回 global memory + // [当前编译器] PTO ISA 名 TSTORE,当前编译器名 TCOPYOUT;jcore 为 __vec__ + TSTORE(gO, outTile); + } + if constexpr (rmd_M) { + auto gO = gOIter(0, Mb); + total_elements = rmd_M; + gen_offset_pto( + offsetTile_rmd, in_shape, out_shape, base, total_elements); + base += total_elements; + MGATHER(outTile_rmd, inGm, offsetTile_rmd); + TSTORE(gO, outTile_rmd); + } +} diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_simple.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_simple.hpp index c8ce29f..1e1b926 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_simple.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_simple.hpp @@ -151,7 +151,7 @@ void gen_offset_impl( -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_019.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_019.hpp index e58991b..e6332b6 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_vec_019.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_019.hpp @@ -1,133 +1,99 @@ +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_VEC_019_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_VEC_019_HPP + #include + +#include #include -#include +#include using namespace pto; -// ===================================================================== -// Broadcast (B,1,K) -> (B,N,K) via TCOPYIN + __vec__ broadcast + TCOPYOUT -// -// Optimized for: (1280,1,49) -> (1280,8,49), dtype=half -// -// Data layout (row-major): -// Input: [B][1][K] -> flat = B*K, batch b data at offset b*K -// Output: [B][N][K] -> flat = 
B*N*K, batch b data at offset b*N*K -// Broadcast along dim1: 1 -> N, dim0 & dim2 preserved. -// -// Processing strategy: -// Divide B batches into tiles of kTileBatch batches each. -// Per tile: -// 1. TCOPYIN (kTileBatch, K) from GlobalMem -> TileReg -// Reads kTileBatch * K contiguous elements. -// 2. __vec__ broadcast within TileReg: -// Launch <<>> threads: -// x = inner column index (0..K-1, direct use, no modulo) -// y = broadcast_copy * kTileBatch + batch_idx (0..N*kTileBatch-1) -// copy = y / kTileBatch (0..N-1, power-of-2 safe) -// batch_idx = y & (kTileBatch - 1) (0..kTileBatch-1, bitwise) -// Read src[batch_idx * RowStride + x] -// Write dst[batch_idx * RowStride + copy * K + x] -// 3. TCOPYOUT (kTileBatch, N*K) from TileReg -> GlobalMem -// -// TileReg layout: -// Physical tile cols = 512 (padded for 512B alignment). -// Constraint: kTileBatch must be power-of-2, N must be power-of-2. -// -// Note: kInner (K=49) is NOT power-of-2, so we avoid modulo on K by -// making x the direct column index (0..K-1). All division on -// y uses power-of-2 kTileBatch (bitwise ops only). -// -// Alignment notes (dtype=half): -// kTileBatch=16, Cols=512 -> 16*512*2 = 16384 = 512*2^5 (aligned) -// For (1280,1,49)->(1280,8,49) with kTileBatch=16: -// 1280/16 = 80 tiles, no remainder. -// -// Template params: -// dtype - data type (__half, float, etc.) -// MAX_DIM - max dimensions (kept for compat) -// IN_DIM - input dim count (kept for compat) -// OUT_DIM - output dim count (kept for compat) -// gIM - total input elements = B * K -// gOM - total output elements = B * N * K -// kTileBatch - batches per tile, power-of-2 -// kInner - inner dimension K (e.g. 
49, need not be power-of-2) -// ===================================================================== - -template -void __vec__ -vec_broadcast_3d(typename tile_shape_out::TileDType __out__ dst, - const typename tile_shape_in::TileDType __in__ src) { - size_t x = blkv_get_index_x(); - size_t y = blkv_get_index_y(); - - size_t batch_idx = y & (kTileBatch - 1); - size_t copy = y >> 4; - - size_t in_index = batch_idx * tile_shape_in::RowStride + x; - size_t out_index = batch_idx * tile_shape_out::RowStride + copy * kInnerCols + x; - - blkv_get_tile_ptr(dst)[out_index] = blkv_get_tile_ptr(src)[in_index]; +template +void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t *, const size_t *) { + constexpr size_t kBCast = gOM / gIM; + constexpr size_t kElementsPerTile = 512; + constexpr size_t kFullTiles = gOM / kElementsPerTile; + constexpr size_t kTail = gOM % kElementsPerTile; + + static_assert(MAX_DIM >= IN_DIM && MAX_DIM >= OUT_DIM, + "MAX_DIM must cover input and output ranks"); + static_assert(gOM % gIM == 0, + "broadcast output element count must be a multiple of input"); + static_assert(gIM % kInner == 0, + "input element count must be divisible by kInner"); + static_assert(kBCast > 0, "broadcast factor must be positive"); + static_assert(kTileBatch > 0, "kTileBatch must be positive"); + + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + using DataTile = Tile; + using OffsetTile = Tile; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputIterator output_iter(out_ptr); + + auto emit_tile = [&](auto &data_tile, auto &linear, auto &batch, + auto &batch_base, auto &inner_q, auto &inner_base, + auto &inner, auto &inner_bytes, auto &offset, + std::uint32_t base, auto output_addr) { + using LinearTile = std::remove_reference_t; + TCI(linear, base); + + TDIVS(batch, linear, static_cast(kBCast * kInner)); + TMULS(batch_base, batch, + static_cast(kInner * sizeof(dtype))); + + TDIVS(inner_q, linear, 
static_cast(kInner)); + TMULS(inner_base, inner_q, static_cast(kInner)); + TSUB(inner, linear, inner_base); + TMULS(inner_bytes, inner, static_cast(sizeof(dtype))); + + TADD(offset, batch_base, inner_bytes); + MGATHER(data_tile, input_global, offset); + TSTORE(output_addr, data_tile); + }; + + for (size_t tile = 0; tile < kFullTiles; ++tile) { + DataTile data_tile; + OffsetTile linear; + OffsetTile batch; + OffsetTile batch_base; + OffsetTile inner_q; + OffsetTile inner_base; + OffsetTile inner; + OffsetTile inner_bytes; + OffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, inner, + inner_bytes, offset, + static_cast(tile * kElementsPerTile), + output_iter(0, static_cast(tile))); + } + + if constexpr (kTail != 0) { + using TailDataTile = Tile; + using TailOffsetTile = + Tile; + TailDataTile data_tile; + TailOffsetTile linear; + TailOffsetTile batch; + TailOffsetTile batch_base; + TailOffsetTile inner_q; + TailOffsetTile inner_base; + TailOffsetTile inner; + TailOffsetTile inner_bytes; + TailOffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, inner, + inner_bytes, offset, + static_cast(kFullTiles * kElementsPerTile), + output_iter(0, static_cast(kFullTiles))); + } } -template -void broadcast(dtype *in_ptr, dtype *out_ptr, - const size_t * /*in_shape*/, const size_t * /*out_shape*/) { - constexpr size_t kBCast = gOM / gIM; - constexpr size_t kBatch = gIM / kInner; - constexpr size_t tileCols = 512; - - static_assert(gOM % gIM == 0, - "gOM must be divisible by gIM for (B,1,K)->(B,N,K) broadcast"); - static_assert(gIM % kInner == 0, - "gIM must be divisible by kInner (B = gIM/kInner must be integer)"); - static_assert((kTileBatch & (kTileBatch - 1)) == 0, - "kTileBatch must be power of 2 for 512B tile alignment"); - static_assert((kBCast & (kBCast - 1)) == 0, - "kBCast (N) must be power of 2 for bitwise division in SIMT"); - static_assert(tileCols >= kBCast * kInner, - "padded tileCols (512) 
must >= broadcast target width (N*K)"); - - using tile_in = Tile; - using tile_out = Tile; - using gm_in = global_tensor>; - using gm_out = global_tensor>; - - constexpr size_t Nb = kBatch / kTileBatch; - constexpr size_t rmd = kBatch % kTileBatch; - - tile_in inTile; - tile_out outTile; - - using tile_in_r = Tile; - using tile_out_r = Tile; - tile_in_r inTile_rmd; - tile_out_r outTile_rmd; - - for (size_t i = 0; i < Nb; i++) { - gm_in gsrc(in_ptr + i * kTileBatch * kInner); - TCOPYIN(inTile, gsrc); - - vec_broadcast_3d - <<>>(outTile.data(), inTile.data()); - - gm_out gdst(out_ptr + i * kTileBatch * kBCast * kInner); - TCOPYOUT(gdst, outTile); - } - - if constexpr (rmd > 0) { - gm_in gsrc(in_ptr + Nb * kTileBatch * kInner); - TCOPYIN(inTile_rmd, gsrc); - - vec_broadcast_3d - <<>>(outTile_rmd.data(), inTile_rmd.data()); - - gm_out gdst(out_ptr + Nb * kTileBatch * kBCast * kInner); - TCOPYOUT(gdst, outTile_rmd); - } -} \ No newline at end of file +#endif diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_019_pto.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_019_pto.hpp new file mode 100644 index 0000000..d1aa674 --- /dev/null +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_019_pto.hpp @@ -0,0 +1,154 @@ +// ============================================================================ +// Broadcast (B,1,K) -> (B,N,K) — PTO 一层编程模型 +// +// 原始 broadcast_vec_019.hpp 策略: +// TCOPYIN (kTileBatch,K) -> __vec__ 行复制 (K 个元素复制 N 次) -> TCOPYOUT (kTileBatch,N*K) +// __vec__ 块: dst[batch*RowStride + copy*K + x] = src[batch*RowStride + x] +// +// PTO 一层策略: +// TLOAD (kTileBatch,K) -> TINSERT × N (将输入 tile 插入输出 tile 的 N 个列偏移) -> TSTORE (kTileBatch,N*K) +// TINSERT 语义: dst[indexRow+i, indexCol+j] = src[i,j] +// 对 copy c = 0..N-1: TINSERT(outTile, inTile, 0, c*K) +// N 次插入互不重叠, 合起来恰好填满输出 tile 的 valid 区域 +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ 当前编译器不支持 / 不完整的指令汇总 │ +// 
├──────────┬──────────────────┬──────────────────────────────────────────┤ +// │ Pto ISA │ 当前编译器状态 │ 说明 │ +// │ 指令 │ │ │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TLOAD │ API 有(名不同), │ PTO ISA 名 TLOAD; │ +// │ │ 二层实现 │ 当前编译器名 TCOPYIN; │ +// │ │ │ jcore/TCopyIn.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TINSERT │ v0.57 supported │ TileOP API lowers to BSTART.FIXP │ +// │ │ │ with FIXP.Function.TINSERT │ +// │ │ │ (仅有反向操作 TEXTRACT) │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE; │ +// │ │ 二层实现 │ 当前编译器名 TCOPYOUT; │ +// │ │ │ jcore/TCopyOut.hpp 用 __vec__ 实现 │ +// └──────────┴──────────────────┴──────────────────────────────────────────┘ +// +// PTO ISA 文档签名 (Declared in include/pto/pto_instr.hpp): +// +// TLOAD: +// template +// PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, +// WaitEvents &... events); +// +// TINSERT: +// template +// PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, +// uint16_t indexRow, uint16_t indexCol, +// WaitEvents &... events); +// 语义: dst[indexRow+i, indexCol+j] = src[i,j] +// for 0 <= i < src.ValidRow, 0 <= j < src.ValidCol +// +// TSTORE: +// template +// PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, +// WaitEvents &... events); +// ============================================================================ + +#include +#include +// #include // [!] PTO ISA C++ Intrinsic — 当前编译器未提供 + +#include +#include + +// ===================================================================== +// Broadcast (B,1,K) -> (B,N,K) via TLOAD + TINSERT×N + TSTORE +// +// Data layout (row-major): +// Input: [B][1][K] -> flat = B*K, batch b data at offset b*K +// Output: [B][N][K] -> flat = B*N*K, batch b data at offset b*N*K +// Broadcast along dim1: 1 -> N, dim0 & dim2 preserved. 
+// +// Template params (与原 broadcast_vec_019 一致): +// dtype - data type (__half, float, etc.) +// MAX_DIM - max dimensions (kept for compat) +// IN_DIM - input dim count (kept for compat) +// OUT_DIM - output dim count (kept for compat) +// gIM - total input elements = B * K +// gOM - total output elements = B * N * K +// kTileBatch - batches per tile, power-of-2 +// kInner - inner dimension K (e.g. 49, need not be power-of-2) +// ===================================================================== + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, + const size_t * /*in_shape*/, const size_t * /*out_shape*/) { + constexpr size_t kBCast = gOM / gIM; + constexpr size_t kBatch = gIM / kInner; + constexpr size_t tileCols = 512; + + static_assert(gOM % gIM == 0, + "gOM must be divisible by gIM for (B,1,K)->(B,N,K) broadcast"); + static_assert(gIM % kInner == 0, + "gIM must be divisible by kInner (B = gIM/kInner must be integer)"); + static_assert((kTileBatch & (kTileBatch - 1)) == 0, + "kTileBatch must be power of 2 for 512B tile alignment"); + static_assert((kBCast & (kBCast - 1)) == 0, + "kBCast (N) must be power of 2 for bitwise division in SIMT"); + static_assert(tileCols >= kBCast * kInner, + "padded tileCols (512) must >= broadcast target width (N*K)"); + + using tile_in = Tile; + using tile_out = Tile; + using gm_in = global_tensor>; + using gm_out = global_tensor>; + + constexpr size_t Nb = kBatch / kTileBatch; + constexpr size_t rmd = kBatch % kTileBatch; + + tile_in inTile; + tile_out outTile; + + for (size_t i = 0; i < Nb; i++) { + gm_in gsrc(in_ptr + i * kTileBatch * kInner); + gm_out gdst(out_ptr + i * kTileBatch * kBCast * kInner); + + // TLOAD: GM -> UB, 加载 (kTileBatch, kInner) 输入 tile + // [当前编译器] 名为 TCOPYIN, jcore 为 __vec__ + TLOAD(inTile, gsrc); + + // TINSERT × kBCast: 将输入 tile 插入输出 tile 的 N 个列偏移 + // 每次 TINSERT 写入 kInner 列, N 次互不重叠, 合起来填满 N*kInner 列 + // v0.57 lowers TINSERT through the FIXP tile block. 
+ #pragma clang loop unroll(full) + for (size_t c = 0; c < kBCast; c++) { + TINSERT(outTile, inTile, /*indexRow=*/0, /*indexCol=*/(uint16_t)(c * kInner)); + } + + // TSTORE: UB -> GM, 写回 (kTileBatch, kBCast*kInner) 输出 tile + // [当前编译器] 名为 TCOPYOUT, jcore 为 __vec__ + TSTORE(gdst, outTile); + } + + using tile_in_r = Tile; + using tile_out_r = Tile; + tile_in_r inTile_rmd; + tile_out_r outTile_rmd; + if constexpr (rmd > 0) { + gm_in gsrc(in_ptr + Nb * kTileBatch * kInner); + gm_out gdst(out_ptr + Nb * kTileBatch * kBCast * kInner); + + TLOAD(inTile_rmd, gsrc); + + #pragma clang loop unroll(full) + for (size_t c = 0; c < kBCast; c++) { + TINSERT(outTile_rmd, inTile_rmd, 0, (uint16_t)(c * kInner)); + } + + TSTORE(gdst, outTile_rmd); + } +} diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_039.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_039.hpp index 72cccbd..a918659 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_vec_039.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_039.hpp @@ -1,135 +1,6 @@ -#include -#include -#include +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_VEC_039_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_VEC_039_HPP -using namespace pto; +#include "broadcast_vec_019.hpp" -// ===================================================================== -// Broadcast (B,1,K) -> (B,N,K) via TCOPYIN + __vec__ broadcast + TCOPYOUT -// -// Optimized for: (8192,1,16) -> (8192,8,16), dtype=half -// -// Data layout (row-major): -// Input: [B][1][K] -> flat = B*K, batch b data at offset b*K -// Output: [B][N][K] -> flat = B*N*K, batch b data at offset b*N*K -// Broadcast along dim1: 1 -> N, dim0 & dim2 preserved. -// -// Processing strategy: -// Divide B batches into tiles of kTileBatch batches each. -// Per tile: -// 1. TCOPYIN (kTileBatch, K) from GlobalMem -> TileReg -// Reads kTileBatch * K contiguous elements. -// 2. __vec__ broadcast within TileReg: -// For each batch, replicate its K elements N times (row-wise). 
-// Launch <<>> threads: -// x = output position within one batch (0..N*K-1) -// y = batch index within tile (0..kTileBatch-1) -// copy = x / K (broadcast copy 0..N-1) -// col = x % K (inner column 0..K-1) -// Read src[y * RowStride + col] -// Write dst[y * RowStride + x] -// 3. TCOPYOUT (kTileBatch, N*K) from TileReg -> GlobalMem -// -// TileReg layout: -// Physical tile cols = 256 (padded for 512B alignment). -// Tile bytes = kTileBatch * 256 * sizeof(dtype) = 512 * kTileBatch -// Constraint: kTileBatch must be power-of-2. -// -// Alignment notes: -// K and N must be powers of 2 for bit-op index decomposition. -// For 512B-aligned global tile strides (dtype=half): -// Input stride = kTileBatch * K * sizeof(half) = kTileBatch * K * 2 -// 512-aligned when kTileBatch * K * 2 >= 512 and is 512 * 2^n -// e.g. kTileBatch=16, K=16 -> 512B per tile (512-aligned) -// Output stride = kTileBatch * N * K * sizeof(half) -// e.g. kTileBatch=16, N=8, K=16 -> 4096B per tile (512-aligned) -// For (8192,1,16)->(8192,8,16) with kTileBatch=16: -// 8192/16 = 512 tiles, no remainder, all aligned. -// -// Template params: -// dtype - data type (__half, float, etc.) -// MAX_DIM - max dimensions (kept for compat) -// IN_DIM - input dim count (kept for compat) -// OUT_DIM - output dim count (kept for compat) -// gIM - total input elements = B * K (e.g. 8192*16 = 131072) -// gOM - total output elements = B * N * K (e.g. 8192*8*16 = 1048576) -// kTileBatch - batches per tile, power-of-2 (e.g. 1,2,4,8,16,32,64) -// kInner - inner dimension K, power-of-2 (e.g. 
16) -// ===================================================================== - -template -void __vec__ -vec_broadcast_3d(typename tile_shape_out::TileDType __out__ dst, - const typename tile_shape_in::TileDType __in__ src) { - size_t x = blkv_get_index_x(); - size_t y = blkv_get_index_y(); - - size_t in_index = y * tile_shape_in::RowStride + (x & (kInnerCols - 1)); - size_t out_index = y * tile_shape_out::RowStride + x; - - blkv_get_tile_ptr(dst)[out_index] = blkv_get_tile_ptr(src)[in_index]; -} - -template -void broadcast(dtype *in_ptr, dtype *out_ptr, - const size_t * /*in_shape*/, const size_t * /*out_shape*/) { - constexpr size_t kBCast = gOM / gIM; // 8 - constexpr size_t kBCast_in = 64 / kInner; // 2 - constexpr size_t kBCast_out = kBCast / kBCast_in; // 4 - constexpr size_t kBatch = gIM / kInner; - constexpr size_t tileCols = 256; - - static_assert(gOM % gIM == 0, - "gOM must be divisible by gIM for (B,1,K)->(B,N,K) broadcast"); - static_assert(gIM % kInner == 0, - "gIM must be divisible by kInner (B = gIM/kInner must be integer)"); - static_assert((kInner & (kInner - 1)) == 0, - "kInner must be power of 2 for bit-op index decomposition"); - static_assert((kTileBatch & (kTileBatch - 1)) == 0, - "kTileBatch must be power of 2 for 512B tile alignment"); - static_assert(tileCols >= kBCast * kInner, - "padded tileCols (256) must >= broadcast target width (N*K)"); - - using tile_in = Tile; - using tile_out = Tile; - using gm_in = global_tensor>; - using gm_out = global_tensor>; - - constexpr size_t Nb = kBatch / kTileBatch; - constexpr size_t rmd = kBatch % kTileBatch; - - tile_in inTile; - tile_out outTile; - - using tile_in_r = Tile; - using tile_out_r = Tile; - tile_in_r inTile_rmd; - tile_out_r outTile_rmd; - - for (size_t i = 0; i < Nb; i++) { - gm_in gsrc(in_ptr + i * kTileBatch * kInner); - TCOPYIN(inTile, gsrc); - - vec_broadcast_3d - <<>>(outTile.data(), inTile.data()); - - gm_out gdst(out_ptr + i * kTileBatch * kBCast * kInner); - TCOPYOUT(gdst, 
outTile); - } - - if constexpr (rmd > 0) { - gm_in gsrc(in_ptr + Nb * kTileBatch * kInner); - TCOPYIN(inTile_rmd, gsrc); - - vec_broadcast_3d - <<>>(outTile_rmd.data(), inTile_rmd.data()); - - gm_out gdst(out_ptr + Nb * kTileBatch * kBCast * kInner); - TCOPYOUT(gdst, outTile_rmd); - } -} \ No newline at end of file +#endif diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_039_pto.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_039_pto.hpp new file mode 100644 index 0000000..38894d8 --- /dev/null +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_039_pto.hpp @@ -0,0 +1,155 @@ +// ============================================================================ +// Broadcast (B,1,K) -> (B,N,K) — PTO 一层编程模型 +// +// 原始 broadcast_vec_039.hpp 策略: +// TCOPYIN (kTileBatch,K) -> __vec__ 行复制 (K 个元素复制 N 次) -> TCOPYOUT (kTileBatch,N*K) +// __vec__ 块: dst[y*RowStride + x] = src[y*RowStride + (x & (K-1))] +// (K 为 2 的幂, 用位与代替取余) +// +// PTO 一层策略: +// TLOAD (kTileBatch,K) -> TINSERT × N (将输入 tile 插入输出 tile 的 N 个列偏移) -> TSTORE (kTileBatch,N*K) +// TINSERT 语义: dst[indexRow+i, indexCol+j] = src[i,j] +// 对 copy c = 0..N-1: TINSERT(outTile, inTile, 0, c*K) +// N 次插入互不重叠, 合起来恰好填满输出 tile 的 valid 区域 +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ 当前编译器不支持 / 不完整的指令汇总 │ +// ├──────────┬──────────────────┬──────────────────────────────────────────┤ +// │ Pto ISA │ 当前编译器状态 │ 说明 │ +// │ 指令 │ │ │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TLOAD │ API 有(名不同), │ PTO ISA 名 TLOAD; │ +// │ │ 二层实现 │ 当前编译器名 TCOPYIN; │ +// │ │ │ jcore/TCopyIn.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TINSERT │ v0.57 supported │ TileOP API lowers to BSTART.FIXP │ +// │ │ │ with FIXP.Function.TINSERT │ +// │ │ │ (仅有反向操作 TEXTRACT) │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE; 
│ +// │ │ 二层实现 │ 当前编译器名 TCOPYOUT; │ +// │ │ │ jcore/TCopyOut.hpp 用 __vec__ 实现 │ +// └──────────┴──────────────────┴──────────────────────────────────────────┘ +// +// PTO ISA 文档签名 (Declared in include/pto/pto_instr.hpp): +// +// TLOAD: +// template +// PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, +// WaitEvents &... events); +// +// TINSERT: +// template +// PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, +// uint16_t indexRow, uint16_t indexCol, +// WaitEvents &... events); +// 语义: dst[indexRow+i, indexCol+j] = src[i,j] +// for 0 <= i < src.ValidRow, 0 <= j < src.ValidCol +// +// TSTORE: +// template +// PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, +// WaitEvents &... events); +// ============================================================================ + +#include +#include +// #include // [!] PTO ISA C++ Intrinsic — 当前编译器未提供 + +#include +#include + +// ===================================================================== +// Broadcast (B,1,K) -> (B,N,K) via TLOAD + TINSERT×N + TSTORE +// +// Data layout (row-major): +// Input: [B][1][K] -> flat = B*K, batch b data at offset b*K +// Output: [B][N][K] -> flat = B*N*K, batch b data at offset b*N*K +// Broadcast along dim1: 1 -> N, dim0 & dim2 preserved. +// +// Template params (与原 broadcast_vec_039 一致): +// dtype - data type (__half, float, etc.) +// MAX_DIM - max dimensions (kept for compat) +// IN_DIM - input dim count (kept for compat) +// OUT_DIM - output dim count (kept for compat) +// gIM - total input elements = B * K (e.g. 8192*16 = 131072) +// gOM - total output elements = B * N * K (e.g. 8192*8*16 = 1048576) +// kTileBatch - batches per tile, power-of-2 (e.g. 1,2,4,8,16,32,64) +// kInner - inner dimension K, power-of-2 (e.g. 
16) +// ===================================================================== + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, + const size_t * /*in_shape*/, const size_t * /*out_shape*/) { + constexpr size_t kBCast = gOM / gIM; + constexpr size_t kBatch = gIM / kInner; + constexpr size_t tileCols = 256; + + static_assert(gOM % gIM == 0, + "gOM must be divisible by gIM for (B,1,K)->(B,N,K) broadcast"); + static_assert(gIM % kInner == 0, + "gIM must be divisible by kInner (B = gIM/kInner must be integer)"); + static_assert((kInner & (kInner - 1)) == 0, + "kInner must be power of 2 for bit-op index decomposition"); + static_assert((kTileBatch & (kTileBatch - 1)) == 0, + "kTileBatch must be power of 2 for 512B tile alignment"); + static_assert(tileCols >= kBCast * kInner, + "padded tileCols (256) must >= broadcast target width (N*K)"); + + using tile_in = Tile; + using tile_out = Tile; + using gm_in = global_tensor>; + using gm_out = global_tensor>; + + constexpr size_t Nb = kBatch / kTileBatch; + constexpr size_t rmd = kBatch % kTileBatch; + + tile_in inTile; + tile_out outTile; + + for (size_t i = 0; i < Nb; i++) { + gm_in gsrc(in_ptr + i * kTileBatch * kInner); + gm_out gdst(out_ptr + i * kTileBatch * kBCast * kInner); + + // TLOAD: GM -> UB, 加载 (kTileBatch, kInner) 输入 tile + // [当前编译器] 名为 TCOPYIN, jcore 为 __vec__ + TLOAD(inTile, gsrc); + + // TINSERT × kBCast: 将输入 tile 插入输出 tile 的 N 个列偏移 + // 每次 TINSERT 写入 kInner 列, N 次互不重叠, 合起来填满 N*kInner 列 + // v0.57 lowers TINSERT through the FIXP tile block. 
+ #pragma clang loop unroll(full) + for (size_t c = 0; c < kBCast; c++) { + TINSERT(outTile, inTile, /*indexRow=*/0, /*indexCol=*/(uint16_t)(c * kInner)); + } + + // TSTORE: UB -> GM, 写回 (kTileBatch, kBCast*kInner) 输出 tile + // [当前编译器] 名为 TCOPYOUT, jcore 为 __vec__ + TSTORE(gdst, outTile); + } + + using tile_in_r = Tile; + using tile_out_r = Tile; + tile_in_r inTile_rmd; + tile_out_r outTile_rmd; + if constexpr (rmd > 0) { + gm_in gsrc(in_ptr + Nb * kTileBatch * kInner); + gm_out gdst(out_ptr + Nb * kTileBatch * kBCast * kInner); + + TLOAD(inTile_rmd, gsrc); + + #pragma clang loop unroll(full) + for (size_t c = 0; c < kBCast; c++) { + TINSERT(outTile_rmd, inTile_rmd, 0, (uint16_t)(c * kInner)); + } + + TSTORE(gdst, outTile_rmd); + } +} diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_07.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_07.hpp index 5934187..e4b8cf6 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_vec_07.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_07.hpp @@ -1,105 +1,69 @@ -#include -#include -#include - -using namespace pto; - -// ===================================================================== -// Broadcast (N,1) -> (N,C) via TCOPYIN + __vec__ broadcast + TCOPYOUT -// -// Optimized for: (1443,1) -> (1443,129), dtype=half -// -// Processing strategy: -// Divide N rows into tiles of kTileRows rows each. -// Per tile: -// 1. TCOPYIN (kTileRows, 1) from GlobalMem -> TileReg -// 2. __vec__ broadcast (kTileRows, 1) -> (kTileRows, C) within TileReg -// Launch <<>> threads: -// x = column index (0..kC-1), y = row index (0..kTileRows-1) -// Each thread reads src[j*src_RowStride] (col 0 of row j) -// and writes to dst[i + j*dst_RowStride] (col i of row j) -// 3. TCOPYOUT (kTileRows, C) from TileReg -> GlobalMem -// -// TileReg layout: -// Physical tile cols padded to 256 for 512B alignment. 
-// Tile bytes = kTileRows * 256 * sizeof(dtype) = 512 * kTileRows -// Constraint: kTileRows must be power-of-2 (512 * 2^n bytes). -// -// Template params (backward compatible with original broadcast_07): -// dtype - data type (__half, float, etc.) -// MAX_DIM - max dimensions (unused, kept for compat) -// IN_DIM - input dim count (unused) -// OUT_DIM - output dim count (unused) -// gIM - total input elements = N*1 = N (e.g. 1443) -// gOM - total output elements = N*C (e.g. 1443*129) -// kTileRows - rows per tile, must be power-of-2 (e.g. 1,2,4,..,64) -// ===================================================================== - -template -void __vec__ -vec_broadcast_rowmajor(typename tile_shape_out::TileDType __out__ dst, - const typename tile_shape_in::TileDType __in__ src) { - size_t i = blkv_get_index_x(); - size_t j = blkv_get_index_y(); - size_t out_index = i + j * tile_shape_out::RowStride; - blkv_get_tile_ptr(dst)[out_index] = - blkv_get_tile_ptr(src)[j * tile_shape_in::RowStride]; -} - -template -void broadcast(dtype *in_ptr, dtype *out_ptr, - const size_t * /*in_shape*/, const size_t * /*out_shape*/) { - constexpr size_t kN = gIM; - constexpr size_t kC = gOM / gIM; - constexpr size_t tileCols = 256; +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_VEC_07_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_VEC_07_HPP - static_assert(gOM % gIM == 0, - "gOM must be divisible by gIM for (N,1)->(N,C) broadcast"); - static_assert(tileCols >= kC, - "padded tileCols (256) must >= broadcast target columns"); - static_assert((kTileRows & (kTileRows - 1)) == 0, - "kTileRows must be power of 2 for 512B tile alignment"); +#include - using tile_in = Tile; - using tile_out = Tile; - using gm_in = global_tensor>; - using gm_out = global_tensor>; +#include - constexpr size_t Nb = kN / kTileRows; - constexpr size_t rmd = kN % kTileRows; +using namespace pto; - tile_in inTile; - tile_out outTile; +template +void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t *, const size_t *) { + 
constexpr size_t kN = gIM; + constexpr size_t kC = gOM / gIM; + constexpr size_t kTileCols = kC; + constexpr size_t kMaxRowsByBytes = + (4096 / (kTileCols * sizeof(dtype))) == 0 + ? 1 + : (4096 / (kTileCols * sizeof(dtype))); + constexpr size_t kRowsPerTile = + kTileRows < kMaxRowsByBytes ? kTileRows : kMaxRowsByBytes; + static_assert(MAX_DIM >= IN_DIM && MAX_DIM >= OUT_DIM, + "MAX_DIM must cover input and output ranks"); + static_assert(gOM % gIM == 0, + "broadcast output element count must be a multiple of input"); + static_assert(kRowsPerTile > 0, "broadcast tile must contain at least one row"); - for (size_t i = 0; i < Nb; i++) { - gm_in gsrc(in_ptr + i * kTileRows); - TCOPYIN(inTile, gsrc); + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + using InputTile = Tile; + using OutputTile = Tile; + using InputIterator = global_iterator; + using OutputIterator = global_iterator; - vec_broadcast_rowmajor - <<>>(outTile.data(), inTile.data()); + InputIterator input_iter(in_ptr); + OutputIterator output_iter(out_ptr); - gm_out gdst(out_ptr + i * kTileRows * kC); - TCOPYOUT(gdst, outTile); - } + constexpr size_t kFullTiles = kN / kRowsPerTile; + constexpr size_t kTail = kN % kRowsPerTile; - using tile_in_r = Tile; - using tile_out_r = Tile; - tile_in_r inTile_rmd; - tile_out_r outTile_rmd; - if constexpr (rmd > 0) { - gm_in gsrc(in_ptr + Nb * kTileRows); - TCOPYIN(inTile_rmd, gsrc); + for (size_t tile = 0; tile < kFullTiles; ++tile) { + InputTile input_tile; + OutputTile output_tile; + TLOAD(input_tile, input_iter(static_cast(tile), 0)); + TROWEXPAND(output_tile, input_tile); + TSTORE(output_iter(static_cast(tile), 0), output_tile); + } - vec_broadcast_rowmajor - <<>>(outTile_rmd.data(), inTile_rmd.data()); + if constexpr (kTail != 0) { + using TailInputTile = Tile; + using TailOutputTile = Tile; + using TailInputIterator = global_iterator; + using TailOutputIterator = global_iterator; + TailInputIterator tail_input_iter(in_ptr); + 
TailOutputIterator tail_output_iter(out_ptr); + TailInputTile input_tile; + TailOutputTile output_tile; + TLOAD(input_tile, tail_input_iter(static_cast(kFullTiles), 0)); + TROWEXPAND(output_tile, input_tile); + TSTORE(tail_output_iter(static_cast(kFullTiles), 0), output_tile); + } +} - gm_out gdst(out_ptr + Nb * kTileRows * kC); - TCOPYOUT(gdst, outTile_rmd); - } -} \ No newline at end of file +#endif // SUPERNPUBENCH_PTOISA_BROADCAST_VEC_07_HPP diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_handwrite.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_handwrite.hpp index 88cccf2..c90e364 100644 --- a/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_handwrite.hpp +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_handwrite.hpp @@ -39,7 +39,7 @@ void __vec__ vec_broadcast( // (N,1)->(N,129) -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_pto.hpp b/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_pto.hpp new file mode 100644 index 0000000..550a9f4 --- /dev/null +++ b/benchmark-linxisa/kernels/broadcast/broadcast_vec_07_pto.hpp @@ -0,0 +1,132 @@ +// ============================================================================ +// Broadcast (N,1) -> (N,C) — PTO 一层编程模型 +// +// 原始 broadcast_vec_07.hpp 策略: +// TCOPYIN (kTileRows,1) -> __vec__ 行广播 -> TCOPYOUT (kTileRows,C) +// __vec__ 块: dst[x + y*RowStride] = src[y*RowStride] (col 0 -> all cols) +// +// PTO 一层策略: +// TLOAD (kTileRows,1) -> TROWEXPAND (col 0 广播到全部 C 列) -> TSTORE (kTileRows,C) +// TROWEXPAND 语义: dst[i,j] = src[i,0], 恰好是行广播 +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ 当前编译器不支持 / 不完整的指令汇总 │ +// ├──────────┬──────────────────┬──────────────────────────────────────────┤ +// │ Pto ISA │ 当前编译器状态 │ 说明 │ +// │ 指令 │ │ │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TLOAD │ API 有(名不同), │ PTO ISA 名 TLOAD; │ +// 
│ │ 二层实现 │ 当前编译器名 TCOPYIN; │ +// │ │ │ jcore/TCopyIn.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │TROWEXPAND│ API 有(名不同), │ PTO ISA 名 TROWEXPAND; │ +// │ │ 二层实现 │ 当前编译器名 TEXPANDROW; │ +// │ │ │ jcore/TExpandRow.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE; │ +// │ │ 二层实现 │ 当前编译器名 TCOPYOUT; │ +// │ │ │ jcore/TCopyOut.hpp 用 __vec__ 实现 │ +// └──────────┴──────────────────┴──────────────────────────────────────────┘ +// +// PTO ISA 文档签名 (Declared in include/pto/pto_instr.hpp): +// +// TLOAD: +// template +// PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, +// WaitEvents &... events); +// +// TROWEXPAND: +// template +// PTO_INST RecordEvent TROWEXPAND(TileDataDst &dst, TileDataSrc &src, +// WaitEvents &... events); +// 约束: dst[i,j] = src[i,0]; dst 与 src 必须为 Vec tile, ND layout; +// A2A3: srcValidRow == dstValidRow; srcValidCol >= 1 +// +// TSTORE: +// template +// PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, +// WaitEvents &... events); +// ============================================================================ + +#include +#include +// #include // [!] PTO ISA C++ Intrinsic — 当前编译器未提供 + +#include +#include + +// ===================================================================== +// Broadcast (N,1) -> (N,C) via TLOAD + TROWEXPAND + TSTORE +// +// Template params (与原 broadcast_vec_07 一致): +// dtype - data type (__half, float, etc.) +// MAX_DIM - max dimensions (unused, kept for compat) +// IN_DIM - input dim count (unused) +// OUT_DIM - output dim count (unused) +// gIM - total input elements = N*1 = N (e.g. 1443) +// gOM - total output elements = N*C (e.g. 1443*129) +// kTileRows - rows per tile, must be power-of-2 (e.g. 
1,2,4,..,64) +// ===================================================================== + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, + const size_t * /*in_shape*/, const size_t * /*out_shape*/) { + constexpr size_t kN = gIM; + constexpr size_t kC = gOM / gIM; + constexpr size_t tileCols = 256; + + static_assert(gOM % gIM == 0, + "gOM must be divisible by gIM for (N,1)->(N,C) broadcast"); + static_assert(tileCols >= kC, + "padded tileCols (256) must >= broadcast target columns"); + static_assert((kTileRows & (kTileRows - 1)) == 0, + "kTileRows must be power of 2 for 512B tile alignment"); + + using tile_in = Tile; + using tile_out = Tile; + using gm_in = global_tensor>; + using gm_out = global_tensor>; + + constexpr size_t Nb = kN / kTileRows; + constexpr size_t rmd = kN % kTileRows; + + tile_in inTile; + tile_out outTile; + + for (size_t i = 0; i < Nb; i++) { + gm_in gsrc(in_ptr + i * kTileRows); + gm_out gdst(out_ptr + i * kTileRows * kC); + + // TLOAD: GM -> UB, 加载 (kTileRows, 1) 输入 tile + // [当前编译器] 名为 TCOPYIN, jcore 为 __vec__ + TLOAD(inTile, gsrc); + + // TROWEXPAND: 将每行 col 0 广播到全部 kC 列 -> (kTileRows, kC) + // [当前编译器] 名为 TEXPANDROW, jcore 为 __vec__ + TROWEXPAND(outTile, inTile); + + // TSTORE: UB -> GM, 写回 (kTileRows, kC) 输出 tile + // [当前编译器] 名为 TCOPYOUT, jcore 为 __vec__ + TSTORE(gdst, outTile); + } + + using tile_in_r = Tile; + using tile_out_r = Tile; + tile_in_r inTile_rmd; + tile_out_r outTile_rmd; + if constexpr (rmd > 0) { + gm_in gsrc(in_ptr + Nb * kTileRows); + gm_out gdst(out_ptr + Nb * kTileRows * kC); + + TLOAD(inTile_rmd, gsrc); + TROWEXPAND(outTile_rmd, inTile_rmd); + TSTORE(gdst, outTile_rmd); + } +} diff --git a/benchmark-linxisa/kernels/concat/concat_gather.hpp b/benchmark-linxisa/kernels/concat/concat_gather.hpp index 85a69fd..94c738a 100644 --- a/benchmark-linxisa/kernels/concat/concat_gather.hpp +++ b/benchmark-linxisa/kernels/concat/concat_gather.hpp @@ -1,199 +1,189 @@ -#ifndef TRANSPOSE_KERNEL_HPP -#define 
TRANSPOSE_KERNEL_HPP +#ifndef CONCAT_GATHER_KERNEL_HPP +#define CONCAT_GATHER_KERNEL_HPP #include -#include "template_asm.h" - -using namespace pto; - -#pragma once #include #include -/* -#define DUMP_TILE(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%12lld ", (long long)DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) -*/ - -// ============================================== -// 维度规则:交换transpose_dim0和transpose_dim1 -// ============================================== -template -void __vec__ gen_offset_concat( - typename tile_shape::TileDType __out__ out, - typename tile_Inshape::TileDType __in__ in_shape, - typename tile_Outshape::TileDType __in__ out_shape, -// const size_t in_dim, - const size_t base, - const size_t total_elements -) { - size_t index = blkv_get_index_x(); - size_t idx = blkv_get_index_x(); - - __vbuf__ typename tile_Inshape::DType *in_shape_ptr = blkv_get_tile_ptr(in_shape); - __vbuf__ typename tile_Outshape::DType *out_shape_ptr = blkv_get_tile_ptr(out_shape); - - if (index >= total_elements) return; - idx = idx + base; // todo idx是个向量,base是个标量,获得所有的基地址或者说基offset - - //转置维度交换stride。 -// uint16_t stride[IN_DIM]; - uint32_t stride[DATA_DIM]; - stride[DATA_DIM-1] = 1; - - #pragma clang loop unroll(full) - for(int i = DATA_DIM-2 ; i >= 0; --i) - { - stride[i] = stride[i+1] * in_shape_ptr[i+1]; - } - - - // 输出一维索引 → 输出坐标 - size_t out_coord[MAX_DIM] = {0}; // - size_t tmp = idx; // - - #pragma clang loop unroll(full) - for (int d = DATA_DIM - 1; d >= 0; d--) { - out_coord[d] = tmp % out_shape_ptr[d]; - tmp /= out_shape_ptr[d]; - } - - size_t n = out_coord[CONCAT_DIM] / in_shape_ptr[CONCAT_DIM]; - size_t offset = out_coord[CONCAT_DIM] % 
in_shape_ptr[CONCAT_DIM]; - - out_coord[CONCAT_DIM] = offset; - -/* - // 输出坐标 → 输入坐标 - size_t in_coord[MAX_DIM] = {0}; - for (size_t i = 0; i < in_dim; i++) { - size_t o = out_dim - in_dim + i; // 从后面对齐 - if (in_shape[i] == 1) { - in_coord[i] = 0; - } else { - in_coord[i] = out_coord[o]; - } - } -*/ -// uint16_t in_offset = 0; - uint32_t in_offset = 0; - - #pragma clang loop unroll(full) - for (int i = 0; i < DATA_DIM; i++) { - in_offset += out_coord[i] * stride[i] * sizeof(dtype); - } - in_offset += n * (stride[0]*in_shape_ptr[0]) * sizeof(dtype); - - // 赋值 - blkv_get_tile_ptr(out)[index] = in_offset; -} - -template -void gen_offset_Impl( - tile_shapeOffset &out, - tile_Inshape &in_shape, - tile_Outshape &out_shape, -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0, - const size_t base, - const size_t total_elements - ) -{ - static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, - "Only static shape supported"); - gen_offset_concat<<>>(out.data(), in_shape.data(), out_shape.data(), base, total_elements); // todo 这部分的tile shape是怎么传递的? 
-} - - +using namespace pto; -template +/** + * @brief 通用 N 维 concat gather(Tile ISA 实现) + * + * 算法:对输出张量的每个元素,计算其多维坐标,确定来自哪个输入张量(n) + * 以及在输入张量内的偏移,用 MGATHER 从输入中 gather 数据。 + * + * @tparam DType 数据类型 + * @tparam MAX_DIM 最大维度数 + * @tparam gIM 输入总元素数 + * @tparam gOM 输出总元素数 + * @tparam tM 每个 tile 处理的元素数 + * @tparam DATA_DIM 数据维度数(编译期常量) + * @tparam CONCAT_DIM 拼接维度索引(编译期常量) + */ +template void concat_gather( - dtype *in_ptr, - dtype *out_ptr, + DType *in_ptr, + DType *out_ptr, size_t *in_shape, size_t *out_shape -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0 -) +) { + constexpr int kFullTiles = gOM / tM; + constexpr int kTailElements = gOM % tM; + + using InputGlobal = pto::global_tensor>; + using OutputGlobal = pto::global_tensor>; + using DataTile = pto::Tile; + using OffsetTile = pto::Tile; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputIterator output_iter(out_ptr); + + uint32_t output_base = 0; + for (int tile_index = 0; tile_index < kFullTiles; ++tile_index) { + auto output_global = output_iter(0, tile_index); + DataTile output_tile; + OffsetTile offset_tile; + + OffsetTile linear_index; + OffsetTile quotient; + OffsetTile cycle; + OffsetTile cycle_base; + OffsetTile coordinate; + OffsetTile contribution; + + TCI(linear_index, output_base); + TEXPANDS(offset_tile, static_cast(0)); + + // 计算输入 stride(标量核心,运行时计算) + uint32_t input_stride[DATA_DIM]; + input_stride[DATA_DIM - 1] = 1; + for (int i = static_cast(DATA_DIM) - 2; i >= 0; --i) { + input_stride[i] = input_stride[i + 1] * static_cast(in_shape[i + 1]); + } + uint32_t input_size = input_stride[0] * static_cast(in_shape[0]); + + // 对每个维度提取坐标并计算 byte offset + for (int d = static_cast(DATA_DIM) - 1; d >= 0; --d) { + uint32_t output_stride = 1; + for (int dd = d + 1; dd < static_cast(DATA_DIM); ++dd) { + output_stride *= static_cast(out_shape[dd]); + } + + if (output_stride == 1) { + TMOV(quotient, 
linear_index); + } else { + TDIVS(quotient, linear_index, output_stride); + } + + uint32_t out_dim_size = static_cast(out_shape[d]); + TDIVS(cycle, quotient, out_dim_size); + TMULS(cycle_base, cycle, out_dim_size); + TSUB(coordinate, quotient, cycle_base); + + if (static_cast(d) == CONCAT_DIM) { + uint32_t in_dim_size = static_cast(in_shape[d]); + OffsetTile n_tile; + OffsetTile offset_in; + OffsetTile n_times_shape; + + TDIVS(n_tile, coordinate, in_dim_size); + TMULS(n_times_shape, n_tile, in_dim_size); + TSUB(offset_in, coordinate, n_times_shape); + + TMULS(contribution, offset_in, + input_stride[d] * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + + TMULS(contribution, n_tile, + input_size * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + } else { + TMULS(contribution, coordinate, + input_stride[d] * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + } + } - const int Mb = gOM / tM; - - const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_shapeOut = global_tensor>; - - using gm_InDataShape = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_OutDataShape = global_tensor>; - - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - using tile_Inshape = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - using tile_Outshape = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec -// using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - gm_shapeIn inGm(in_ptr); - - gm_InDataShape InShapeGm(in_shape); - gm_OutDataShape OutShapeGm(out_shape); - - tile_shapeData dataTile; - tile_shapeOffset offsetTile; - - tile_Inshape InshapeTile; - tile_Outshape OutshapeTile; - - int base = 0;// todo 生成一个标量 - int all_num = gOM; // 总元素数量 - - using itOut = global_iterator; - itOut gOIter(out_ptr); - -// alignas(256) static uint32_t g_dump[tM]; + MGATHER(output_tile, input_global, offset_tile); + TSTORE(output_global, output_tile); - int total_elements = tM; + output_base += tM; + } + if constexpr (kTailElements != 0) { + using TailDataTile = pto::Tile; + using TailOffsetTile = pto::Tile; + + auto output_global = output_iter(0, kFullTiles); + TailDataTile output_tile; + TailOffsetTile offset_tile; + + TailOffsetTile linear_index; + TailOffsetTile quotient; + TailOffsetTile cycle; + TailOffsetTile cycle_base; + TailOffsetTile coordinate; + TailOffsetTile contribution; + + TCI(linear_index, output_base); + TEXPANDS(offset_tile, static_cast(0)); + + uint32_t input_stride[DATA_DIM]; + input_stride[DATA_DIM - 1] = 1; + for (int i = static_cast(DATA_DIM) - 2; i >= 0; --i) { + input_stride[i] = input_stride[i + 1] * static_cast(in_shape[i + 1]); + } + uint32_t input_size = input_stride[0] * static_cast(in_shape[0]); + + for (int d = static_cast(DATA_DIM) - 1; d >= 0; --d) { + uint32_t output_stride = 1; + for (int dd = d + 1; dd < static_cast(DATA_DIM); ++dd) { + output_stride *= 
static_cast(out_shape[dd]); + } + + if (output_stride == 1) { + TMOV(quotient, linear_index); + } else { + TDIVS(quotient, linear_index, output_stride); + } + + uint32_t out_dim_size = static_cast(out_shape[d]); + TDIVS(cycle, quotient, out_dim_size); + TMULS(cycle_base, cycle, out_dim_size); + TSUB(coordinate, quotient, cycle_base); + + if (static_cast(d) == CONCAT_DIM) { + uint32_t in_dim_size = static_cast(in_shape[d]); + TailOffsetTile n_tile; + TailOffsetTile offset_in; + TailOffsetTile n_times_shape; + + TDIVS(n_tile, coordinate, in_dim_size); + TMULS(n_times_shape, n_tile, in_dim_size); + TSUB(offset_in, coordinate, n_times_shape); + + TMULS(contribution, offset_in, + input_stride[d] * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + + TMULS(contribution, n_tile, + input_size * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + } else { + TMULS(contribution, coordinate, + input_stride[d] * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + } + } - for (int i = 0; i < Mb; ++i) { - auto gO = gOIter(0, i); - TCOPYIN(InshapeTile, InShapeGm); - TCOPYIN(OutshapeTile, OutShapeGm); - gen_offset_Impl(offsetTile, InshapeTile, OutshapeTile, base, total_elements); -// printf("end genoffset\n"); - base += total_elements; -// DUMP_TILE("offsetTile", offsetTile, g_dump, 1, tM); - MGATHER(dataTile, inGm, offsetTile); -// printf("end mgather\n"); - TCOPYOUT(gO, dataTile); - } - if constexpr (rmd_M) { - auto gO = gOIter(0, Mb); - TCOPYIN(InshapeTile, InShapeGm); - TCOPYIN(OutshapeTile, OutShapeGm); - total_elements = rmd_M;//尾片的大小。 - gen_offset_Impl(offsetTile, InshapeTile, OutshapeTile, base, total_elements); - base += total_elements; - MGATHER(dataTile, inGm, offsetTile); - TCOPYOUT(gO, dataTile); + MGATHER(output_tile, input_global, offset_tile); + TSTORE(output_global, output_tile); } } diff --git a/benchmark-linxisa/kernels/concat/concat_scatter.hpp 
b/benchmark-linxisa/kernels/concat/concat_scatter.hpp index e0f14b8..2846f33 100644 --- a/benchmark-linxisa/kernels/concat/concat_scatter.hpp +++ b/benchmark-linxisa/kernels/concat/concat_scatter.hpp @@ -2,206 +2,153 @@ #define CONCAT_SCATTER_KERNEL_HPP #include -#include "template_asm.h" - -using namespace pto; - -#pragma once #include #include +using namespace pto; -#define DUMP_TILE(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%12lld ", (long long)DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) - - -// ============================================== -// 维度规则:交换transpose_dim0和transpose_dim1 -// ============================================== -template -void __vec__ gen_offset_concat( - typename tile_shape::TileDType __out__ out, - typename tile_Inshape::TileDType __in__ in_shape, - typename tile_Outshape::TileDType __in__ out_shape, -// const size_t in_dim, - const size_t base, - const size_t total_elements -) { - size_t index = blkv_get_index_x(); - size_t idx = blkv_get_index_x(); - - __vbuf__ typename tile_Inshape::DType *in_shape_ptr = blkv_get_tile_ptr(in_shape); - __vbuf__ typename tile_Outshape::DType *out_shape_ptr = blkv_get_tile_ptr(out_shape); - - if (index >= total_elements) return; - idx = idx + base; // todo idx是个向量,base是个标量,获得所有的基地址或者说基offset - - //转置维度交换stride。 -// uint16_t stride[IN_DIM]; - uint16_t stride[DATA_DIM]; - stride[DATA_DIM-1] = 1; - - #pragma clang loop unroll(full) - for(int i = DATA_DIM-2 ; i >= 0; --i) - { - stride[i] = stride[i+1] * out_shape_ptr[i+1]; - } - - - // 输出一维索引 → 输出坐标 - size_t in_coord[MAX_DIM] = {0}; // - size_t tmp = idx; // - - #pragma clang loop unroll(full) - for (int d = DATA_DIM - 1; d >= 0; d--) { - in_coord[d] = tmp 
% in_shape_ptr[d]; - tmp /= in_shape_ptr[d]; - } - size_t n = tmp; - in_coord[CONCAT_DIM] = n * in_shape_ptr[CONCAT_DIM] + in_coord[CONCAT_DIM]; - -// size_t n = out_coord[CONCAT_DIM] / in_shape_ptr[CONCAT_DIM]; -// size_t offset = out_coord[CONCAT_DIM] % in_shape_ptr[CONCAT_DIM]; - -// out_coord[CONCAT_DIM] = offset; - -/* - // 输出坐标 → 输入坐标 - size_t in_coord[MAX_DIM] = {0}; - for (size_t i = 0; i < in_dim; i++) { - size_t o = out_dim - in_dim + i; // 从后面对齐 - if (in_shape[i] == 1) { - in_coord[i] = 0; - } else { - in_coord[i] = out_coord[o]; - } - } -*/ -// uint16_t in_offset = 0; -// uint32_t out_offset = 0; - uint16_t out_offset = 0; - - #pragma clang loop unroll(full) - for (int i = 0; i < DATA_DIM; i++) { - out_offset += in_coord[i] * stride[i] * sizeof(dtype); - } - - // 赋值 - blkv_get_tile_ptr(out)[index] = out_offset; -} - -template -void gen_offset_Impl( - tile_shapeOffset &out, - tile_Inshape &in_shape, - tile_Outshape &out_shape, -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0, - const size_t base, - const size_t total_elements - ) -{ - static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, - "Only static shape supported"); - gen_offset_concat<<>>(out.data(), in_shape.data(), out_shape.data(), base, total_elements); // todo 这部分的tile shape是怎么传递的? 
-} - - - -template +/** + * @brief 通用 N 维 concat scatter(Tile ISA 实现) + * + * 算法:对输入张量的每个元素,计算其多维坐标,用输出 stride 计算输出 byte offset, + * 用 MSCATTER 将数据写入输出。 + * + * 注意:当前实现中 n(来自哪个输入张量)始终为 0,即输入被放置在输出的起始位置。 + * 这与原始 __vec__ 版本行为一致。 + * + * @tparam DType 数据类型 + * @tparam MAX_DIM 最大维度数 + * @tparam gIM 输入总元素数 + * @tparam gOM 输出总元素数 + * @tparam tM 每个 tile 处理的元素数 + * @tparam DATA_DIM 数据维度数(编译期常量) + * @tparam CONCAT_DIM 拼接维度索引(编译期常量) + */ +template void concat_scatter( - dtype *in_ptr, - dtype *out_ptr, + DType *in_ptr, + DType *out_ptr, size_t *in_shape, size_t *out_shape -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0 -) +) { + constexpr int kFullTiles = gIM / tM; + constexpr int kTailElements = gIM % tM; + + using InputGlobal = pto::global_tensor>; + using OutputGlobal = pto::global_tensor>; + using DataTile = pto::Tile; + using OffsetTile = pto::Tile; + using InputIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputGlobal output_global(out_ptr); + InputIterator input_iter(in_ptr); + + uint32_t input_base = 0; + for (int tile_index = 0; tile_index < kFullTiles; ++tile_index) { + auto input_tile_global = input_iter(0, tile_index); + DataTile data_tile; + OffsetTile offset_tile; + + OffsetTile linear_index; + OffsetTile quotient; + OffsetTile cycle; + OffsetTile cycle_base; + OffsetTile coordinate; + OffsetTile contribution; + + TCI(linear_index, static_cast(input_base)); + TEXPANDS(offset_tile, static_cast(0)); + + // 计算输出 stride(标量核心,运行时计算) + uint16_t output_stride[DATA_DIM]; + output_stride[DATA_DIM - 1] = 1; + for (int i = static_cast(DATA_DIM) - 2; i >= 0; --i) { + output_stride[i] = output_stride[i + 1] * static_cast(out_shape[i + 1]); + } - const int Mb = gOM / tM; - - const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_shapeOut = global_tensor>; - - using gm_InDataShape = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_OutDataShape = global_tensor>; - - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec -// using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - using tile_Inshape = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - using tile_Outshape = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec -// using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - -// gm_shapeIn inGm(in_ptr); - gm_shapeOut outGm(out_ptr); - - gm_InDataShape InShapeGm(in_shape); - gm_OutDataShape OutShapeGm(out_shape); - - tile_shapeData dataTile; - tile_shapeOffset offsetTile; - - tile_Inshape InshapeTile; - tile_Outshape OutshapeTile; - - int base = 0;// todo 生成一个标量 - int all_num = gIM; // 总元素数量 - -// using itOut = global_iterator; - using itIn = global_iterator; + // 对每个维度提取输入坐标并计算输出 byte offset + for (int d = static_cast(DATA_DIM) - 1; d >= 0; --d) { + uint32_t input_stride_for_dim = 1; + for (int dd = d + 1; dd < static_cast(DATA_DIM); ++dd) { + input_stride_for_dim *= static_cast(in_shape[dd]); + } + + if (input_stride_for_dim == 1) { + TMOV(quotient, linear_index); + } else { + TDIVS(quotient, linear_index, static_cast(input_stride_for_dim)); + } + + uint16_t in_dim_size = static_cast(in_shape[d]); + TDIVS(cycle, quotient, in_dim_size); + TMULS(cycle_base, cycle, in_dim_size); + TSUB(coordinate, quotient, cycle_base); + + TMULS(contribution, coordinate, + static_cast(output_stride[d] * sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + } - itIn gIIter(in_ptr); -// itOut gOIter(out_ptr); + TLOAD(data_tile, input_tile_global); + MSCATTER(output_global, data_tile, offset_tile); - alignas(256) static uint16_t g_dump[tM]; + input_base += tM; + } - int total_elements = tM; + if constexpr (kTailElements != 0) { + 
using TailDataTile = pto::Tile; + using TailOffsetTile = pto::Tile; + + auto input_tile_global = input_iter(0, kFullTiles); + TailDataTile data_tile; + TailOffsetTile offset_tile; + + TailOffsetTile linear_index; + TailOffsetTile quotient; + TailOffsetTile cycle; + TailOffsetTile cycle_base; + TailOffsetTile coordinate; + TailOffsetTile contribution; + + TCI(linear_index, static_cast(input_base)); + TEXPANDS(offset_tile, static_cast(0)); + + uint16_t output_stride[DATA_DIM]; + output_stride[DATA_DIM - 1] = 1; + for (int i = static_cast(DATA_DIM) - 2; i >= 0; --i) { + output_stride[i] = output_stride[i + 1] * static_cast(out_shape[i + 1]); + } + for (int d = static_cast(DATA_DIM) - 1; d >= 0; --d) { + uint32_t input_stride_for_dim = 1; + for (int dd = d + 1; dd < static_cast(DATA_DIM); ++dd) { + input_stride_for_dim *= static_cast(in_shape[dd]); + } + + if (input_stride_for_dim == 1) { + TMOV(quotient, linear_index); + } else { + TDIVS(quotient, linear_index, static_cast(input_stride_for_dim)); + } + + uint16_t in_dim_size = static_cast(in_shape[d]); + TDIVS(cycle, quotient, in_dim_size); + TMULS(cycle_base, cycle, in_dim_size); + TSUB(coordinate, quotient, cycle_base); + + TMULS(contribution, coordinate, + static_cast(output_stride[d] * sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + } - for (int i = 0; i < Mb; ++i) { - auto gI = gIIter(0, i); - TCOPYIN(InshapeTile, InShapeGm); - TCOPYIN(OutshapeTile, OutShapeGm); - gen_offset_Impl(offsetTile, InshapeTile, OutshapeTile, base, total_elements); -// printf("end genoffset\n"); - base += total_elements; -// DUMP_TILE("offsetTile", offsetTile, g_dump, 1, tM); - TCOPYIN(dataTile, gI); - MSCATTER(outGm, dataTile, offsetTile); - } - if constexpr (rmd_M) { - auto gI = gIIter(0, Mb); - TCOPYIN(InshapeTile, InShapeGm); - TCOPYIN(OutshapeTile, OutShapeGm); - total_elements = rmd_M;//尾片的大小。 - gen_offset_Impl(offsetTile, InshapeTile, OutshapeTile, base, total_elements); - base += total_elements; - 
TCOPYIN(dataTile, gI); - MSCATTER(outGm, dataTile, offsetTile); + TLOAD(data_tile, input_tile_global); + MSCATTER(output_global, data_tile, offset_tile); } } #endif - diff --git a/benchmark-linxisa/kernels/control/hashtable_lookup_simd.hpp b/benchmark-linxisa/kernels/control/hashtable_lookup_simd.hpp index 19659f3..6f273bc 100644 --- a/benchmark-linxisa/kernels/control/hashtable_lookup_simd.hpp +++ b/benchmark-linxisa/kernels/control/hashtable_lookup_simd.hpp @@ -1,6 +1,7 @@ #ifndef HASHTABLE_LOOKUP_SIMD_HPP #define HASHTABLE_LOOKUP_SIMD_HPP +#include #include #include "template_asm.h" // MGATHER #include diff --git a/benchmark-linxisa/kernels/element_wise/gelu.hpp b/benchmark-linxisa/kernels/element_wise/gelu.hpp index b2e3043..76cd3e3 100644 --- a/benchmark-linxisa/kernels/element_wise/gelu.hpp +++ b/benchmark-linxisa/kernels/element_wise/gelu.hpp @@ -1,128 +1,77 @@ -#include +#ifndef SUPERNPUBENCH_PTOISA_GELU_HPP +#define SUPERNPUBENCH_PTOISA_GELU_HPP -#include "template_asm.h" -#include -#include -// 海思解决方案 新版多项式拟合 +#include -// ============================================== -// ============================================== +#include -template -void __vec__ gelu_simd( - typename tile_shape::TileDType __in__ in, - typename tile_shape::TileDType __out__ out - // bool approximate = false // false:none, true:tanh -) { - size_t index = blkv_get_index_x(); - typename tile_shape::DType indata; - indata.data = blkv_get_tile_ptr(in)[index].data; // 直接拷贝 short +using namespace pto; - // 数据格式转换 V.FCVT - float x = static_cast(indata); - - constexpr uint32_t TOTAL_COUNT = 24*8*1024; - constexpr float SCALAR_A5 = -3.5123395303315874e-09f; - constexpr float SCALAR_A4 = 2.6452661927578447e-07f; - constexpr float SCALAR_A3 = -7.9294877650681883e-06f; - constexpr float SCALAR_A2 = 1.1061238183174282e-04f; - constexpr float SCALAR_A1 = 6.5189960878342390e-05f; - constexpr float SCALAR_A0 = -7.2666168212890625e-02f; - constexpr float SCALAR_AM1 = -1.5957698822021484e+00f; - 
constexpr float FP32_MAX = 5.75f; - - float t = blkv_max(x, -FP32_MAX); - t = blkv_min(t, FP32_MAX); - float t2 = t * t; +template +inline void gelu_tile(TileT &dst, TileT &src) { + TileT t; + TileT t2; + TileT p; + TileT tmp; + TileT denom; + TileT recip; - float p = SCALAR_A5 * t2 + SCALAR_A4; - p = p * t2 + SCALAR_A3; - p = p * t2 + SCALAR_A2; - p = p * t2 + SCALAR_A1; - p = p * t2 + SCALAR_A0; - p = p * t2 + SCALAR_AM1; + TMAXS(t, src, -5.75f); + TMINS(t, t, 5.75f); + TMUL(t2, t, t); - float exp_val = blkv_fexp(t * p); - float y = x / (1.0f + exp_val); - - BLKC_ASSIGN_CAST(out, index, y); - // blkv_get_tile_ptr(out)[index] = static_cast(result); -} + TMULS(p, t2, -3.5123395303315874e-09f); + TADDS(p, p, 2.6452661927578447e-07f); + TMUL(p, p, t2); + TADDS(p, p, -7.9294877650681883e-06f); + TMUL(p, p, t2); + TADDS(p, p, 1.1061238183174282e-04f); + TMUL(p, p, t2); + TADDS(p, p, 6.5189960878342390e-05f); + TMUL(p, p, t2); + TADDS(p, p, -7.2666168212890625e-02f); + TMUL(p, p, t2); + TADDS(p, p, -1.5957698822021484e+00f); -template -void gelu_impl( - tile_shapeData &in, - tile_shapeData &out -){ - static_assert(tile_shapeData::ValidRow != -1 && tile_shapeData::ValidCol != -1, - "Only static shape supported"); - gelu_simd<<>>(in.data(), out.data()); + TMUL(tmp, t, p); + TEXP(tmp, tmp); + TADDS(denom, tmp, 1.0f); + TRECIP(recip, denom); + TMUL(dst, src, recip); } -#define DUMP_TILE(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%.4f ", DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) +template +void gelu(dtype *in_ptr, dtype *out_ptr, bool = false) { + constexpr int kFullTiles = gM / tM; + constexpr int kTail = gM % tM; -template -void gelu( - dtype *in_ptr, - dtype 
*out_ptr, - bool approximate = false // false:none, true:tanh - ) { - const int Mb = gM / tM; - - const int rmd_M = gM % tM; + using Global = global_tensor>; + using TileT = Tile; + using Iterator = global_iterator; - using gm_shape = global_tensor>; - using tile_shapeData = Tile; - using tile_shapeData_rmd = Tile; + Iterator input_iter(in_ptr); + Iterator output_iter(out_ptr); - gm_shape inGm(in_ptr); - gm_shape outGm(out_ptr); - tile_shapeData inTile; - tile_shapeData outTile; - tile_shapeData_rmd inTile_rmd; - tile_shapeData_rmd outTile_rmd; + for (int tile = 0; tile < kFullTiles; ++tile) { + TileT input_tile; + TileT output_tile; + TLOAD(input_tile, input_iter(0, tile)); + gelu_tile(output_tile, input_tile); + TSTORE(output_iter(0, tile), output_tile); + } - using itIn = global_iterator; - using itOut = global_iterator; - - itIn gIIter(in_ptr); - itOut gOIter(out_ptr); - // for test /////////////////////////////////////// - alignas(256) static dtype g_dump_intTile[tM]; - alignas(256) static dtype g_dump_outTile[tM]; - // /////////////////////////////////////// - - // printf("MB = %d\n", Mb); - for (int i = 0; i < Mb; ++i) { - // printf("iter i %d\n",i); - auto gI = gIIter(0, i); - auto gO = gOIter(0, i); - TCOPYIN(inTile, gI); - // DUMP_TILE("inTile", inTile, g_dump_intTile, 1, tM); - gelu_impl(inTile, outTile); - // DUMP_TILE("outTile", outTile, g_dump_outTile, 1, tM); - TCOPYOUT(gO, outTile); - } - if constexpr (rmd_M) { - auto gI = gIIter(0, Mb); - auto gO = gOIter(0, Mb); - TCOPYIN(inTile_rmd, gI); - gelu_impl(inTile_rmd, outTile_rmd); - // DUMP_TILE("offsetTile", offsetTile, g_dump, 1, tM); - TCOPYOUT(gO, outTile_rmd); - } + if constexpr (kTail != 0) { + using TailTile = + Tile; + using TailIterator = global_iterator; + TailIterator tail_input_iter(in_ptr); + TailIterator tail_output_iter(out_ptr); + TailTile input_tile; + TailTile output_tile; + TLOAD(input_tile, tail_input_iter(0, kFullTiles)); + gelu_tile(output_tile, input_tile); + 
TSTORE(tail_output_iter(0, kFullTiles), output_tile); + } } + +#endif // SUPERNPUBENCH_PTOISA_GELU_HPP diff --git a/benchmark-linxisa/kernels/element_wise/gelu_pto.hpp b/benchmark-linxisa/kernels/element_wise/gelu_pto.hpp new file mode 100644 index 0000000..f292a3e --- /dev/null +++ b/benchmark-linxisa/kernels/element_wise/gelu_pto.hpp @@ -0,0 +1,277 @@ +// ============================================================================ +// GELU 算子 — PTO 一层编程模型 +// +// 原始 gelu.hpp 策略: +// TCOPYIN (half) -> __vec__ gelu_simd (多项式拟合) -> TCOPYOUT (half) +// __vec__ 块逐元素: fp16→fp32, clamp, Horner 多项式, exp, 除法, fp32→fp16 +// +// PTO 一层策略: +// TLOAD (half) -> TCVT(fp16→fp32) -> tile 指令链计算 GELU -> TCVT(fp32→fp16) -> TSTORE (half) +// 全部用 Tile 级内联函数, 无 __vec__ 块 +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ 当前编译器不支持 / 不完整的指令汇总 │ +// ├──────────┬──────────────────┬──────────────────────────────────────────┤ +// │ Pto ISA │ 当前编译器状态 │ 说明 │ +// │ 指令 │ │ │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TLOAD │ API 有(名不同), │ PTO ISA 名 TLOAD;当前编译器名 TCOPYIN; │ +// │ │ 二层实现 │ jcore/TCopyIn.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TCVT │ API 有(签名不同),│ PTO ISA: TCVT(dst,src,tmp,mode,satMode) │ +// │ │ 二层实现 │ 当前编译器: TCVT(dst,src) 无 tmp/mode; │ +// │ │ │ jcore/TCvt.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TMAXS │ API 有,二层实现 │ jcore/TMaxs.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TMINS │ API 有,二层实现 │ jcore/TMins.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TMUL │ API 有,二层实现 │ jcore/TMul.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TMULS │ API 有,二层实现 │ jcore/TMuls.hpp 用 __vec__ 实现 │ +// 
├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TADDS │ API 有,二层实现 │ jcore/TAdds.hpp 用 __vec__ 实现 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TEXP │ API 有,二层实现 │ jcore/TExp.hpp 用 __vec__ 实现 │ +// │ │ │ (template_asm.h 有 TEXP_TEPL 内联汇编, │ +// │ │ │ 但不在 pto_tileop.hpp API 中) │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TRECIP │ API 有,二层实现 │ jcore/TRecip.hpp 用 __vec__ 实现 │ +// │ │ │ (template_asm.h 有 TRECIP_TEPL 内联汇编, │ +// │ │ │ 但不在 pto_tileop.hpp API 中) │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE;当前编译器名 TCOPYOUT;│ +// │ │ 二层实现 │ jcore/TCopyOut.hpp 用 __vec__ 实现 │ +// └──────────┴──────────────────┴──────────────────────────────────────────┘ +// +// PTO ISA 文档签名 (Declared in include/pto/pto_instr.hpp): +// +// TLOAD: +// template +// PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, WaitEvents &... events); +// +// TCVT: +// template +// PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, TmpTileData &tmp, +// RoundMode mode, SaturationMode satMode, +// WaitEvents &... events); +// +// TMAXS: +// template +// PTO_INST RecordEvent TMAXS(TileDataDst &dst, TileDataSrc &src, +// typename TileDataSrc::DType scalar, +// WaitEvents &... events); +// +// TMINS: +// template +// PTO_INST RecordEvent TMINS(TileDataDst &dst, TileDataSrc &src, +// typename TileDataSrc::DType scalar, +// WaitEvents &... events); +// +// TMUL: +// template +// PTO_INST RecordEvent TMUL(TileDataDst &dst, TileDataSrc0 &src0, +// TileDataSrc1 &src1, WaitEvents &... events); +// +// TMULS: +// template +// PTO_INST RecordEvent TMULS(TileDataDst &dst, TileDataSrc &src0, +// typename TileDataSrc::DType scalar, +// WaitEvents &... events); +// +// TADDS: +// template +// PTO_INST RecordEvent TADDS(TileDataDst &dst, TileDataSrc &src0, +// typename TileDataSrc::DType scalar, +// WaitEvents &... 
events); +// +// TEXP: +// template +// PTO_INST RecordEvent TEXP(TileDataDst &dst, TileDataSrc &src, +// WaitEvents &... events); +// +// TRECIP: +// template +// PTO_INST RecordEvent TRECIP(TileDataDst &dst, TileDataSrc &src, +// WaitEvents &... events); +// +// TSTORE: +// template +// PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, +// WaitEvents &... events); +// ============================================================================ + +#include +#include +// #include // [!] PTO ISA C++ Intrinsic — 当前编译器未提供 + +#include +#include + +// ============================================================================ +// GELU 多项式拟合系数 (与原始 gelu.hpp 一致) +// GELU(x) = x / (1 + exp(t * P(t²))) +// P(t²) = A5*(t²)^6 + A4*(t²)^5 + A3*(t²)^4 + A2*(t²)^3 + A1*(t²)^2 + A0*(t²) + AM1 +// (Horner: p = (((((A5*t2 + A4)*t2 + A3)*t2 + A2)*t2 + A1)*t2 + A0)*t2 + AM1) +// ============================================================================ +namespace gelu_coeffs { + constexpr float A5 = -3.5123395303315874e-09f; + constexpr float A4 = 2.6452661927578447e-07f; + constexpr float A3 = -7.9294877650681883e-06f; + constexpr float A2 = 1.1061238183174282e-04f; + constexpr float A1 = 6.5189960878342390e-05f; + constexpr float A0 = -7.2666168212890625e-02f; + constexpr float AM1 = -1.5957698822021484e+00f; + constexpr float CLAMP_MAX = 5.75f; +} + +// ---------------------------------------------------------------------------- +// gelu_impl: 用 PTO ISA Tile 指令计算 GELU (一层编程, 无 __vec__ 块) +// +// 输入: inTile — fp16 tile, shape (1, tM) +// 输出: outTile — fp16 tile, shape (1, tM) +// 中间: 全部在 fp32 tile 上计算 +// +// 算法: +// x = (float)in +// t = clamp(x, -5.75, 5.75) +// t2 = t * t +// p = Horner(t2, [A5,A4,A3,A2,A1,A0,AM1]) +// e = exp(t * p) +// y = x * (1 / (1 + e)) // 用 TRECIP + TMUL 代替除法 +// out = (half)y +// ---------------------------------------------------------------------------- +template +void gelu_impl( + tile_shapeData &inTile, + tile_shapeData &outTile, + tile_shapeFP32 &tmpCvt
// TCVT 需要的临时 tile +) { + using fp_t = typename tile_shapeFP32::DType; // float + + tile_shapeFP32 xTile; // x = (float)input + tile_shapeFP32 tTile; // t = clamp(x) + tile_shapeFP32 t2Tile; // t² + tile_shapeFP32 pTile; // 多项式值 + tile_shapeFP32 scratchTile; // 复用: tp -> exp -> denom -> recip -> y + + // ---- Step 1: fp16 -> fp32 ---- + // [当前编译器] TCVT(dst, src) 无 tmp/mode/satMode 参数; jcore 为 __vec__ + TCVT(xTile, inTile, tmpCvt, RoundMode::Nearest, SaturationMode::Default); + + // ---- Step 2: clamp x to [-5.75, 5.75] ---- + // [当前编译器] TMAXS/TMINS API 有, jcore 为 __vec__ + TMAXS(tTile, xTile, (fp_t)(-gelu_coeffs::CLAMP_MAX)); // t = max(x, -5.75) + TMINS(tTile, tTile, (fp_t)gelu_coeffs::CLAMP_MAX); // t = min(t, 5.75) + + // ---- Step 3: t² = t * t ---- + // [当前编译器] TMUL API 有, jcore 为 __vec__ + TMUL(t2Tile, tTile, tTile); + + // ---- Step 4: 多项式 Horner 法 ---- + // p = A5*t2 + A4 + // [当前编译器] TMULS/TADDS API 有, jcore 为 __vec__ + TMULS(pTile, t2Tile, gelu_coeffs::A5); + TADDS(pTile, pTile, gelu_coeffs::A4); + + // p = p*t2 + A3 + TMUL(pTile, pTile, t2Tile); + TADDS(pTile, pTile, gelu_coeffs::A3); + + // p = p*t2 + A2 + TMUL(pTile, pTile, t2Tile); + TADDS(pTile, pTile, gelu_coeffs::A2); + + // p = p*t2 + A1 + TMUL(pTile, pTile, t2Tile); + TADDS(pTile, pTile, gelu_coeffs::A1); + + // p = p*t2 + A0 + TMUL(pTile, pTile, t2Tile); + TADDS(pTile, pTile, gelu_coeffs::A0); + + // p = p*t2 + AM1 + TMUL(pTile, pTile, t2Tile); + TADDS(pTile, pTile, gelu_coeffs::AM1); + + // ---- Step 5: exp_val = exp(t * p) ---- + // scratch = t * p + TMUL(scratchTile, tTile, pTile); + // exp_val = exp(scratch) + // [当前编译器] TEXP API 有, jcore 为 __vec__ + // (template_asm.h 有 TEXP_TEPL 内联汇编, 但不在 pto_tileop.hpp 中) + TEXP(scratchTile, scratchTile); // scratch = exp(t*p) + + // ---- Step 6: y = x / (1 + exp_val) ---- + // denom = 1 + exp_val + TADDS(scratchTile, scratchTile, (fp_t)1.0f); // scratch = 1 + exp + // recip = 1 / denom + // [当前编译器] TRECIP API 有, jcore 为 __vec__ + // (template_asm.h 有 
TRECIP_TEPL 内联汇编, 但不在 pto_tileop.hpp 中) + TRECIP(scratchTile, scratchTile); // scratch = 1 / (1+exp) + // y = x * recip + TMUL(scratchTile, xTile, scratchTile); // scratch = x * recip = y + + // ---- Step 7: fp32 -> fp16 ---- + TCVT(outTile, scratchTile, tmpCvt, RoundMode::Nearest, SaturationMode::Default); +} + + +// ---------------------------------------------------------------------------- +// gelu: 主入口, 接口与原 gelu.hpp 一致 +// ---------------------------------------------------------------------------- +template +void gelu( + dtype *in_ptr, + dtype *out_ptr, + bool approximate = false + ) { + using gm_shape = global_tensor>; + using tile_shapeData = Tile; + using tile_shapeFP32 = Tile; + using tile_shapeData_rmd = Tile; + using tile_shapeFP32_rmd = Tile; + + const int Mb = gM / tM; + const int rmd_M = gM % tM; + + using itIn = global_iterator; + using itOut = global_iterator; + + itIn gIIter(in_ptr); + itOut gOIter(out_ptr); + + tile_shapeData inTile, outTile; + tile_shapeFP32 tmpCvt; // TCVT 临时 tile + tile_shapeData_rmd inTile_rmd, outTile_rmd; + tile_shapeFP32_rmd tmpCvt_rmd; + + for (int i = 0; i < Mb; ++i) { + auto gI = gIIter(0, i); + auto gO = gOIter(0, i); + + // TLOAD: GM -> UB + // [当前编译器] 名为 TCOPYIN, jcore 为 __vec__ + TLOAD(inTile, gI); + + gelu_impl(inTile, outTile, tmpCvt); + + // TSTORE: UB -> GM + // [当前编译器] 名为 TCOPYOUT, jcore 为 __vec__ + TSTORE(gO, outTile); + } + if constexpr (rmd_M) { + auto gI = gIIter(0, Mb); + auto gO = gOIter(0, Mb); + + TLOAD(inTile_rmd, gI); + gelu_impl(inTile_rmd, outTile_rmd, tmpCvt_rmd); + TSTORE(gO, outTile_rmd); + } +} diff --git a/benchmark-linxisa/kernels/fa/fa_2d_unroll.hpp b/benchmark-linxisa/kernels/fa/fa_2d_unroll.hpp index 296e6e9..709e203 100644 --- a/benchmark-linxisa/kernels/fa/fa_2d_unroll.hpp +++ b/benchmark-linxisa/kernels/fa/fa_2d_unroll.hpp @@ -1,6 +1,10 @@ #include "fa_utils.h" #include "fa_fp4_utils.h" +using namespace pto; +using namespace pto::blkv; +using pto::type_traits; + #ifndef Xdim #define 
Xdim 2 #endif @@ -13,6 +17,11 @@ #define __vbuf__ #endif +#ifndef BLKC_ASSIGN_CAST +#define BLKC_ASSIGN_CAST(tile, idx, value) \ + (pto::blkv::blkv_get_tile_ptr(tile)[(idx)] = (value)) +#endif + template void __vec__ new_max_1src( typename tileMax::TileDType __out__ scale, @@ -797,7 +806,6 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype TEXPANDSCALAR(tSum[x], 0); } - tileO_out tPV_out; tileO tO[Xdim], tPV[Xdim]; tileScale tScale[Xdim]; @@ -840,7 +848,8 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype #if Ydim == 1 #pragma clang loop unroll(full) for(int x=0;x<<>>( + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + flashsoftmax_dn_mout_cast_kernel( tScale[x].data(), tNewMax[x].data(), tNewSum[x].data(), @@ -849,16 +858,19 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype tMax[x].data(), tSum[x].data(), scale); + }); } #elif Ydim == 2 #pragma clang loop unroll(full) for(int x=0;x<<>>( + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_2src( tScale[x].data(), tNewMax[x].data(), tW[x][0].data(), tW[x][1].data(), tMax[x].data(), scale); + }); // src_exp_2src<<>>( // tExpW[x][0].data(), tExpW[x][1].data(), // tW[x][0].data(), tW[x][1].data(), @@ -870,39 +882,49 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype // tSum[x].data(), // tScale[x].data() // ); - src_exp_2src_with_new_sum<<>>( + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_new_sum( tNewSum[x].data(), tExpW[x][0].data(), tExpW[x][1].data(), tW[x][0].data(), tW[x][1].data(), tNewMax[x].data(), tSum[x].data(), tScale[x].data(), scale); + }); } #elif Ydim == 4 tileSum tLocalSum[Xdim][2]; #pragma clang loop unroll(full) for(int x=0;x<<>>( + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_4src( tScale[x].data(), tNewMax[x].data(), tW[x][0].data(), tW[x][1].data(), tW[x][2].data(), tW[x][3].data(), tMax[x].data(), scale); + }); // 
src_exp_4src<<>>( // tExpW[x][0].data(), tExpW[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), // tW[x][0].data(), tW[x][1].data(), tW[x][2].data(), tW[x][3].data(), // tNewMax[x].data(), // scale); - src_exp_2src_with_local_sum<<>>(tLocalSum[x][0].data(), tExpW[x][0].data(), tExpW[x][1].data(), + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_local_sum(tLocalSum[x][0].data(), tExpW[x][0].data(), tExpW[x][1].data(), tW[x][0].data(), tW[x][1].data(), tNewMax[x].data(), scale); - src_exp_2src_with_local_sum<<>>(tLocalSum[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), + }); + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_local_sum(tLocalSum[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), tW[x][2].data(), tW[x][3].data(), tNewMax[x].data(), scale); + }); // new_sum_4src<<>>( // tNewSum[x].data(), // tExpW[x][0].data(), tExpW[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), // tSum[x].data(), // tScale[x].data() // ); - new_sum_of_2_loc_sum<<>>(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tSum[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + new_sum_of_2_loc_sum(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tSum[x].data(), tScale[x].data()); + }); } #elif Ydim == 8 tileMax tLocalMax[Xdim][2]; @@ -912,17 +934,25 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype for(int x=0;x<<>>(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + local_max_4src(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + }); } - new_max_of_2_loc_max<<>>(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tMax[x].data()); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_of_2_loc_max(tScale[x].data(), 
tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tMax[x].data()); + }); #pragma clang loop unroll(full) for(int k=0;k<4;k++){ - src_exp_2src_with_local_sum<<>>(tLocalSum[x][k].data(), tExpW[x][2*k].data(), tExpW[x][2*k+1].data(), + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_local_sum(tLocalSum[x][k].data(), tExpW[x][2*k].data(), tExpW[x][2*k+1].data(), tW[x][2*k].data(), tW[x][2*k+1].data(), tNewMax[x].data(), scale); + }); } - new_sum_of_4_loc_sum<<>>(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + new_sum_of_4_loc_sum(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + }); } #elif Ydim == 16 tileMax tLocalMax[Xdim][4]; @@ -931,25 +961,35 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype #pragma clang loop unroll(full) for(int x=0;x<<>>(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + local_max_4src(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + }); } - new_max_of_4_loc_max<<>>(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tLocalMax[x][2].data(), tLocalMax[x][3].data(), tMax[x].data()); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_of_4_loc_max(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tLocalMax[x][2].data(), tLocalMax[x][3].data(), tMax[x].data()); + }); #pragma clang loop unroll(full) for(int k=0;k<4;k++){ - src_exp_4src<<>>( + pto::blkv::blkv_for_2d(tileW::ValidRow, tileW::ValidCol, [&] { + src_exp_4src( tExpW[x][4*k].data(), tExpW[x][4*k+1].data(), 
tExpW[x][4*k+2].data(), tExpW[x][4*k+3].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), tNewMax[x].data(), scale); + }); } #pragma clang loop unroll(full) for(int k=0;k<4;k++){ - local_sum_4src<<>>(tLocalSum[x][k].data(), tExpW[x][4*k].data(), tExpW[x][4*k+1].data(), tExpW[x][4*k+2].data(), tExpW[x][4*k+3].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + local_sum_4src(tLocalSum[x][k].data(), tExpW[x][4*k].data(), tExpW[x][4*k+1].data(), tExpW[x][4*k+2].data(), tExpW[x][4*k+3].data()); + }); } - new_sum_of_4_loc_sum<<>>(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + new_sum_of_4_loc_sum(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + }); } #else #ifdef _2D_UNROLL @@ -978,6 +1018,7 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype tileW_left tW_left[Xdim][Ydim]; #pragma clang loop unroll(full) for(int x=0;x::bits == 4) { @@ -1001,19 +1042,21 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype if(j==0){ #pragma clang loop unroll(full) for(int x=0;x<<>>(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileO::ValidRow, [&] { + global_update(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data()); + }); } } // 更新最大值状态 #pragma clang loop unroll(full) for(int x=0;x::bits == 4) { - normalize_with_last_update_nocast<<>>(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + pto::blkv::blkv_for_2d(tileO::ValidRow, tileO::ValidCol, [&] { + normalize_with_last_update_nocast(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + }); TMOV_NORM(tO_cast[x], tO[x]); } else { - 
normalize_with_last_update<<>>(tO_cast[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + pto::blkv::blkv_for_2d(tileO::ValidRow, tileO::ValidCol, [&] { + normalize_with_last_update(tO_cast[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + }); } } // 写回全局内存 @@ -1046,4 +1093,6 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype } } +#ifdef _UNALIGN_2D_UNROLL #include "fa_unalign_2d_unroll.hpp" +#endif diff --git a/benchmark-linxisa/kernels/fa/fa_fp4_utils.h b/benchmark-linxisa/kernels/fa/fa_fp4_utils.h index d2f776b..ecdbd51 100644 --- a/benchmark-linxisa/kernels/fa/fa_fp4_utils.h +++ b/benchmark-linxisa/kernels/fa/fa_fp4_utils.h @@ -1,4 +1,9 @@ +#include + +using namespace pto::blkv; +using pto::type_traits; + template concept is_fp4_tile = requires (T tile) { type_traits::bits == 4; diff --git a/benchmark-linxisa/kernels/fa/fa_utils.h b/benchmark-linxisa/kernels/fa/fa_utils.h index 5744a1d..59ee045 100644 --- a/benchmark-linxisa/kernels/fa/fa_utils.h +++ b/benchmark-linxisa/kernels/fa/fa_utils.h @@ -1,5 +1,9 @@ // FA utility functions shared across different implementations +#include + +using namespace pto::blkv; + template void __vec__ flashsoftmax_dn_mout_cast_kernel( typename tileScale::TileDType __out__ rescale, diff --git a/benchmark-linxisa/kernels/fa/linx_blkc.h b/benchmark-linxisa/kernels/fa/linx_blkc.h new file mode 100644 index 0000000..df68bc9 --- /dev/null +++ b/benchmark-linxisa/kernels/fa/linx_blkc.h @@ -0,0 +1,121 @@ +#ifndef PTO_FA_LINX_BLKC_H +#define PTO_FA_LINX_BLKC_H + +#include + +#ifndef __bf16 +#define __bf16 pto_bf16_t +#endif + +using pto::blkv::blkv_fexp; +using pto::blkv::blkv_fsqrt; +using pto::blkv::blkv_get_index_x; +using pto::blkv::blkv_get_index_y; +using pto::blkv::blkv_get_index_z; +using pto::blkv::blkv_get_tile_ptr; +using pto::blkv::blkv_max; + +using __fp4_e1m2x2 = pto::fp4_e2m1_t; +using __bf16x2 = uint32_t; + +template +inline To 
linx_cvt_as(From value) { + return static_cast(value); +} + +template +inline void linx_cvt(To &dst, From value) { + dst = linx_cvt_as(value); +} + +template +inline void linx_cvt_package(To &dst, A a, B b) { + const uint16_t hi = + static_cast(pto::lowp_word_from_bf16(pto::bf16_t{static_cast(a)}) & + 0xffffu); + const uint16_t lo = + static_cast(pto::lowp_word_from_bf16(pto::bf16_t{static_cast(b)}) & + 0xffffu); + if constexpr (sizeof(To) >= 4) { + dst = static_cast((static_cast(hi) << 16) | lo); + } else { + dst = static_cast(pto::float_to_fp4_e2m1(static_cast(a))); + } +} + +inline void blkv_bf16_fmax(__bf16 &dst, __bf16 a, __bf16 b) { + dst = a < b ? b : a; +} + +inline __bf16 blkv_bf16_max(__bf16 a, __bf16 b) { return a < b ? b : a; } +inline __bf16 blkv_bf16_mul(__bf16 a, __bf16 b) { return a * b; } +inline __bf16 blkv_bf16_div(__bf16 a, __bf16 b) { return a / b; } + +inline void blkv_bf16_fadd(__bf16 &dst, __bf16 a, __bf16 b) { dst = a + b; } +inline void blkv_bf16_fsub(__bf16 &dst, __bf16 a, __bf16 b) { dst = a - b; } +inline void blkv_bf16_fmul(__bf16 &dst, __bf16 a, __bf16 b) { dst = a * b; } +inline void blkv_bf16_fdiv(__bf16 &dst, __bf16 a, __bf16 b) { dst = a / b; } +inline void blkv_bf16_fexp(__bf16 &dst, __bf16 a) { + dst = pto::blkv::blkv_fexp(static_cast(a)); +} + +inline float linx_bf16x2_hi(__bf16x2 value) { + return static_cast(pto::bf16_from_lowp_word((value >> 16) & 0xffffu)); +} + +inline float linx_bf16x2_lo(__bf16x2 value) { + return static_cast(pto::bf16_from_lowp_word(value & 0xffffu)); +} + +inline __bf16x2 linx_pack_bf16x2(float hi, float lo) { + return (pto::lowp_word_from_bf16(pto::bf16_t{hi}) << 16) | + pto::lowp_word_from_bf16(pto::bf16_t{lo}); +} + +inline void blkv_bf16x2_fmax(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(pto::blkv::blkv_max(linx_bf16x2_hi(a), linx_bf16x2_hi(b)), + pto::blkv::blkv_max(linx_bf16x2_lo(a), linx_bf16x2_lo(b))); +} + +inline __bf16x2 blkv_bf16x2_mul(__bf16x2 a, __bf16x2 b) { + 
return linx_pack_bf16x2(linx_bf16x2_hi(a) * linx_bf16x2_hi(b), + linx_bf16x2_lo(a) * linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_mul(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = blkv_bf16x2_mul(a, b); +} + +inline void blkv_bf16x2_fadd(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(linx_bf16x2_hi(a) + linx_bf16x2_hi(b), + linx_bf16x2_lo(a) + linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_fsub(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(linx_bf16x2_hi(a) - linx_bf16x2_hi(b), + linx_bf16x2_lo(a) - linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_fmul(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = blkv_bf16x2_mul(a, b); +} + +inline void blkv_bf16x2_fdiv(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(linx_bf16x2_hi(a) / linx_bf16x2_hi(b), + linx_bf16x2_lo(a) / linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_fmsub(__bf16x2 &dst, __bf16x2 a, __bf16x2 b, + __bf16x2 c) { + dst = linx_pack_bf16x2((linx_bf16x2_hi(a) * linx_bf16x2_hi(b)) - + linx_bf16x2_hi(c), + (linx_bf16x2_lo(a) * linx_bf16x2_lo(b)) - + linx_bf16x2_lo(c)); +} + +inline void blkv_bf16x2_fexp(__bf16x2 &dst, __bf16x2 a) { + dst = linx_pack_bf16x2(pto::blkv::blkv_fexp(linx_bf16x2_hi(a)), + pto::blkv::blkv_fexp(linx_bf16x2_lo(a))); +} + +#endif // PTO_FA_LINX_BLKC_H diff --git a/benchmark-linxisa/kernels/gather/gather.hpp b/benchmark-linxisa/kernels/gather/gather.hpp index 0222852..a19ec69 100644 --- a/benchmark-linxisa/kernels/gather/gather.hpp +++ b/benchmark-linxisa/kernels/gather/gather.hpp @@ -1,207 +1,92 @@ +#ifndef SUPERNPUBENCH_PTOISA_GATHER_HPP +#define SUPERNPUBENCH_PTOISA_GATHER_HPP + #include -#include "template_asm.h" +#include #include -#include - - -#define DUMP_TILE(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", 
ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%d ", DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) - - -#define DUMP_TILE_FLOAT(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%f ", DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) -template -void __vec__ gen_offset( - typename tile_shapeInOffset::TileDType __in__ in, // inOffset - typename tile_shapeOffset::TileDType __out__ out, // - const size_t n_base -) { - size_t data_width = sizeof(dtype); - size_t index_x = blkv_get_index_x(); - size_t index_y = blkv_get_index_y(); - size_t index = index_y * tile_shapeOffset::RowStride + index_x; - - size_t n_offset = (index_x + n_base); - size_t index_m = blkv_get_tile_ptr(in)[index_y] * gN; // 这里不需要m_base,因为传入的inOffset已经被切分为tile,起始元素就是m_base - size_t out_offset = index_m + n_offset; - out_offset *= data_width; - - blkv_get_tile_ptr(out)[index] = out_offset; -} - -template -void gen_offset_impl( - tile_shapeInOffset &in_offset, - tile_shapeOffset &out_offset, - const size_t n_base - ) { - static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, - "Only static shape supported"); - - - gen_offset<<>>( - in_offset.data(), - out_offset.data(), - n_base); -} - -template -void gather( - dtype *in_data_ptr, - otype *in_offset_ptr, - dtype *out_ptr - ) { - const size_t Mb = gM / tM; - const size_t Nb = gN / tN; - const size_t rmd_M = gM % tM; - const size_t rmd_N = gN % tN; - - using gm_shapeInOffset = global_tensor>; - using gm_shapeIn = global_tensor>; - using gm_shapeOut = global_tensor>; - - using tile_shapeInData = Tile; - using itIn = global_iterator; - tile_shapeInData inTile; - itIn gInIter(in_data_ptr); - 
- using tile_shapeInOffset = Tile; - using tile_shapeData = Tile; - using tile_shapeOffset = Tile; - using tile_shapeInOffset_rmd_n = Tile; - using tile_shapeData_rmd_n = Tile; - using tile_shapeOffset_rmd_n = Tile; - - using tile_shapeInOffset_rmd_mn = Tile; - using tile_shapeData_rmd_mn = Tile; - using tile_shapeOffset_rmd_mn = Tile; - - using tile_shapeInOffset_rmd_m = Tile; - using tile_shapeData_rmd_m = Tile; - using tile_shapeOffset_rmd_m = Tile; - - gm_shapeIn inGm(in_data_ptr); - - tile_shapeInOffset inOffsetTile; - tile_shapeData outTile; - tile_shapeOffset offsetTile; - - tile_shapeInOffset_rmd_n inOffsetTile_rmd_n; - tile_shapeData_rmd_n outTile_rmd_n; - tile_shapeOffset_rmd_n offsetTile_rmd_n; - - tile_shapeInOffset_rmd_mn inOffsetTile_rmd_mn; - tile_shapeData_rmd_mn outTile_rmd_mn; - tile_shapeOffset_rmd_mn offsetTile_rmd_mn; - - tile_shapeInOffset_rmd_m inOffsetTile_rmd_m; - tile_shapeData_rmd_m outTile_rmd_m; - tile_shapeOffset_rmd_m offsetTile_rmd_m; - - using itInOffset = global_iterator; - using itOut = global_iterator; - - itInOffset gInOffsetIter(in_offset_ptr); - itOut gOIter(out_ptr); - // for test /////////////////////////////////////// - // alignas(256) static uint32_t g_dump[tM * rmd_N]; - alignas(256) static uint32_t g_dump[tM * rmd_N]; - // // alignas(256) static uint32_t g_dump_inoffset[rmd_N]; - // alignas(256) static uint32_t g_dump_inoffset[rmd_N]; - // alignas(256) static dtype g_dump_outdata[tM * rmd_N]; - // /////////////////////////////////////// - - size_t n_base = 0; - - // #pragma clang loop unroll(full) - for (int j = 0; j < Mb; ++j) { - printf("j = %d\n", j); - for (int i = 0; i < Nb; ++i) { - auto gInOffset = gInOffsetIter(0, j); - auto gO = gOIter(j, i); - TCOPYIN(inOffsetTile, gInOffset); - // test - // auto gIn = gInIter(j, i); - // TCOPYIN(inTile, gIn); - n_base = i * tN; - // printf("j = %d\n", j); - // printf("i = %d\n", i); - // printf("base = %d\n", base); - // printf("in_shape[0] = %d\n", in_shape[0]); - 
gen_offset_impl(inOffsetTile, offsetTile, n_base); - - MGATHER(outTile, inGm, offsetTile); - - // printf("inGm = %d\n", inGm); - // DUMP_TILE("inOffsetTile", inOffsetTile, g_dump_inoffset, 1, tN); - // DUMP_TILE("offsetTile", offsetTile, g_dump, 1, tN); - // DUMP_TILE_FLOAT("inTile", inTile, g_dump_outdata, 1, tN); - // DUMP_TILE_FLOAT("outTile", outTile, g_dump_outdata, 1, tN); - TCOPYOUT(gO, outTile); - } - if constexpr (rmd_N) { - auto gInOffset = gInOffsetIter(0, j); - auto gO = gOIter(j, Nb); - n_base = Nb * tN; - TCOPYIN(inOffsetTile_rmd_n, gInOffset); - gen_offset_impl(inOffsetTile_rmd_n, offsetTile_rmd_n, n_base); - MGATHER(outTile_rmd_n, inGm, offsetTile_rmd_n); - // DUMP_TILE("inOffsetTile_rmd_n", inOffsetTile_rmd_n, g_dump_inoffset, 1, rmd_N); - // DUMP_TILE_FLOAT("outTile_rmd_n", outTile_rmd_n, g_dump_outdata, 20, rmd_N); - // DUMP_TILE("offsetTile_rmd_n", offsetTile_rmd_n, g_dump, 20, rmd_N); - TCOPYOUT(gO, outTile_rmd_n); - } +using namespace pto; + +template +void gather(dtype *in_data_ptr, otype *in_offset_ptr, dtype *out_ptr) { + constexpr size_t kFullRows = gM / tM; + constexpr size_t kTailRows = gM % tM; + constexpr size_t kFullCols = gN / tN; + constexpr size_t kTailCols = gN % tN; + + using InputGlobal = global_tensor>; + using OffsetGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + + using OffsetTile = + Tile; + using DataTile = Tile; + using OffsetIterator = global_iterator; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_data_ptr); + OffsetIterator offset_iter(in_offset_ptr); + OutputIterator output_iter(out_ptr); + + for (size_t row_tile = 0; row_tile < kFullRows; ++row_tile) { + OffsetTile row_index; + TLOAD(row_index, offset_iter(0, static_cast(row_tile))); + for (size_t col_tile = 0; col_tile < kFullCols; ++col_tile) { + DataTile out_tile; + InputGlobal adjusted_input(in_data_ptr + col_tile * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(output_iter(static_cast(row_tile), + 
static_cast(col_tile)), + out_tile); } - if constexpr (rmd_M) { - for (int i = 0; i < Nb; ++i) { - auto gInOffset = gInOffsetIter(0, Mb); - auto gO = gOIter(Mb, i); - n_base = i * tN; - TCOPYIN(inOffsetTile_rmd_m, gInOffset); - gen_offset_impl(inOffsetTile_rmd_m, offsetTile_rmd_m, n_base); - MGATHER(outTile_rmd_m, inGm, offsetTile_rmd_m); - TCOPYOUT(gO, outTile_rmd_m); - } - if constexpr (rmd_N) { - auto gInOffset = gInOffsetIter(0, Mb); - auto gO = gOIter(Mb, Nb); - n_base = Nb * tN; - TCOPYIN(inOffsetTile_rmd_mn, gInOffset); - gen_offset_impl(inOffsetTile_rmd_mn, offsetTile_rmd_mn, n_base); - MGATHER(outTile_rmd_mn, inGm, offsetTile_rmd_mn); - TCOPYOUT(gO, outTile_rmd_mn); - } + if constexpr (kTailCols != 0) { + using TailColTile = Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailColTile out_tile; + InputGlobal adjusted_input(in_data_ptr + kFullCols * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(tail_output_iter(static_cast(row_tile), + static_cast(kFullCols)), + out_tile); } - + } + + if constexpr (kTailRows != 0) { + using TailOffsetTile = + Tile; + using TailOffsetIterator = global_iterator; + TailOffsetIterator tail_offset_iter(in_offset_ptr); + TailOffsetTile row_index; + TLOAD(row_index, tail_offset_iter(0, static_cast(kFullRows))); + for (size_t col_tile = 0; col_tile < kFullCols; ++col_tile) { + using TailRowTile = Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailRowTile out_tile; + InputGlobal adjusted_input(in_data_ptr + col_tile * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(tail_output_iter(static_cast(kFullRows), + static_cast(col_tile)), + out_tile); + } + if constexpr (kTailCols != 0) { + using TailTile = Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailTile out_tile; + InputGlobal adjusted_input(in_data_ptr + kFullCols * tN); + MGATHER(out_tile, 
adjusted_input, row_index); + TSTORE(tail_output_iter(static_cast(kFullRows), + static_cast(kFullCols)), + out_tile); + } + } } - - +#endif // SUPERNPUBENCH_PTOISA_GATHER_HPP diff --git a/benchmark-linxisa/kernels/gather/gather_pto.hpp b/benchmark-linxisa/kernels/gather/gather_pto.hpp new file mode 100644 index 0000000..8835c43 --- /dev/null +++ b/benchmark-linxisa/kernels/gather/gather_pto.hpp @@ -0,0 +1,199 @@ +// ============================================================================ +// Gather 算子 — PTO 一层编程模型 +// +// 原始 gather.hpp 策略: +// 对每个输出 tile (tM, tN): +// 1. TCOPYIN 加载 offset tile (行索引, 1×tM) from GM +// 2. __vec__ gen_offset 计算字节偏移: +// offset[row,col] = (in_offset[row] * gN + n_base + col) * sizeof(dtype) +// 3. MGATHER 按字节偏移从数据表取数 (旧 MGATHER, 字节偏移语义) +// 4. TCOPYOUT 写回输出 +// +// PTO 一层策略: +// 对每个输出 tile (tM, tN): +// 1. TLOAD 加载 offset tile (行索引, 1×tM) from GM +// 2. MGATHER 按行索引直接取数: +// dst[r,:] = table[idx[r], :] +// 通过调整 table 指针 (+n_base) 处理列偏移 +// (PTO ISA MGATHER 使用行索引, 非 字节偏移, 无需 gen_offset) +// 3. 
TSTORE 写回输出 +// +// gen_offset __vec__ 块完全消除: MGATHER 内部完成 +// 行索引 → 行地址 的转换 (tablePtr + idx * tableRowStride)。 +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ 当前编译器不支持 / 不完整的指令汇总 │ +// ├──────────┬──────────────────┬──────────────────────────────────────────┤ +// │ Pto ISA │ 当前编译器状态 │ 说明 │ +// │ 指令 │ │ │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TLOAD │ API 有(名不同), │ PTO ISA 名 TLOAD;当前编译器名 TCOPYIN; │ +// │ │ │ 核心参数 (dst, src) 和行为一致 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ MGATHER │ 部分支持 │ template_asm.h 有 MGATHER (asm volatile);│ +// │ │ │ 但缺 Coalesce 模板参数 (无法选 Row/Elem);│ +// │ │ │ 且旧实现按字节偏移取数, │ +// │ │ │ PTO ISA Coalesce::Row 按行索引取数 │ +// ├──────────┼──────────────────┼──────────────────────────────────────────┤ +// │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE;当前编译器名 TCOPYOUT;│ +// │ │ │ 核心参数 (dst, src) 和行为一致 │ +// └──────────┴──────────────────┴──────────────────────────────────────────┘ +// +// PTO ISA 文档签名 (Declared in include/pto/pto_instr.hpp): +// +// TLOAD: +// template +// PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, +// WaitEvents &... events); +// +// MGATHER (Coalesce::Row 模式): +// template +// PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, +// TileInd &indexes, WaitEvents &... events); +// +// Row 模式语义: dst[r,:] = table[idx[r], :] +// - idx 为行索引 (非字节偏移) +// - tablePtr + idx * tableRowStride 定位到行起始 +// - 读取 validCol 个元素到 dst 对应行 +// - 约束 (A2A3): TileIdx::ValidRow==1, TileIdx::ValidCol==TileDst::ValidRow +// - 约束 (A5): 同上, 或 [R,1] ColMajor; 且 staticShape[4]==ValidCol +// +// TSTORE: +// template +// PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, +// WaitEvents &... events); +// ============================================================================ + +#include +#include +// #include // [!] 
PTO ISA C++ Intrinsic — 当前编译器未提供 + +#include +#include + +// ---------------------------------------------------------------------------- +// gather: 按行索引从数据表中 gather 行 +// +// 输入: +// in_data_ptr — 数据表, shape (gK, gN), row-major +// in_offset_ptr — 行索引数组, shape (gM,), 每个元素是数据表的行号 +// out_ptr — 输出, shape (gM, gN), out[j,:] = in_data[in_offset[j],:] +// +// 模板参数 (与原 gather.hpp 一致): +// dtype — 数据类型 (float, half, ...) +// otype — 索引类型 (uint32_t, int32_t) +// gK — 数据表行数 +// gM — 输出行数 (= 索引数组长度) +// gN — 数据表列数 (= 每行元素数) +// tM — tile 行数 +// tN — tile 列数 +// ---------------------------------------------------------------------------- +template +void gather( + dtype *in_data_ptr, + otype *in_offset_ptr, + dtype *out_ptr + ) { + const size_t Mb = gM / tM; + const size_t Nb = gN / tN; + const size_t rmd_M = gM % tM; + const size_t rmd_N = gN % tN; + + using gm_shapeInOffset = global_tensor>; + using gm_shapeIn = global_tensor>; + using gm_shapeOut = global_tensor>; + + using tile_shapeInOffset = Tile; + using tile_shapeData = Tile; + using tile_shapeInOffset_rmd_n = Tile; + using tile_shapeData_rmd_n = Tile; + using tile_shapeInOffset_rmd_mn = Tile; + using tile_shapeData_rmd_mn = Tile; + using tile_shapeInOffset_rmd_m = Tile; + using tile_shapeData_rmd_m = Tile; + + tile_shapeInOffset inOffsetTile; + tile_shapeData outTile; + tile_shapeInOffset_rmd_n inOffsetTile_rmd_n; + tile_shapeData_rmd_n outTile_rmd_n; + tile_shapeInOffset_rmd_mn inOffsetTile_rmd_mn; + tile_shapeData_rmd_mn outTile_rmd_mn; + tile_shapeInOffset_rmd_m inOffsetTile_rmd_m; + tile_shapeData_rmd_m outTile_rmd_m; + + using itInOffset = global_iterator; + using itOut = global_iterator; + + itInOffset gInOffsetIter(in_offset_ptr); + itOut gOIter(out_ptr); + + // ---- 主循环: Mb × Nb 个完整 tile ---- + for (int j = 0; j < Mb; ++j) { + for (int i = 0; i < Nb; ++i) { + auto gInOffset = gInOffsetIter(0, j); + auto gO = gOIter(j, i); + size_t n_base = i * tN; + + // TLOAD: 加载行索引 tile (1, tM) from GM + // [当前编译器] 名为 
TCOPYIN + TLOAD(inOffsetTile, gInOffset); + + // MGATHER: 按行索引从数据表取数 + // dst[r,:] = table[idx[r], :] + // table 指针偏移 n_base 个元素, 使取数起始列为 n_base + // (tablePtr + idx * gN + n_base 定位到 row idx, col n_base) + // [当前编译器] template_asm.h 的 MGATHER 无 Coalesce 模板参数; + // 且按字节偏移取数, 非行索引 + gm_shapeIn adjustedGm(in_data_ptr + n_base); + MGATHER(outTile, adjustedGm, inOffsetTile); + + // TSTORE: 写回输出 tile (tM, tN) to GM + // [当前编译器] 名为 TCOPYOUT + TSTORE(gO, outTile); + } + + // ---- rmd_N: 最后一个列块不完整 ---- + if constexpr (rmd_N) { + auto gInOffset = gInOffsetIter(0, j); + auto gO = gOIter(j, Nb); + size_t n_base = Nb * tN; + + TLOAD(inOffsetTile_rmd_n, gInOffset); + gm_shapeIn adjustedGm(in_data_ptr + n_base); + MGATHER(outTile_rmd_n, adjustedGm, inOffsetTile_rmd_n); + TSTORE(gO, outTile_rmd_n); + } + } + + // ---- rmd_M: 最后一个行块不完整 ---- + if constexpr (rmd_M) { + for (int i = 0; i < Nb; ++i) { + auto gInOffset = gInOffsetIter(0, Mb); + auto gO = gOIter(Mb, i); + size_t n_base = i * tN; + + TLOAD(inOffsetTile_rmd_m, gInOffset); + gm_shapeIn adjustedGm(in_data_ptr + n_base); + MGATHER(outTile_rmd_m, adjustedGm, inOffsetTile_rmd_m); + TSTORE(gO, outTile_rmd_m); + } + + // ---- rmd_M + rmd_N: 右下角不完整 ---- + if constexpr (rmd_N) { + auto gInOffset = gInOffsetIter(0, Mb); + auto gO = gOIter(Mb, Nb); + size_t n_base = Nb * tN; + + TLOAD(inOffsetTile_rmd_mn, gInOffset); + gm_shapeIn adjustedGm(in_data_ptr + n_base); + MGATHER(outTile_rmd_mn, adjustedGm, inOffsetTile_rmd_mn); + TSTORE(gO, outTile_rmd_mn); + } + } +} diff --git a/benchmark-linxisa/kernels/matmul/matmul.hpp b/benchmark-linxisa/kernels/matmul/matmul.hpp index a3671c7..14f574e 100644 --- a/benchmark-linxisa/kernels/matmul/matmul.hpp +++ b/benchmark-linxisa/kernels/matmul/matmul.hpp @@ -9,7 +9,12 @@ using namespace pto; -template +enum class PadValue { Null }; +enum class LayoutCvtEnum { ND2ZZ, ND2NN }; +template +void blk_tload(Args...); + +template void TCOPYOUT_ACC(GmOut &Gout, TileAcc &tAcc){ using TileAccOut = Tile; 
TileAccOut tAccOut; @@ -17,7 +22,7 @@ void TCOPYOUT_ACC(GmOut &Gout, TileAcc &tAcc){ TCOPYOUT(Gout, tAccOut); } -template +template void TCOPYOUT_ACC_DYNAMIC(GmOut &Gout, TileAcc &tAcc, size_t valid_row, size_t valid_col){ using TileAccOut = Tile; TileAccOut tAccOut(valid_row, valid_col); diff --git a/benchmark-linxisa/kernels/matmul/matmul_mx.hpp b/benchmark-linxisa/kernels/matmul/matmul_mx.hpp index b26e87f..d45b12b 100644 --- a/benchmark-linxisa/kernels/matmul/matmul_mx.hpp +++ b/benchmark-linxisa/kernels/matmul/matmul_mx.hpp @@ -1399,7 +1399,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { MATMUL(tACC, tA, tB); TCVT(tACCin, tACC);//[tM, tN] 256->1 , 256 -> 2 scaling factor // static_assert(tile_shapeB::ValidCol % (width_factor*128) == 0); // TODO, 暂不考虑padding,假设形状是规整的, 方便处理, taccin*ts_adder=tc_dequant - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_shapeACC::ValidRow, tile_shapeACC::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } k++; @@ -1419,7 +1421,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { MATMUL(tACC, tA, tB); TCVT(tACCin, tACC); - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_shapeACC::ValidRow, tile_shapeACC::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } TCOPYOUT(gACC, tAdder[(k+1)%2]); @@ -1449,7 +1453,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { TCOPYIN(ts, gS); MATMUL(tACC, tA, tB); TCVT(tACCin, tACC); - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_ACCin_tcols::ValidRow, tile_ACCin_tcols::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), 
tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } k++; @@ -1467,7 +1473,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { TCOPYIN(ts, gS); MATMUL(tACC, tA, tB); TCVT(tACCin, tACC); - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_shapeACC::ValidRow, tile_shapeACC::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } TCOPYOUT(gACC, tAdder[(k+1)%2]); @@ -1518,4 +1526,4 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { } -#endif \ No newline at end of file +#endif diff --git a/benchmark-linxisa/kernels/reduction/cumsum_colvec.hpp b/benchmark-linxisa/kernels/reduction/cumsum_colvec.hpp index e7ca443..a8219aa 100644 --- a/benchmark-linxisa/kernels/reduction/cumsum_colvec.hpp +++ b/benchmark-linxisa/kernels/reduction/cumsum_colvec.hpp @@ -1,180 +1,41 @@ #ifndef CUMSUMCOL_KERNEL_HPP #define CUMSUMCOL_KERNEL_HPP - #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once #include #include - -// ============================================== -// ============================================== - - - -template -void __vec__ cumsum_col_kernel( - typename tileSum::TileDType __out__ new_sum, - typename tileData::TileDType __out__ out, - const typename tileData::TileDType __in__ src, - const typename tileSum::TileDType __in__ old_sum -) -{ - size_t i = blkv_get_index_x(); - size_t sum_idx = i * tileSum::RowStride; - - __vbuf__ typename tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileData::DType *out_ptr = blkv_get_tile_ptr(out); - __vbuf__ typename tileData::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - - typename tileSum::DType upd_sum = old_sum_ptr[i]; -// printf("upd_sum = %d",upd_sum); -// typename tileData::DType upd_out = 
old_sum_ptr[i]; -/* - for(size_t j=0;j(sum_out); - } - new_sum_ptr[i] = upd_sum; -} - - - +using namespace pto; template void cumsum_col_rand( dtype *in_ptr, -// dtype *inzero_ptr, dtype *out_ptr -) +) { + using ScalarTile = Tile; + using gm_shape = global_tensor>; + using itType = global_iterator; - const int Mb = gIM / tM; - const int Nb = gIN / tN; - - const int rmd_M = gIM % tM; - const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeSum = global_tensor>; - using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // - using tile_shapeData_col = Tile; // - using tile_shapeSum = Tile; // - - using tile_shapeData_row = Tile; // - using tile_shapeData_cor = Tile; // - using tile_shapeSum_row = Tile; // - //need tM = 1; - - - gm_shapeIn inGm(in_ptr); -// gm_shapeOut ZeroGm(inzero_ptr); - gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); - - tile_shapeData dataTile; - tile_shapeData OutTile; - tile_shapeData_col dataTile_col; - tile_shapeData_col OutTile_col; - tile_shapeSum SumTile; - tile_shapeSum oldSumTile; - - tile_shapeData_row dataTile_row; - tile_shapeData_row OutTile_row; - tile_shapeData_cor dataTile_cor; - tile_shapeData_cor OutTile_cor; - - tile_shapeSum_row SumTile_row; - tile_shapeSum_row oldSumTile_row; - -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 + itType gIIter(in_ptr); + itType gOIter(out_ptr); - using itIn = global_iterator; -// using itZero = global_iterator; - using itOut = global_iterator; -// using itSum = global_iterator; + // 对每一列独立计算累积和 + for (int j = 0; j < gIN; ++j) { + ScalarTile running_sum; + TEXPANDS(running_sum, static_cast(0)); - itIn gIIter(in_ptr); - itOut gOIter(out_ptr); + for (int i = 0; i < gIM; ++i) { + auto gI = gIIter(i, j); + ScalarTile input_elem; + TLOAD(input_elem, gI); -// dtype zero = 0; + TADD(running_sum, running_sum, input_elem); - for (int j = 0; j < Nb; ++j) { - 
TEXPANDSCALAR(oldSumTile, 0);//初始化为0 - for (int i = 0; i < Mb; ++i) { - auto gI = gIIter(i, j); - auto gO = gOIter(i, j); - TCOPYIN(dataTile, gI); -// printf("in0 : %d, %d\n",in_ptr[i*tM], i*tM); - cumsum_col_kernel<<>>(SumTile.data(), OutTile.data(), dataTile.data(), oldSumTile.data()); - oldSumTile = SumTile; - TCOPYOUT(gO, OutTile); -// printf("out0 : %d,%d\n", out_ptr[i*tM],i*tM); - } - if constexpr (rmd_M > 0){ - auto gI = gIIter(Mb, j); - auto gO = gOIter(Mb, j); - TCOPYIN(dataTile_col, gI); - cumsum_col_kernel<<>>(SumTile.data(), OutTile_col.data(), dataTile_col.data(), oldSumTile.data()); - oldSumTile = SumTile; - TCOPYOUT(gO, OutTile_col); + auto gO = gOIter(i, j); + TSTORE(gO, running_sum); } -// TCOPYOUT(gO, SumTile); } - if constexpr (rmd_N > 0){ -// auto gZero = gZeroIter(0, Nb); -// auto gO = gOIter(0, Nb); - TEXPANDSCALAR(oldSumTile_row, 0);//初始化为0 -// TCOPYIN(oldSumTile_row, gZero);//初始化为0 - for (int i = 0; i < Mb; ++i) { - auto gI = gIIter(i, Nb); - auto gO = gOIter(i, Nb); - TCOPYIN(dataTile_row, gI); - cumsum_col_kernel<<>>(SumTile_row.data(), OutTile_row.data(), dataTile_row.data(), oldSumTile_row.data()); - oldSumTile_row = SumTile_row; - TCOPYOUT(gO, OutTile_row); - } - if constexpr (rmd_M > 0){ - auto gI = gIIter(Mb, Nb); - auto gO = gOIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - cumsum_col_kernel<<>>(SumTile_row.data(), OutTile_cor.data(), dataTile_cor.data(), oldSumTile_row.data()); - oldSumTile_row = SumTile_row; - TCOPYOUT(gO, OutTile_cor); - } - } -/* - for(int i = 0; i < gIN; i++){ - printf("out%d = %d\n", i, out_ptr[i]); - } -*/ } - - -#endif \ No newline at end of file +#endif diff --git a/benchmark-linxisa/kernels/reduction/cumsum_rowvec.hpp b/benchmark-linxisa/kernels/reduction/cumsum_rowvec.hpp index ec998c1..9be3b11 100644 --- a/benchmark-linxisa/kernels/reduction/cumsum_rowvec.hpp +++ b/benchmark-linxisa/kernels/reduction/cumsum_rowvec.hpp @@ -1,170 +1,42 @@ -#ifndef REDUCESUMTROWSUM_KERNEL_HPP -#define 
REDUCESUMTROWSUM_KERNEL_HPP - -#ifndef __vbuf__ -#define __vbuf__ -#endif - -#include - -using namespace pto; +#ifndef CUMSUMROW_KERNEL_HPP +#define CUMSUMROW_KERNEL_HPP #pragma once +#include #include #include -template -void __vec__ cumsum_row_kernel( - typename tileSum::TileDType __out__ new_sum, - const typename tileData::TileDType __out__ out, - const typename tileData::TileDType __in__ src, - const typename tileSum::TileDType __in__ old_sum -) -{ -// size_t i = blkv_get_index_x(); - size_t j = blkv_get_index_y(); - size_t sum_idx = j * tileSum::RowStride; - - __vbuf__ typename tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileData::DType *out_ptr = blkv_get_tile_ptr(out); - __vbuf__ typename tileData::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - - typename tileSum::DType upd_sum = old_sum_ptr[sum_idx]; -/* - for(size_t j=0;j(sum_out); - } - new_sum_ptr[sum_idx] = upd_sum; -} - - +using namespace pto; template void cumsum_row_rand( dtype *in_ptr, dtype *out_ptr -) +) { + using ScalarTile = Tile; + using gm_shape = global_tensor>; + using itType = global_iterator; - const int Mb = gIM / tM; - const int Nb = gIN / tN; - - const int rmd_M = gIM % tM; // todo 尾块怎么处理? - const int rmd_N = gIN % tN; // todo 尾块怎么处理? 
- - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeSum = global_tensor>; - using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_row = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeSum = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - - using tile_shapeData_col = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_cor = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeSum_col = Tile; - + itType gIIter(in_ptr); + itType gOIter(out_ptr); - gm_shapeIn inGm(in_ptr); - gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); + // 对每一行独立计算累积和 + for (int i = 0; i < gIM; ++i) { + ScalarTile running_sum; + TEXPANDS(running_sum, static_cast(0)); - tile_shapeData dataTile; - tile_shapeData_row dataTile_row; - tile_shapeData_col dataTile_col; - tile_shapeData_cor dataTile_cor; + for (int j = 0; j < gIN; ++j) { + auto gI = gIIter(i, j); + ScalarTile input_elem; + TLOAD(input_elem, gI); - tile_shapeData OutTile; - tile_shapeData_row OutTile_row; - tile_shapeData_col OutTile_col; - tile_shapeData_cor OutTile_cor; - - tile_shapeSum SumTile; - tile_shapeSum oldSumTile; - tile_shapeSum_col SumTile_col; - tile_shapeSum_col oldSumTile_col; + TADD(running_sum, running_sum, input_elem); -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; - using itOut = global_iterator; - - itIn gIIter(in_ptr); - itOut gOIter(out_ptr); - -// printf("tile_shapeSum::ValidCol = %d\n", tile_shapeSum::ValidCol); -// printf("tile_shapeSum::ValidRow = %d\n", tile_shapeSum::ValidRow); - - for (int j = 0; j < Mb; ++j) { -// auto gO = gOIter(j, 0); - TEXPANDSCALAR(oldSumTile, 0);//初始化为0 - //初始化old_sum的tile - for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(j, i); - auto gO = gOIter(j, i); - TCOPYIN(dataTile, gI); - cumsum_row_kernel<<<1, tile_shapeSum::ValidRow, 1>>>(SumTile.data(), OutTile.data(), dataTile.data(), oldSumTile.data()); -// 
reducesum_row_kernel<<>>(SumTile.data(), dataTile.data(), oldSumTile.data()); - oldSumTile = SumTile; - TCOPYOUT(gO, OutTile); + auto gO = gOIter(i, j); + TSTORE(gO, running_sum); } -// printf("end for%d\n",j); - //for row corner - if constexpr (rmd_N > 0){ - auto gI = gIIter(j, Nb); - auto gO = gOIter(j, Nb); - TCOPYIN(dataTile_row, gI); - cumsum_row_kernel<<<1, tile_shapeSum::ValidRow, 1>>>(SumTile.data(), OutTile_row.data(), dataTile_row.data(), oldSumTile.data()); -// reducesum_row_kernel<<>>(SumTile.data(), dataTile_row.data(), oldSumTile.data()); - oldSumTile = SumTile; - TCOPYOUT(gO, OutTile_row); - } - } - //for col cor - if constexpr (rmd_M > 0){ - TEXPANDSCALAR(oldSumTile_col, 0);//初始化为0 - //初始化old_sum的tile - for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(Mb, i); - auto gO = gOIter(Mb, i); - TCOPYIN(dataTile_col, gI); - cumsum_row_kernel<<<1, tile_shapeSum_col::ValidRow, 1>>>(SumTile_col.data(), OutTile_col.data(), dataTile_col.data(), oldSumTile_col.data()); - oldSumTile_col = SumTile_col; - TCOPYOUT(gO, OutTile_col); - } - if constexpr (rmd_N > 0){ - auto gI = gIIter(Mb, Nb); - auto gO = gOIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - cumsum_row_kernel<<<1, tile_shapeSum_col::ValidRow, 1>>>(SumTile_col.data(), OutTile_cor.data(), dataTile_cor.data(), oldSumTile_col.data()); - oldSumTile_col = SumTile_col; - TCOPYOUT(gO, OutTile_cor); - } - } -/* - for(int i = 0; i < gIM; i++){ - printf("out%d = %d\n", i, out_ptr[i]); } -*/ } #endif diff --git a/benchmark-linxisa/kernels/reduction/reducemax_colvec.hpp b/benchmark-linxisa/kernels/reduction/reducemax_colvec.hpp index 502db96..0c4dc7d 100644 --- a/benchmark-linxisa/kernels/reduction/reducemax_colvec.hpp +++ b/benchmark-linxisa/kernels/reduction/reducemax_colvec.hpp @@ -1,170 +1,85 @@ -#ifndef REDUCESUMCOLVEC_KERNEL_HPP -#define REDUCESUMCOLVEC_KERNEL_HPP - +#ifndef REDUCEMAXCOLVEC_KERNEL_HPP +#define REDUCEMAXCOLVEC_KERNEL_HPP #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once 
#include #include - -// ============================================== -// ============================================== -template -void __vec__ reducemax_col_kernel( - typename tileMax::TileDType __out__ new_max, - const typename tileSrc::TileDType __in__ src, - const typename tileMax::TileDType __in__ old_max -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename tileMax::DType *new_max_ptr = blkv_get_tile_ptr(new_max); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileMax::DType *old_max_ptr = blkv_get_tile_ptr(old_max); - - - typename tileMax::DType upd_max = old_max_ptr[i]; -/* - for(size_t j=0;j void reducemax_col_rand( dtype *in_ptr, -// dtype *inzero_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - + const int Nb = gIN / tN; const int rmd_M = gIM % tM; const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? - using gm_shapeIn = global_tensor>; // -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // - using tile_shapeData_col = Tile; // - using tile_shapeMax = Tile; // - - - - using tile_shapeData_row = Tile; // - using tile_shapeData_cor = Tile; // - using tile_shapeMax_row = Tile; // - //need tM = 1; - - - gm_shapeIn inGm(in_ptr); -// gm_shapeOut ZeroGm(inzero_ptr); + using tile_shapeData = Tile; + using tile_shapeData_col = Tile; + using tile_shapeMax = Tile; + using tile_shapeData_row = Tile; + using tile_shapeData_cor = Tile; + using tile_shapeMax_row = Tile; + + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); tile_shapeData dataTile; - tile_shapeData_col dataTile_col; + tile_shapeData_col dataTile_col; tile_shapeMax MaxTile; tile_shapeMax oldMaxTile; - tile_shapeData_row dataTile_row; - tile_shapeData_cor dataTile_cor; + tile_shapeData_cor dataTile_cor; tile_shapeMax_row MaxTile_row; - tile_shapeMax_row 
oldMaxTile_row; - -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 + tile_shapeMax_row oldMaxTile_row; - using itIn = global_iterator; - using itIn_row = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); - itIn_row gIIter_rmd_row(in_ptr); -// itZero gZeroIter(inzero_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// dtype zero = 0; - for (int j = 0; j < Nb; ++j) { -// auto gZero = gZeroIter(0, j); auto gO = gOIter(0, j); - TEXPANDSCALAR(oldMaxTile, 0);//初始化为0 -// TCOPYIN(oldSumTile, gZero);//初始化为0 - //初始化old_sum的tile - //need + TEXPANDS(oldMaxTile, static_cast(0)); + for (int i = 0; i < Mb; ++i) { auto gI = gIIter(i, j); - TCOPYIN(dataTile, gI); - reducemax_col_kernel<<>>(MaxTile.data(), dataTile.data(), oldMaxTile.data()); - oldMaxTile = MaxTile; + TLOAD(dataTile, gI); + TCOLMAX(MaxTile, dataTile); + TMAX(oldMaxTile, oldMaxTile, MaxTile); } - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gI = gIIter(Mb, j); - TCOPYIN(dataTile_col, gI); - reducemax_col_kernel<<>>(MaxTile.data(), dataTile_col.data(), oldMaxTile.data()); - oldMaxTile = MaxTile; + TLOAD(dataTile_col, gI); + TCOLMAX(MaxTile, dataTile_col); + TMAX(oldMaxTile, oldMaxTile, MaxTile); } - TCOPYOUT(gO, MaxTile); + TSTORE(gO, oldMaxTile); } - if constexpr (rmd_N > 0){ -// auto gZero = gZeroIter(0, Nb); + if constexpr (rmd_N > 0) { auto gO = gOIter(0, Nb); - TEXPANDSCALAR(oldMaxTile_row, 0);//初始化为0 -// TCOPYIN(oldSumTile_row, gZero);//初始化为0 - for (int i = 0; i < Mb; ++i) { + TEXPANDS(oldMaxTile_row, static_cast(0)); + + for (int i = 0; i < Mb; ++i) { auto gI = gIIter(i, Nb); - TCOPYIN(dataTile_row, gI); - reducemax_col_kernel<<>>(MaxTile_row.data(), dataTile_row.data(), oldMaxTile_row.data()); - oldMaxTile_row = MaxTile_row; + TLOAD(dataTile_row, gI); + TCOLMAX(MaxTile_row, dataTile_row); + TMAX(oldMaxTile_row, oldMaxTile_row, MaxTile_row); } - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gI = gIIter(Mb, Nb); 
- TCOPYIN(dataTile_cor, gI); - reducemax_col_kernel<<>>(MaxTile_row.data(), dataTile_cor.data(), oldMaxTile_row.data()); - oldMaxTile_row = MaxTile_row; + TLOAD(dataTile_cor, gI); + TCOLMAX(MaxTile_row, dataTile_cor); + TMAX(oldMaxTile_row, oldMaxTile_row, MaxTile_row); } - TCOPYOUT(gO, MaxTile_row); + TSTORE(gO, oldMaxTile_row); } } diff --git a/benchmark-linxisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp b/benchmark-linxisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp index adab096..04fa543 100644 --- a/benchmark-linxisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp +++ b/benchmark-linxisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp @@ -1,167 +1,46 @@ -#ifndef REDUCESUMCOLVEC_KERNEL_HPP -#define REDUCESUMCOLVEC_KERNEL_HPP - +#ifndef REDUCEMAXCOLVEC_KERNEL_HPP +#define REDUCEMAXCOLVEC_KERNEL_HPP #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once #include #include - -// ============================================== -// ============================================== -template -void __vec__ reducemax_col_tmp( - typename tileTmp::TileDType __out__ tmp_max, - const typename tileSrc::TileDType __in__ src -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename tileTmp::DType *tmp_max_ptr = blkv_get_tile_ptr(tmp_max); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); -// __vbuf__ typename tileMax::DType *old_max_ptr = blkv_get_tile_ptr(old_max); - - typename tileTmp::DType upd_tmp_max = 0; - - #pragma clang loop unroll(full) - for(size_t j=0;j -void __vec__ reducemax_col_final( - typename tileMax::TileDType __out__ new_max, - const typename tileTmp::TileDType __in__ src, - const typename tileMax::TileDType __in__ old_max -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename tileMax::DType *new_max_ptr = blkv_get_tile_ptr(new_max); - __vbuf__ typename tileTmp::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileMax::DType *old_max_ptr = blkv_get_tile_ptr(old_max); - - 
- typename tileMax::DType upd_max = old_max_ptr[i]; - - - size_t src_idx_0 = i * tileMax::ColStride + 0 * tileMax::ValidCol; - size_t src_idx_1 = i * tileMax::ColStride + 1 * tileMax::ValidCol; - size_t src_idx_2 = i * tileMax::ColStride + 2 * tileMax::ValidCol; - size_t src_idx_3 = i * tileMax::ColStride + 3 * tileMax::ValidCol; - size_t src_idx_4 = i * tileMax::ColStride + 4 * tileMax::ValidCol; - size_t src_idx_5 = i * tileMax::ColStride + 5 * tileMax::ValidCol; - size_t src_idx_6 = i * tileMax::ColStride + 6 * tileMax::ValidCol; - size_t src_idx_7 = i * tileMax::ColStride + 7 * tileMax::ValidCol; - typename tileMax::DType max_01 = blkv_max(src_ptr[src_idx_0], src_ptr[src_idx_1]); - typename tileMax::DType max_23 = blkv_max(src_ptr[src_idx_2], src_ptr[src_idx_3]); - typename tileMax::DType max_45 = blkv_max(src_ptr[src_idx_4], src_ptr[src_idx_5]); - typename tileMax::DType max_67 = blkv_max(src_ptr[src_idx_6], src_ptr[src_idx_7]); - typename tileMax::DType max_0123 = blkv_max(max_01, max_23); - typename tileMax::DType max_4567 = blkv_max(max_45, max_67); - typename tileMax::DType max_all = blkv_max(max_0123, max_4567); - -// upd_max = upd_max + max_tmp; - - - new_max_ptr[i] = blkv_max(max_all, upd_max); -} - - - - +using namespace pto; template void reducemax_col_rand( dtype *in_ptr, -// dtype *inzero_ptr, dtype *out_ptr -) +) { - -// const int Mb = (gIM/8) / tM; - - const int rmd_M = gIM % tM; - const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- - using gm_shapeIn = global_tensor>; // -// using gm_shapeMax = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // -// using tile_shapeData_col = Tile; // - using tile_shapeTmp = Tile; // - using tile_shapeMax = Tile; // - - - -// using tile_shapeData_row = Tile; // -// using tile_shapeData_cor = Tile; // -// using tile_shapeMax_row = Tile; // - //need tM = 1; + using tile_shapeData = Tile; + using tile_shapeTmp = Tile; + using tile_shapeMax = Tile; - - gm_shapeIn inGm(in_ptr); -// gm_shapeOut ZeroGm(inzero_ptr); + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeMax olcMaxGm(old_max_ptr); tile_shapeData dataTile; -// tile_shapeData_col dataTile_col; - tile_shapeTmp TmpTile; + tile_shapeTmp TmpTile; tile_shapeMax MaxTile; tile_shapeMax oldMaxTile; -// tile_shapeData_row dataTile_row; -// tile_shapeData_cor dataTile_cor; -// tile_shapeMax_row MaxTile_row; -// tile_shapeMax_row oldMaxTile_row; - -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; - using itIn_row = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); - itIn_row gIIter_rmd_row(in_ptr); -// itZero gZeroIter(inzero_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); - auto gO = gOIter(0, 0); - TEXPANDSCALAR(oldMaxTile, 0);//初始化为0 + TEXPANDS(oldMaxTile, static_cast(0)); auto gI = gIIter(0, 0); - TCOPYIN(dataTile, gI);//补0的TLOAD - reducemax_col_tmp<<>>(TmpTile.data(), dataTile.data()); - reducemax_col_final<<>>(MaxTile.data(), TmpTile.data(), oldMaxTile.data()); - oldMaxTile = MaxTile; - TCOPYOUT(gO, MaxTile); + TLOAD(dataTile, gI); + TCOLMAX(TmpTile, dataTile); + TCOLMAX(MaxTile, TmpTile); + TMAX(oldMaxTile, oldMaxTile, MaxTile); + TSTORE(gO, MaxTile); } #endif diff --git a/benchmark-linxisa/kernels/reduction/reducemax_rowvec.hpp b/benchmark-linxisa/kernels/reduction/reducemax_rowvec.hpp index 6b95d70..cb5b23e 100644 --- 
a/benchmark-linxisa/kernels/reduction/reducemax_rowvec.hpp +++ b/benchmark-linxisa/kernels/reduction/reducemax_rowvec.hpp @@ -1,180 +1,95 @@ -#ifndef REDUCESUMTROWSUM_KERNEL_HPP -#define REDUCESUMTROWSUM_KERNEL_HPP - -#ifndef __vbuf__ -#define __vbuf__ -#endif - -#include -#include "template_asm.h" - -using namespace pto; +#ifndef REDUCEMAXROWVEC_KERNEL_HPP +#define REDUCEMAXROWVEC_KERNEL_HPP #pragma once +#include #include #include -template -void __vec__ reducemax_row_kernel( - typename tileMax::TileDType __out__ new_max, - const typename tileSrc::TileDType __in__ src, - const typename tileMax::TileDType __in__ old_max -) -{ -// size_t i = blkv_get_index_x(); - size_t j = blkv_get_index_x(); -// size_t j = blkv_get_index_y(); - size_t idx = j * tileMax::RowStride; - - __vbuf__ typename tileMax::DType *new_max_ptr = blkv_get_tile_ptr(new_max); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileMax::DType *old_max_ptr = blkv_get_tile_ptr(old_max); - - - typename tileMax::DType upd_max = old_max_ptr[idx]; - - - #pragma clang loop unroll(full) - for(size_t i=0;i void reducemax_row_rand( dtype *in_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - - const int rmd_M = gIM % tM; // todo 尾块怎么处理? - const int rmd_N = gIN % tN; // todo 尾块怎么处理? 
+ const int Nb = gIN / tN; + const int rmd_M = gIM % tM; + const int rmd_N = gIN % tN; - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_row = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeMax = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - - using tile_shapeData_col = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_cor = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeMax_col = Tile; - - - gm_shapeIn inGm(in_ptr); + using tile_shapeData = Tile; + using tile_shapeData_row = Tile; + using tile_shapeMax = Tile; + using tile_shapeData_col = Tile; + using tile_shapeData_cor = Tile; + using tile_shapeMax_col = Tile; + using tile_shapeTmp = Tile; + using tile_shapeTmp_row = Tile; + using tile_shapeTmp_col = Tile; + using tile_shapeTmp_cor = Tile; + + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); - tile_shapeData dataTile; + tile_shapeData dataTile; tile_shapeData_row dataTile_row; tile_shapeData_col dataTile_col; - tile_shapeData_cor dataTile_cor; - + tile_shapeData_cor dataTile_cor; tile_shapeMax MaxTile; tile_shapeMax oldMaxTile; tile_shapeMax_col MaxTile_col; - tile_shapeMax_col oldMaxTile_col; + tile_shapeMax_col oldMaxTile_col; + tile_shapeTmp tmpTile; + tile_shapeTmp_row tmpTile_row; + tile_shapeTmp_col tmpTile_col; + tile_shapeTmp_cor tmpTile_cor; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// printf("tile_shapeSum::ValidCol = %d\n", tile_shapeSum::ValidCol); -// printf("tile_shapeSum::ValidRow = %d\n", tile_shapeSum::ValidRow); -// printf("before for\n"); for (int j = 0; j < Mb; ++j) { auto gO = gOIter(j, 0); - 
TEXPANDSCALAR(oldMaxTile, 0);//初始化为0 - //初始化old_sum的tile + TEXPANDS(oldMaxTile, static_cast(0)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(j, i); -// printf("before copy in , %d\n", i); - TCOPYIN(dataTile, gI); - reducemax_row_kernel<<>>(MaxTile.data(), dataTile.data(), oldMaxTile.data()); -// reducesum_row_kernel<<<1, tile_shapeSum::ValidRow, 1>>>(SumTile.data(), dataTile.data(), oldSumTile.data()); -// printf("kernel , %d\n", i); - oldMaxTile = MaxTile; + auto gI = gIIter(j, i); + TLOAD(dataTile, gI); + TROWMAX(MaxTile, dataTile, tmpTile); + TMAX(oldMaxTile, oldMaxTile, MaxTile); } -// printf("end for%d\n",j); - //for row corner - if constexpr (rmd_N > 0){ + if constexpr (rmd_N > 0) { auto gI = gIIter(j, Nb); - TCOPYIN(dataTile_row, gI); - reducemax_row_kernel<<>>(MaxTile.data(), dataTile_row.data(), oldMaxTile.data()); -// reducesum_row_kernel<<>>(SumTile.data(), dataTile_row.data(), oldSumTile.data()); - oldMaxTile = MaxTile; + TLOAD(dataTile_row, gI); + TROWMAX(MaxTile, dataTile_row, tmpTile_row); + TMAX(oldMaxTile, oldMaxTile, MaxTile); } -// printf("before tcopyout\n"); - TCOPYOUT(gO, MaxTile); -// printf("end tcopyout\n"); + TSTORE(gO, oldMaxTile); } - //for col cor - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gO = gOIter(Mb, 0); - TEXPANDSCALAR(oldMaxTile_col, 0);//初始化为0 - //初始化old_sum的tile + TEXPANDS(oldMaxTile_col, static_cast(0)); + for (int i = 0; i < Nb; ++i) { auto gI = gIIter(Mb, i); - TCOPYIN(dataTile_col, gI); - reducemax_row_kernel<<>>(MaxTile_col.data(), dataTile_col.data(), oldMaxTile_col.data()); - oldMaxTile_col = MaxTile_col; + TLOAD(dataTile_col, gI); + TROWMAX(MaxTile_col, dataTile_col, tmpTile_col); + TMAX(oldMaxTile_col, oldMaxTile_col, MaxTile_col); } - if constexpr (rmd_N > 0){ + if constexpr (rmd_N > 0) { auto gI = gIIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - reducemax_row_kernel<<>>(MaxTile_col.data(), dataTile_cor.data(), oldMaxTile_col.data()); - oldMaxTile_col = MaxTile_col; + TLOAD(dataTile_cor, gI); 
+ TROWMAX(MaxTile_col, dataTile_cor, tmpTile_cor); + TMAX(oldMaxTile_col, oldMaxTile_col, MaxTile_col); } - TCOPYOUT(gO, MaxTile_col); - } -/* - for(int i = 0; i < gIM; i++){ - printf("out%d = %d\n", i, out_ptr[i]); + TSTORE(gO, oldMaxTile_col); } -*/ -// printf("end program\n"); } #endif diff --git a/benchmark-linxisa/kernels/reduction/reducemax_rowvec_single_tree.hpp b/benchmark-linxisa/kernels/reduction/reducemax_rowvec_single_tree.hpp index 8f066d2..8eae796 100644 --- a/benchmark-linxisa/kernels/reduction/reducemax_rowvec_single_tree.hpp +++ b/benchmark-linxisa/kernels/reduction/reducemax_rowvec_single_tree.hpp @@ -1,233 +1,60 @@ #ifndef REDUCEMAXTROWMAX_KERNEL_HPP #define REDUCEMAXTROWMAX_KERNEL_HPP -#ifndef __vbuf__ -#define __vbuf__ -#endif - -#include - -using namespace pto; - #pragma once +#include #include #include -template -void __vec__ reducemax_row_kernel( - typename tileTmpMax::TileDType __out__ new_max, - const typename tileSrc::TileDType __in__ src, - const typename tileSrcCol::TileDType __in__ src_col, - const typename tileTmpMax::TileDType __in__ old_max, - const size_t tile_idx -) -{ - - size_t j = blkv_get_index_x(); - size_t z = blkv_get_index_y(); - size_t stride_src = z * (tileSrc::ValidCol/4) * tileSrc::ColStride; - size_t stride_src_col = z * (tileSrcCol::ValidCol/4) * tileSrcCol::ColStride; - - __vbuf__ typename tileTmpMax::DType *new_max_ptr = blkv_get_tile_ptr(new_max); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSrc::DType *src_col_ptr = blkv_get_tile_ptr(src_col); - __vbuf__ typename tileTmpMax::DType *old_max_ptr = blkv_get_tile_ptr(old_max); - -/* - #pragma clang loop unroll(full) - for(size_t i=0;i -void __vec__ reducemax_row_final_kernel( - typename tileMax::TileDType __out__ new_max, - const typename tileTmpMax::TileDType __in__ tmp_max -){ - size_t j = blkv_get_index_x(); - size_t idx = j * tileMax::RowStride; - - __vbuf__ typename tileMax::DType *new_max_ptr = 
blkv_get_tile_ptr(new_max); - __vbuf__ typename tileTmpMax::DType *tmp_max_ptr = blkv_get_tile_ptr(tmp_max); - - #pragma clang loop unroll(full) - for(size_t i=0;i void reducemax_row_rand( dtype *in_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - - const int rmd_M = gIM % tM; // todo 尾块怎么处理? - const int rmd_N = gIN % tN; // todo 尾块怎么处理? - + const int Nb = gIN / tN; + const int rmd_M = gIM % tM; + const int rmd_N = gIN % tN; - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeMax = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeDataCol = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeMax = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - using tile_shapeTmpMax = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec + using tile_shapeData = Tile; + using tile_shapeMax = Tile; + using tile_shapeTmpMax = Tile; + using tile_shapeTmp = Tile; - - gm_shapeIn inGm(in_ptr); + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeMax olcMaxGm(old_max_ptr); - tile_shapeData dataTile; - tile_shapeDataCol dataTile_col; + tile_shapeData dataTile; tile_shapeMax MaxTile; tile_shapeTmpMax oldtmpMaxTile; tile_shapeTmpMax tmpMaxTile; + tile_shapeTmp tmpTile; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); - auto gO = gOIter(0, 0); - TEXPANDSCALAR(oldtmpMaxTile, 0);//初始化为0 - TEXPANDSCALAR(dataTile_col, 0);//初始化为0 + TEXPANDS(oldtmpMaxTile, static_cast(0)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(0, i); - TCOPYIN(dataTile, gI); - reducemax_row_kernel<<>>(tmpMaxTile.data(), - dataTile.data(), - dataTile_col.data(), - oldtmpMaxTile.data(), - i); - oldtmpMaxTile = tmpMaxTile; + auto gI = gIIter(0, i); + TLOAD(dataTile, 
gI); + tile_shapeMax partialMax; + TROWMAX(partialMax, dataTile, tmpTile); + TMAX(oldtmpMaxTile, oldtmpMaxTile, partialMax); } - reducemax_row_final_kernel<<>>(MaxTile.data(), - tmpMaxTile.data()); - TCOPYOUT(gO, MaxTile); + TMOV(tmpMaxTile, oldtmpMaxTile); + + TROWMAX(MaxTile, tmpMaxTile, tmpTile); + TSTORE(gO, MaxTile); } #endif - diff --git a/benchmark-linxisa/kernels/reduction/reduceprod_colvec.hpp b/benchmark-linxisa/kernels/reduction/reduceprod_colvec.hpp index 15d12c1..a5b1d4b 100644 --- a/benchmark-linxisa/kernels/reduction/reduceprod_colvec.hpp +++ b/benchmark-linxisa/kernels/reduction/reduceprod_colvec.hpp @@ -1,144 +1,85 @@ -#ifndef REDUCESUMCOLVEC_KERNEL_HPP -#define REDUCESUMCOLVEC_KERNEL_HPP - +#ifndef REDUCEPRODCOLVEC_KERNEL_HPP +#define REDUCEPRODCOLVEC_KERNEL_HPP #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once #include #include - -// ============================================== -// ============================================== -template -void __vec__ reduceprod_col_kernel( - typename timeProd::TileDType __out__ new_prod, - const typename tileSrc::TileDType __in__ src, - const typename timeProd::TileDType __in__ old_prod -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename timeProd::DType *new_prod_ptr = blkv_get_tile_ptr(new_prod); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename timeProd::DType *old_prod_ptr = blkv_get_tile_ptr(old_prod); - - - typename timeProd::DType upd_prod = old_prod_ptr[i]; - - #pragma clang loop unroll(full) - for(size_t j=0;j void reduceprod_col_rand( dtype *in_ptr, -// dtype *inzero_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - + const int Nb = gIN / tN; const int rmd_M = gIM % tM; const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- using gm_shapeIn = global_tensor>; // -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // - using tile_shapeData_col = Tile; // - using tile_shapeProd = Tile; // - - - - using tile_shapeData_row = Tile; // - using tile_shapeData_cor = Tile; // - using tile_shapeProd_row = Tile; // - //need tM = 1; - - - gm_shapeIn inGm(in_ptr); -// gm_shapeOut ZeroGm(inzero_ptr); + using tile_shapeData = Tile; + using tile_shapeData_col = Tile; + using tile_shapeProd = Tile; + using tile_shapeData_row = Tile; + using tile_shapeData_cor = Tile; + using tile_shapeProd_row = Tile; + + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); tile_shapeData dataTile; - tile_shapeData_col dataTile_col; + tile_shapeData_col dataTile_col; tile_shapeProd ProdTile; tile_shapeProd oldProdTile; - tile_shapeData_row dataTile_row; - tile_shapeData_cor dataTile_cor; + tile_shapeData_cor dataTile_cor; tile_shapeProd_row ProdTile_row; - tile_shapeProd_row oldProdTile_row; + tile_shapeProd_row oldProdTile_row; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; - using itIn_row = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); - itIn_row gIIter_rmd_row(in_ptr); -// itZero gZeroIter(inzero_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// dtype zero = 0; - for (int j = 0; j < Nb; ++j) { -// auto gZero = gZeroIter(0, j); auto gO = gOIter(0, j); - TEXPANDSCALAR(oldProdTile, 0);//初始化为0 -// TCOPYIN(oldSumTile, gZero);//初始化为0 - //初始化old_sum的tile - //need + TEXPANDS(oldProdTile, static_cast(1)); + for (int i = 0; i < Mb; ++i) { auto gI = gIIter(i, j); - TCOPYIN(dataTile, gI); - reduceprod_col_kernel<<>>(ProdTile.data(), dataTile.data(), oldProdTile.data()); - oldProdTile = ProdTile; + TLOAD(dataTile, gI); + TCOLPROD(ProdTile, dataTile); + TMUL(oldProdTile, oldProdTile, 
ProdTile); } - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gI = gIIter(Mb, j); - TCOPYIN(dataTile_col, gI); - reduceprod_col_kernel<<>>(ProdTile.data(), dataTile_col.data(), oldProdTile.data()); - oldProdTile = ProdTile; + TLOAD(dataTile_col, gI); + TCOLPROD(ProdTile, dataTile_col); + TMUL(oldProdTile, oldProdTile, ProdTile); } - TCOPYOUT(gO, ProdTile); + TSTORE(gO, oldProdTile); } - if constexpr (rmd_N > 0){ -// auto gZero = gZeroIter(0, Nb); + if constexpr (rmd_N > 0) { auto gO = gOIter(0, Nb); - TEXPANDSCALAR(oldProdTile_row, 0);//初始化为0 -// TCOPYIN(oldSumTile_row, gZero);//初始化为0 - for (int i = 0; i < Mb; ++i) { + TEXPANDS(oldProdTile_row, static_cast(1)); + + for (int i = 0; i < Mb; ++i) { auto gI = gIIter(i, Nb); - TCOPYIN(dataTile_row, gI); - reduceprod_col_kernel<<>>(ProdTile_row.data(), dataTile_row.data(), oldProdTile_row.data()); - oldProdTile_row = ProdTile_row; + TLOAD(dataTile_row, gI); + TCOLPROD(ProdTile_row, dataTile_row); + TMUL(oldProdTile_row, oldProdTile_row, ProdTile_row); } - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gI = gIIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - reduceprod_col_kernel<<>>(ProdTile_row.data(), dataTile_cor.data(), oldProdTile_row.data()); - oldProdTile_row = ProdTile_row; + TLOAD(dataTile_cor, gI); + TCOLPROD(ProdTile_row, dataTile_cor); + TMUL(oldProdTile_row, oldProdTile_row, ProdTile_row); } - TCOPYOUT(gO, ProdTile_row); + TSTORE(gO, oldProdTile_row); } } diff --git a/benchmark-linxisa/kernels/reduction/reduceprod_rowvec.hpp b/benchmark-linxisa/kernels/reduction/reduceprod_rowvec.hpp index 7494130..274021b 100644 --- a/benchmark-linxisa/kernels/reduction/reduceprod_rowvec.hpp +++ b/benchmark-linxisa/kernels/reduction/reduceprod_rowvec.hpp @@ -1,151 +1,95 @@ -#ifndef REDUCESUMTROWSUM_KERNEL_HPP -#define REDUCESUMTROWSUM_KERNEL_HPP - -#ifndef __vbuf__ -#define __vbuf__ -#endif - -#include - -using namespace pto; +#ifndef REDUCEPRODROWVEC_KERNEL_HPP +#define REDUCEPRODROWVEC_KERNEL_HPP 
#pragma once +#include #include #include -template -void __vec__ reduceprod_row_kernel( - typename tileProd::TileDType __out__ new_prod, - const typename tileSrc::TileDType __in__ src, - const typename tileProd::TileDType __in__ old_prod -) -{ -// size_t i = blkv_get_index_x(); - size_t j = blkv_get_index_x(); -// size_t j = blkv_get_index_y(); - size_t idx = j * tileProd::RowStride; - - __vbuf__ typename tileProd::DType *new_prod_ptr = blkv_get_tile_ptr(new_prod); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileProd::DType *old_prod_ptr = blkv_get_tile_ptr(old_prod); - - - typename tileProd::DType upd_prod = old_prod_ptr[idx]; - - #pragma clang loop unroll(full) - for(size_t i=0;i void reduceprod_row_rand( dtype *in_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; + const int Nb = gIN / tN; + const int rmd_M = gIM % tM; + const int rmd_N = gIN % tN; - const int rmd_M = gIM % tM; // todo 尾块怎么处理? - const int rmd_N = gIN % tN; // todo 尾块怎么处理? 
- - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_row = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeProd = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - - using tile_shapeData_col = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_cor = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeProd_col = Tile; - - - gm_shapeIn inGm(in_ptr); + using tile_shapeData = Tile; + using tile_shapeData_row = Tile; + using tile_shapeProd = Tile; + using tile_shapeData_col = Tile; + using tile_shapeData_cor = Tile; + using tile_shapeProd_col = Tile; + using tile_shapeTmp = Tile; + using tile_shapeTmp_row = Tile; + using tile_shapeTmp_col = Tile; + using tile_shapeTmp_cor = Tile; + + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); - tile_shapeData dataTile; + tile_shapeData dataTile; tile_shapeData_row dataTile_row; tile_shapeData_col dataTile_col; - tile_shapeData_cor dataTile_cor; - + tile_shapeData_cor dataTile_cor; tile_shapeProd ProdTile; tile_shapeProd oldProdTile; tile_shapeProd_col ProdTile_col; - tile_shapeProd_col oldProdTile_col; - -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 + tile_shapeProd_col oldProdTile_col; + tile_shapeTmp tmpTile; + tile_shapeTmp_row tmpTile_row; + tile_shapeTmp_col tmpTile_col; + tile_shapeTmp_cor tmpTile_cor; - using itIn = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// printf("tile_shapeSum::ValidCol = %d\n", tile_shapeSum::ValidCol); -// printf("tile_shapeSum::ValidRow = %d\n", tile_shapeSum::ValidRow); -// printf("before for\n"); for (int j = 0; j < Mb; ++j) { auto gO = gOIter(j, 0); - TEXPANDSCALAR(oldProdTile, 0);//初始化为0 - //初始化old_sum的tile + 
TEXPANDS(oldProdTile, static_cast(1)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(j, i); -// printf("before copy in , %d\n", i); - TCOPYIN(dataTile, gI); - reduceprod_row_kernel<<>>(ProdTile.data(), dataTile.data(), oldProdTile.data()); -// reducesum_row_kernel<<<1, tile_shapeSum::ValidRow, 1>>>(SumTile.data(), dataTile.data(), oldSumTile.data()); -// printf("kernel , %d\n", i); - oldProdTile = ProdTile; + auto gI = gIIter(j, i); + TLOAD(dataTile, gI); + TROWPROD(ProdTile, dataTile, tmpTile); + TMUL(oldProdTile, oldProdTile, ProdTile); } -// printf("end for%d\n",j); - //for row corner - if constexpr (rmd_N > 0){ + if constexpr (rmd_N > 0) { auto gI = gIIter(j, Nb); - TCOPYIN(dataTile_row, gI); - reduceprod_row_kernel<<>>(ProdTile.data(), dataTile_row.data(), oldProdTile.data()); -// reducesum_row_kernel<<>>(SumTile.data(), dataTile_row.data(), oldSumTile.data()); - oldProdTile = ProdTile; + TLOAD(dataTile_row, gI); + TROWPROD(ProdTile, dataTile_row, tmpTile_row); + TMUL(oldProdTile, oldProdTile, ProdTile); } -// printf("before tcopyout\n"); - TCOPYOUT(gO, ProdTile); -// printf("end tcopyout\n"); + TSTORE(gO, oldProdTile); } - //for col cor - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gO = gOIter(Mb, 0); - TEXPANDSCALAR(oldProdTile_col, 0);//初始化为0 - //初始化old_sum的tile + TEXPANDS(oldProdTile_col, static_cast(1)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(Mb, i); - TCOPYIN(dataTile_col, gI); - reduceprod_row_kernel<<>>(ProdTile_col.data(), dataTile_col.data(), oldProdTile_col.data()); - oldProdTile_col = ProdTile_col; + auto gI = gIIter(Mb, i); + TLOAD(dataTile_col, gI); + TROWPROD(ProdTile_col, dataTile_col, tmpTile_col); + TMUL(oldProdTile_col, oldProdTile_col, ProdTile_col); } - if constexpr (rmd_N > 0){ + if constexpr (rmd_N > 0) { auto gI = gIIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - reduceprod_row_kernel<<>>(ProdTile_col.data(), dataTile_cor.data(), oldProdTile_col.data()); - oldProdTile_col = ProdTile_col; + 
TLOAD(dataTile_cor, gI); + TROWPROD(ProdTile_col, dataTile_cor, tmpTile_cor); + TMUL(oldProdTile_col, oldProdTile_col, ProdTile_col); } - TCOPYOUT(gO, ProdTile_col); - } -/* - for(int i = 0; i < gIM; i++){ - printf("out%d = %d\n", i, out_ptr[i]); + TSTORE(gO, oldProdTile_col); } -*/ -// printf("end program\n"); } #endif diff --git a/benchmark-linxisa/kernels/reduction/reducesum_colvec.hpp b/benchmark-linxisa/kernels/reduction/reducesum_colvec.hpp index 431d8d4..31e54a0 100644 --- a/benchmark-linxisa/kernels/reduction/reducesum_colvec.hpp +++ b/benchmark-linxisa/kernels/reduction/reducesum_colvec.hpp @@ -1,168 +1,95 @@ #ifndef REDUCESUMCOLVEC_KERNEL_HPP #define REDUCESUMCOLVEC_KERNEL_HPP - #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once #include #include - -// ============================================== -// ============================================== -template -void __vec__ reducesum_col_kernel( - typename tileSum::TileDType __out__ new_sum, - const typename tileSrc::TileDType __in__ src, - const typename tileSum::TileDType __in__ old_sum -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - -// typename tileSum::DType upd_sum = old_sum_ptr[i]; - typename tileSum::DType upd_sum = 0; - - #pragma clang loop unroll(full) - for(size_t j=0;j void reducesum_colsum_rand( dtype *in_ptr, -// dtype *inzero_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - + const int Nb = gIN / tN; const int rmd_M = gIM % tM; const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- using gm_shapeIn = global_tensor>; // -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // - using tile_shapeData_col = Tile; // - using tile_shapeSum = Tile; // - - - - using tile_shapeData_row = Tile; // - using tile_shapeData_cor = Tile; // - using tile_shapeSum_row = Tile; // - //need tM = 1; - - - gm_shapeIn inGm(in_ptr); -// gm_shapeOut ZeroGm(inzero_ptr); + using tile_shapeData = Tile; + using tile_shapeData_col = Tile; + using tile_shapeSum = Tile; + using tile_shapeData_row = Tile; + using tile_shapeData_cor = Tile; + using tile_shapeSum_row = Tile; + + using tile_shapeTmp = Tile; + using tile_shapeTmp_col = Tile; + using tile_shapeTmp_row = Tile; + using tile_shapeTmp_cor = Tile; + + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); tile_shapeData dataTile; - tile_shapeData_col dataTile_col; + tile_shapeData_col dataTile_col; tile_shapeSum SumTile; tile_shapeSum oldSumTile; - tile_shapeData_row dataTile_row; - tile_shapeData_cor dataTile_cor; + tile_shapeData_cor dataTile_cor; tile_shapeSum_row SumTile_row; - tile_shapeSum_row oldSumTile_row; + tile_shapeSum_row oldSumTile_row; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 + tile_shapeTmp tmpTile; + tile_shapeTmp_col tmpTile_col; + tile_shapeTmp_row tmpTile_row; + tile_shapeTmp_cor tmpTile_cor; - using itIn = global_iterator; - using itIn_row = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); - itIn_row gIIter_rmd_row(in_ptr); -// itZero gZeroIter(inzero_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// dtype zero = 0; - for (int j = 0; j < Nb; ++j) { -// auto gZero = gZeroIter(0, j); auto gO = gOIter(0, j); - TEXPANDSCALAR(oldSumTile, 0);//初始化为0 -// TCOPYIN(oldSumTile, gZero);//初始化为0 - //初始化old_sum的tile - //need + TEXPANDS(oldSumTile, static_cast(0.0f)); + for (int i = 0; i < Mb; ++i) { auto 
gI = gIIter(i, j); - TCOPYIN(dataTile, gI); - reducesum_col_kernel<<>>(SumTile.data(), dataTile.data(), oldSumTile.data()); - oldSumTile = SumTile; + TLOAD(dataTile, gI); + TCOLSUM(SumTile, dataTile, tmpTile, /*isBinary=*/true); + TADD(oldSumTile, oldSumTile, SumTile); } - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gI = gIIter(Mb, j); - TCOPYIN(dataTile_col, gI); - reducesum_col_kernel<<>>(SumTile.data(), dataTile_col.data(), oldSumTile.data()); - oldSumTile = SumTile; + TLOAD(dataTile_col, gI); + TCOLSUM(SumTile, dataTile_col, tmpTile_col, /*isBinary=*/true); + TADD(oldSumTile, oldSumTile, SumTile); } - TCOPYOUT(gO, SumTile); + TSTORE(gO, oldSumTile); } - if constexpr (rmd_N > 0){ -// auto gZero = gZeroIter(0, Nb); + if constexpr (rmd_N > 0) { auto gO = gOIter(0, Nb); - TEXPANDSCALAR(oldSumTile_row, 0);//初始化为0 -// TCOPYIN(oldSumTile_row, gZero);//初始化为0 - for (int i = 0; i < Mb; ++i) { + TEXPANDS(oldSumTile_row, static_cast(0)); + + for (int i = 0; i < Mb; ++i) { auto gI = gIIter(i, Nb); - TCOPYIN(dataTile_row, gI); - reducesum_col_kernel<<>>(SumTile_row.data(), dataTile_row.data(), oldSumTile_row.data()); - oldSumTile_row = SumTile_row; + TLOAD(dataTile_row, gI); + TCOLSUM(SumTile_row, dataTile_row, tmpTile_row, /*isBinary=*/true); + TADD(oldSumTile_row, oldSumTile_row, SumTile_row); } - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gI = gIIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - reducesum_col_kernel<<>>(SumTile_row.data(), dataTile_cor.data(), oldSumTile_row.data()); - oldSumTile_row = SumTile_row; + TLOAD(dataTile_cor, gI); + TCOLSUM(SumTile_row, dataTile_cor, tmpTile_cor, /*isBinary=*/true); + TADD(oldSumTile_row, oldSumTile_row, SumTile_row); } - TCOPYOUT(gO, SumTile_row); + TSTORE(gO, oldSumTile_row); } } diff --git a/benchmark-linxisa/kernels/reduction/reducesum_colvec_single_tree.hpp b/benchmark-linxisa/kernels/reduction/reducesum_colvec_single_tree.hpp index f24fb56..19dbd18 100644 --- 
a/benchmark-linxisa/kernels/reduction/reducesum_colvec_single_tree.hpp +++ b/benchmark-linxisa/kernels/reduction/reducesum_colvec_single_tree.hpp @@ -1,227 +1,78 @@ #ifndef REDUCESUMCOLVEC_KERNEL_HPP #define REDUCESUMCOLVEC_KERNEL_HPP - #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once #include #include - -// ============================================== -// ============================================== -//tile内进行reduce,所有tile的reduce结果统一存到一个tile中。 -template -void __vec__ reducesum_col_kernel( - typename tileTmpSum::TileDType __out__ new_sum, - const typename tileSrc::TileDType __in__ src, - const typename tileTmpSum::TileDType __in__ old_sum, - const size_t tile_idx -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename tileTmpSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileTmpSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - #pragma clang loop unroll(full) - for(size_t j=0;j -void __vec__ reducesum_col_final_kernel( - typename tileSum::TileDType __out__ new_sum, - const typename tileTmpSum::TileDType __in__ tmp_sum -){ - size_t i = blkv_get_index_x(); - __vbuf__ typename tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileTmpSum::DType *tmp_sum_ptr = blkv_get_tile_ptr(tmp_sum); - - #pragma clang loop unroll(full) - for(size_t j=0;j void reducesum_colsum_rand( - dtype *in_ptr, + dtype *in_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - + const int Nb = gIN / tN; const int rmd_M = gIM % tM; const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- using gm_shapeIn = global_tensor>; // + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // - using tile_shapeData_col = Tile; // - using tile_shapeSum = Tile; // - using tile_shapeTmpSum = Tile; // -// using tile_shapeTmpSum_l2 = Tile; // + using tile_shapeData = Tile; + using tile_shapeData_col = Tile; + using tile_shapeSum = Tile; + using tile_shapeTmpSum = Tile; + using tile_shapeTmp = Tile; + using tile_shapeTmp_col = Tile; + using tile_shapeTmp_final = Tile; - -// using tile_shapeData_row = Tile; // -// using tile_shapeData_cor = Tile; // -// using tile_shapeSum_row = Tile; // - //need tM = 1; - - - gm_shapeIn inGm(in_ptr); - gm_shapeOut outGm(out_ptr); + gm_shapeIn inGm(in_ptr); + gm_shapeOut outGm(out_ptr); tile_shapeData dataTile; - tile_shapeData_col dataTile_col; + tile_shapeData_col dataTile_col; tile_shapeSum SumTile; - tile_shapeTmpSum oldtmpSumTile; tile_shapeTmpSum tmpSumTile; -// tile_shapeTmpSum_l2 tmpSumTile_l2; -// tile_shapeData_row dataTile_row; -// tile_shapeData_cor dataTile_cor; -// tile_shapeSum_row SumTile_row; -// tile_shapeSum_row oldSumTile_row; - -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 + tile_shapeTmp tmpTile; + tile_shapeTmp_col tmpTile_col; + tile_shapeTmp_final tmpTile_final; using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// dtype zero = 0; - -// for (int j = 0; j < Nb; ++j) { -// auto gZero = gZeroIter(0, j); auto gO = gOIter(0, 0); - TEXPANDSCALAR(oldtmpSumTile, 0);//初始化为0 -// TEXPANDSCALAR(tmpSumTile, 0);//初始化为0 -// TEXPANDSCALAR(tmpSumTile_l2, 0);//初始化为0 - for (size_t i = 0; i < Mb; ++i){ + + for (size_t i = 0; i < static_cast(Mb); ++i) { auto gI = gIIter(i, 0); - TCOPYIN(dataTile, gI); - reducesum_col_kernel<<>>(tmpSumTile.data(), - dataTile.data(), - oldtmpSumTile.data(), - i); - oldtmpSumTile = tmpSumTile; + TLOAD(dataTile, gI); + tile_shapeSum partialSum; + 
TCOLSUM(partialSum, dataTile, tmpTile, /*isBinary=*/true); + + using SingleRow = Tile; + SingleRow rowView; + TMOV(rowView, partialSum); + if (i == 0) { + TMOV(tmpSumTile, partialSum); + } else { + TADD(tmpSumTile, tmpSumTile, partialSum); + } + } + if constexpr (rmd_M > 0) { + auto gI = gIIter(Mb, 0); + TLOAD(dataTile_col, gI); + tile_shapeSum partialSum; + TCOLSUM(partialSum, dataTile_col, tmpTile_col, /*isBinary=*/true); + TADD(tmpSumTile, tmpSumTile, partialSum); } - reducesum_col_final_kernel<<>>(SumTile.data(), - tmpSumTile.data()); - TCOPYOUT(gO, SumTile); -} + TCOLSUM(SumTile, tmpSumTile, tmpTile_final, /*isBinary=*/true); + TSTORE(gO, SumTile); +} #endif diff --git a/benchmark-linxisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp b/benchmark-linxisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp index 45cb8f8..b7015c2 100644 --- a/benchmark-linxisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp +++ b/benchmark-linxisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp @@ -1,174 +1,52 @@ #ifndef REDUCESUMCOLVEC_KERNEL_HPP #define REDUCESUMCOLVEC_KERNEL_HPP - #include -#include "template_asm.h" - - -using namespace pto; - -#pragma once #include #include - -// ============================================== -// ============================================== -template -void __vec__ reducesum_col_tmp( - typename tileTmp::TileDType __out__ tmp_sum, - const typename tileSrc::TileDType __in__ src -) -{ - size_t i = blkv_get_index_x(); - - __vbuf__ typename tileTmp::DType *tmp_sum_ptr = blkv_get_tile_ptr(tmp_sum); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); -// __vbuf__ typename tileSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - typename tileTmp::DType upd_tmp_sum = 0; - - #pragma clang loop unroll(full) - for(size_t j=0;j -void __vec__ reducesum_col_final( - typename tileSum::TileDType __out__ new_sum, - const typename tileTmp::TileDType __in__ src, - const typename tileSum::TileDType __in__ old_sum -) -{ - 
size_t i = blkv_get_index_x(); - - __vbuf__ typename tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileTmp::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - - typename tileSum::DType upd_sum = old_sum_ptr[i]; - - - size_t src_idx_0 = i * tileSum::ColStride + 0 * tileSum::ValidCol; - size_t src_idx_1 = i * tileSum::ColStride + 1 * tileSum::ValidCol; - size_t src_idx_2 = i * tileSum::ColStride + 2 * tileSum::ValidCol; - size_t src_idx_3 = i * tileSum::ColStride + 3 * tileSum::ValidCol; - size_t src_idx_4 = i * tileSum::ColStride + 4 * tileSum::ValidCol; - size_t src_idx_5 = i * tileSum::ColStride + 5 * tileSum::ValidCol; - size_t src_idx_6 = i * tileSum::ColStride + 6 * tileSum::ValidCol; - size_t src_idx_7 = i * tileSum::ColStride + 7 * tileSum::ValidCol; - typename tileSum::DType sum_01 = src_ptr[src_idx_0] + src_ptr[src_idx_1]; - typename tileSum::DType sum_23 = src_ptr[src_idx_2] + src_ptr[src_idx_3]; - typename tileSum::DType sum_45 = src_ptr[src_idx_4] + src_ptr[src_idx_5]; - typename tileSum::DType sum_67 = src_ptr[src_idx_6] + src_ptr[src_idx_7]; - typename tileSum::DType sum_0123 = sum_01 + sum_23; - typename tileSum::DType sum_4567 = sum_45 + sum_67; - typename tileSum::DType sum_all = sum_0123 + sum_4567; - -// upd_sum = upd_sum + sum_tmp; - -/* - #pragma clang loop unroll(full) - for(size_t j=0;j void reducesum_colsum_rand( dtype *in_ptr, -// dtype *inzero_ptr, dtype *out_ptr -) +) { - -// const int Mb = (gIM/8) / tM; - - const int rmd_M = gIM % tM; - const int rmd_N = gIN % tN; -// const int rmd_M = gOM % tM; // todo 尾块怎么处理? 
- - using gm_shapeIn = global_tensor>; // -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // -// using tile_shapeData_col = Tile; // - using tile_shapeTmp = Tile; // - using tile_shapeSum = Tile; // - + using tile_shapeData = Tile; + using tile_shapeTmp = Tile; + using tile_shapeSum = Tile; + using tile_shapeTmpData = Tile; + using tile_shapeTmpTmp = Tile; -// using tile_shapeData_row = Tile; // -// using tile_shapeData_cor = Tile; // -// using tile_shapeSum_row = Tile; // - //need tM = 1; - - - gm_shapeIn inGm(in_ptr); -// gm_shapeOut ZeroGm(inzero_ptr); + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); tile_shapeData dataTile; -// tile_shapeData_col dataTile_col; - tile_shapeTmp TmpTile; + tile_shapeTmp TmpTile; tile_shapeSum SumTile; tile_shapeSum oldSumTile; -// tile_shapeData_row dataTile_row; -// tile_shapeData_cor dataTile_cor; -// tile_shapeSum_row SumTile_row; -// tile_shapeSum_row oldSumTile_row; + tile_shapeTmpData tmpTileData; + tile_shapeTmpTmp tmpTileTmp; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; - using itIn_row = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); - itIn_row gIIter_rmd_row(in_ptr); -// itZero gZeroIter(inzero_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); - auto gO = gOIter(0, 0); - TEXPANDSCALAR(oldSumTile, 0);//初始化为0 + TEXPANDS(oldSumTile, static_cast(0)); auto gI = gIIter(0, 0); - TCOPYIN(dataTile, gI);//TLOAD应补0,目前gfrun默认补0,需要接口去弄 - reducesum_col_tmp<<>>(TmpTile.data(), dataTile.data()); - reducesum_col_final<<>>(SumTile.data(), TmpTile.data(), oldSumTile.data()); - oldSumTile = SumTile; - TCOPYOUT(gO, SumTile); + TLOAD(dataTile, gI); + TCOLSUM(TmpTile, dataTile, tmpTileData, /*isBinary=*/true); + TCOLSUM(SumTile, TmpTile, tmpTileTmp, /*isBinary=*/true); + TADD(oldSumTile, 
oldSumTile, SumTile); + TSTORE(gO, SumTile); } #endif diff --git a/benchmark-linxisa/kernels/reduction/reducesum_rowvec.hpp b/benchmark-linxisa/kernels/reduction/reducesum_rowvec.hpp index be8b749..401f873 100644 --- a/benchmark-linxisa/kernels/reduction/reducesum_rowvec.hpp +++ b/benchmark-linxisa/kernels/reduction/reducesum_rowvec.hpp @@ -1,177 +1,95 @@ #ifndef REDUCESUMTROWSUM_KERNEL_HPP #define REDUCESUMTROWSUM_KERNEL_HPP -#ifndef __vbuf__ -#define __vbuf__ -#endif - -#include - -using namespace pto; - #pragma once +#include #include #include -template -void __vec__ reducesum_row_kernel( - typename tileSum::TileDType __out__ new_sum, - const typename tileSrc::TileDType __in__ src, - const typename tileSum::TileDType __in__ old_sum -) -{ -// size_t i = blkv_get_index_x(); - size_t j = blkv_get_index_x(); -// size_t j = blkv_get_index_y(); - size_t idx = j * tileSum::RowStride; - - __vbuf__ typename tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - - - typename tileSum::DType upd_sum = old_sum_ptr[idx]; - - #pragma clang loop unroll(full) - for(size_t i=0;i void reducesum_trowsum_rand( dtype *in_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - - const int rmd_M = gIM % tM; // todo 尾块怎么处理? - const int rmd_N = gIN % tN; // todo 尾块怎么处理? 
+ const int Nb = gIN / tN; + const int rmd_M = gIM % tM; + const int rmd_N = gIN % tN; - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_row = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeSum = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - - using tile_shapeData_col = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeData_cor = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeSum_col = Tile; - - - gm_shapeIn inGm(in_ptr); + using tile_shapeData = Tile; + using tile_shapeData_row = Tile; + using tile_shapeSum = Tile; + using tile_shapeData_col = Tile; + using tile_shapeData_cor = Tile; + using tile_shapeSum_col = Tile; + using tile_shapeTmp = Tile; + using tile_shapeTmp_row = Tile; + using tile_shapeTmp_col = Tile; + using tile_shapeTmp_cor = Tile; + + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); - tile_shapeData dataTile; + tile_shapeData dataTile; tile_shapeData_row dataTile_row; tile_shapeData_col dataTile_col; - tile_shapeData_cor dataTile_cor; - + tile_shapeData_cor dataTile_cor; tile_shapeSum SumTile; tile_shapeSum oldSumTile; tile_shapeSum_col SumTile_col; - tile_shapeSum_col oldSumTile_col; + tile_shapeSum_col oldSumTile_col; + tile_shapeTmp tmpTile; + tile_shapeTmp_row tmpTile_row; + tile_shapeTmp_col tmpTile_col; + tile_shapeTmp_cor tmpTile_cor; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); -// printf("tile_shapeSum::ValidCol = %d\n", tile_shapeSum::ValidCol); -// printf("tile_shapeSum::ValidRow = %d\n", tile_shapeSum::ValidRow); -// printf("before for\n"); for (int j = 0; j < Mb; ++j) { auto gO = gOIter(j, 0); - 
TEXPANDSCALAR(oldSumTile, 0);//初始化为0 - //初始化old_sum的tile + TEXPANDS(oldSumTile, static_cast(0)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(j, i); -// printf("before copy in , %d\n", i); - TCOPYIN(dataTile, gI); - reducesum_row_kernel<<>>(SumTile.data(), dataTile.data(), oldSumTile.data()); -// reducesum_row_kernel<<<1, tile_shapeSum::ValidRow, 1>>>(SumTile.data(), dataTile.data(), oldSumTile.data()); -// printf("kernel , %d\n", i); - oldSumTile = SumTile; + auto gI = gIIter(j, i); + TLOAD(dataTile, gI); + TROWSUM(SumTile, dataTile, tmpTile); + TADD(oldSumTile, oldSumTile, SumTile); } -// printf("end for%d\n",j); - //for row corner - if constexpr (rmd_N > 0){ + if constexpr (rmd_N > 0) { auto gI = gIIter(j, Nb); - TCOPYIN(dataTile_row, gI); - reducesum_row_kernel<<>>(SumTile.data(), dataTile_row.data(), oldSumTile.data()); -// reducesum_row_kernel<<>>(SumTile.data(), dataTile_row.data(), oldSumTile.data()); - oldSumTile = SumTile; + TLOAD(dataTile_row, gI); + TROWSUM(SumTile, dataTile_row, tmpTile_row); + TADD(oldSumTile, oldSumTile, SumTile); } -// printf("before tcopyout\n"); - TCOPYOUT(gO, SumTile); -// printf("end tcopyout\n"); + TSTORE(gO, oldSumTile); } - //for col cor - if constexpr (rmd_M > 0){ + if constexpr (rmd_M > 0) { auto gO = gOIter(Mb, 0); - TEXPANDSCALAR(oldSumTile_col, 0);//初始化为0 - //初始化old_sum的tile + TEXPANDS(oldSumTile_col, static_cast(0)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(Mb, i); - TCOPYIN(dataTile_col, gI); - reducesum_row_kernel<<>>(SumTile_col.data(), dataTile_col.data(), oldSumTile_col.data()); - oldSumTile_col = SumTile_col; + auto gI = gIIter(Mb, i); + TLOAD(dataTile_col, gI); + TROWSUM(SumTile_col, dataTile_col, tmpTile_col); + TADD(oldSumTile_col, oldSumTile_col, SumTile_col); } - if constexpr (rmd_N > 0){ + if constexpr (rmd_N > 0) { auto gI = gIIter(Mb, Nb); - TCOPYIN(dataTile_cor, gI); - reducesum_row_kernel<<>>(SumTile_col.data(), dataTile_cor.data(), oldSumTile_col.data()); - oldSumTile_col = 
SumTile_col; + TLOAD(dataTile_cor, gI); + TROWSUM(SumTile_col, dataTile_cor, tmpTile_cor); + TADD(oldSumTile_col, oldSumTile_col, SumTile_col); } - TCOPYOUT(gO, SumTile_col); - } -/* - for(int i = 0; i < gIM; i++){ - printf("out%d = %d\n", i, out_ptr[i]); + TSTORE(gO, oldSumTile_col); } -*/ -// printf("end program\n"); } #endif diff --git a/benchmark-linxisa/kernels/reduction/reducesum_rowvec_single_tree.hpp b/benchmark-linxisa/kernels/reduction/reducesum_rowvec_single_tree.hpp index ddfcca7..60ddc02 100644 --- a/benchmark-linxisa/kernels/reduction/reducesum_rowvec_single_tree.hpp +++ b/benchmark-linxisa/kernels/reduction/reducesum_rowvec_single_tree.hpp @@ -1,233 +1,60 @@ #ifndef REDUCESUMTROWSUM_KERNEL_HPP #define REDUCESUMTROWSUM_KERNEL_HPP -#ifndef __vbuf__ -#define __vbuf__ -#endif - -#include - -using namespace pto; - #pragma once +#include #include #include -template -void __vec__ reducesum_row_kernel( - typename tileTmpSum::TileDType __out__ new_sum, - const typename tileSrc::TileDType __in__ src, - const typename tileSrcCol::TileDType __in__ src_col, - const typename tileTmpSum::TileDType __in__ old_sum, - const size_t tile_idx -) -{ - - size_t j = blkv_get_index_x(); - size_t z = blkv_get_index_y(); - size_t stride_src = z * (tileSrc::ValidCol/4) * tileSrc::ColStride; - size_t stride_src_col = z * (tileSrcCol::ValidCol/4) * tileSrcCol::ColStride; - - __vbuf__ typename tileTmpSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileSrc::DType *src_ptr = blkv_get_tile_ptr(src); - __vbuf__ typename tileSrc::DType *src_col_ptr = blkv_get_tile_ptr(src_col); - __vbuf__ typename tileTmpSum::DType *old_sum_ptr = blkv_get_tile_ptr(old_sum); - -/* - #pragma clang loop unroll(full) - for(size_t i=0;i -void __vec__ reducesum_row_final_kernel( - typename tileSum::TileDType __out__ new_sum, - const typename tileTmpSum::TileDType __in__ tmp_sum -){ - size_t j = blkv_get_index_x(); - size_t idx = j * tileSum::RowStride; - - __vbuf__ typename 
tileSum::DType *new_sum_ptr = blkv_get_tile_ptr(new_sum); - __vbuf__ typename tileTmpSum::DType *tmp_sum_ptr = blkv_get_tile_ptr(tmp_sum); - - #pragma clang loop unroll(full) - for(size_t i=0;i void reducesum_trowsum_rand( dtype *in_ptr, dtype *out_ptr -) +) { - const int Mb = gIM / tM; - const int Nb = gIN / tN; - - const int rmd_M = gIM % tM; // todo 尾块怎么处理? - const int rmd_N = gIN % tN; // todo 尾块怎么处理? - + const int Nb = gIN / tN; + const int rmd_M = gIM % tM; + const int rmd_N = gIN % tN; - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 -// using gm_shapeSum = global_tensor>; + using gm_shapeIn = global_tensor>; using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeDataCol = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeSum = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - using tile_shapeTmpSum = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec + using tile_shapeData = Tile; + using tile_shapeSum = Tile; + using tile_shapeTmpSum = Tile; + using tile_shapeTmp = Tile; - - gm_shapeIn inGm(in_ptr); + gm_shapeIn inGm(in_ptr); gm_shapeOut outGm(out_ptr); -// gm_shapeSum olcSumGm(old_sum_ptr); - tile_shapeData dataTile; - tile_shapeDataCol dataTile_col; + tile_shapeData dataTile; tile_shapeSum SumTile; tile_shapeTmpSum oldtmpSumTile; tile_shapeTmpSum tmpSumTile; + tile_shapeTmp tmpTile; -// int base = 0;// todo 生成一个标量 -// int all_num = gOM; // 总元素数量 - - using itIn = global_iterator; + using itIn = global_iterator; using itOut = global_iterator; - itIn gIIter(in_ptr); + itIn gIIter(in_ptr); itOut gOIter(out_ptr); - auto gO = gOIter(0, 0); - TEXPANDSCALAR(oldtmpSumTile, 0);//初始化为0 - TEXPANDSCALAR(dataTile_col, 0);//初始化为0 + TEXPANDS(oldtmpSumTile, static_cast(0)); + for (int i = 0; i < Nb; ++i) { - auto gI = gIIter(0, i); - TCOPYIN(dataTile, gI); - reducesum_row_kernel<<>>(tmpSumTile.data(), - dataTile.data(), - dataTile_col.data(), - oldtmpSumTile.data(), - i); - oldtmpSumTile = tmpSumTile; + auto gI = 
gIIter(0, i); + TLOAD(dataTile, gI); + tile_shapeSum partialSum; + TROWSUM(partialSum, dataTile, tmpTile); + TADD(oldtmpSumTile, oldtmpSumTile, partialSum); } - reducesum_row_final_kernel<<>>(SumTile.data(), - tmpSumTile.data()); - TCOPYOUT(gO, SumTile); + TMOV(tmpSumTile, oldtmpSumTile); + + TROWSUM(SumTile, tmpSumTile, tmpTile); + TSTORE(gO, SumTile); } #endif - diff --git a/benchmark-linxisa/kernels/sort/topk.hpp b/benchmark-linxisa/kernels/sort/topk.hpp index 3bba98b..d9ac6b9 100644 --- a/benchmark-linxisa/kernels/sort/topk.hpp +++ b/benchmark-linxisa/kernels/sort/topk.hpp @@ -1,9 +1,12 @@ #ifndef TOPK_HPP #define TOPK_HPP +#include #include #include +using namespace pto::blkv; + // ============================================================================ // Constants // ============================================================================ @@ -82,15 +85,18 @@ void __vec__ ExtractLow8HistForKthBin_Vec_RowMajor( template void ExtractHigh8Hist_Impl(tile_shape_out& dst, const uint16_t* src) { - ExtractHigh8Hist_Vec_RowMajor - <<<1, 256, 1>>>(dst.data(), src); + pto::blkv::blkv_for_2d(1, 256, [&] { + ExtractHigh8Hist_Vec_RowMajor(dst.data(), src); + }); } template void ExtractLow8HistForKthBin_Impl(tile_shape_out& dst, const uint16_t* src, uint16_t kth_bin) { - ExtractLow8HistForKthBin_Vec_RowMajor - <<<1, 256, 1>>>(dst.data(), src, kth_bin); + pto::blkv::blkv_for_2d(1, 256, [&] { + ExtractLow8HistForKthBin_Vec_RowMajor(dst.data(), src, + kth_bin); + }); } // ============================================================================ diff --git a/benchmark-linxisa/kernels/transpose/transpose.hpp b/benchmark-linxisa/kernels/transpose/transpose.hpp index 2747040..9db50d2 100644 --- a/benchmark-linxisa/kernels/transpose/transpose.hpp +++ b/benchmark-linxisa/kernels/transpose/transpose.hpp @@ -1,201 +1,428 @@ -#ifndef TRANSPOSE_KERNEL_HPP -#define TRANSPOSE_KERNEL_HPP +/** + * @file transpose_tile_isa.hpp + * @brief 基于纯 Tile ISA 的转置算子实现 + * + * 本文件提供两个转置算子: + 
* 1. tile_transpose_nd: 通用 N 维任意轴交换,使用 offset 计算 + MGATHER + * 2. tile_transpose_2d: 2D 矩阵转置,使用硬件原生 TTRANS 指令 + * + * 核心设计思想: + * - 将"标量计算"与"数据并行计算"分离到不同执行路径 + * - 维度循环在标量核心上运行(编译期模板展开) + * - 每个维度内的坐标提取用 tile 操作并行处理所有元素 + * - 避免使用 __vec__ 的 per-element 标量循环模式 + */ + +#ifndef SUPERNPU_TRANSPOSE_TILE_ISA_HPP +#define SUPERNPU_TRANSPOSE_TILE_ISA_HPP #include #include "template_asm.h" +#include +#include -using namespace pto; +// TTRANS 指令的 API 兼容性配置 +// 旧 API: TTRANS(dst, src) +// 新 API: TTRANS(dst, src, tmp) - 需要显式传入临时 tile +// 根据实际 PTO 版本设置此宏 +#ifndef SUPERNPU_PTO_TTRANS_NEEDS_TMP +#define SUPERNPU_PTO_TTRANS_NEEDS_TMP 0 +#endif -#pragma once -#include -#include - - -#define DUMP_TILE(label, TileVar, DumpBuf, Rows, Cols) \ - do { \ - GlobalTensor, \ - Stride<1,1,1,Cols,1>> _g(DumpBuf); \ - TCOPYOUT(_g, TileVar); \ - printf("[DUMP] %s (shape=%dx%d):\n", label, Rows, Cols); \ - for (int ri = 0; ri < Rows; ri++) { \ - printf(" row%2d: ", ri); \ - for (int ci = 0; ci < Cols; ci++) \ - printf("%12lld ", (long long)DumpBuf[ri * Cols + ci]); \ - printf("\n"); \ - } \ - fflush(stdout); \ - } while (0) - -// ============================================== -// 维度规则:交换transpose_dim0和transpose_dim1 -// ============================================== -template -void __vec__ gen_offset_trans( - typename tile_shape::TileDType __out__ out, - typename tile_Inshape::TileDType __in__ in_shape, - typename tile_Outshape::TileDType __in__ out_shape, -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0, - const size_t base, - const size_t total_elements -) { - size_t index = blkv_get_index_x(); - size_t idx = blkv_get_index_x(); - - __vbuf__ typename tile_Inshape::DType *in_shape_ptr = blkv_get_tile_ptr(in_shape); - __vbuf__ typename tile_Outshape::DType *out_shape_ptr = blkv_get_tile_ptr(out_shape); - -// if (index >= total_elements) return; - idx = idx + base; // todo idx是个向量,base是个标量,获得所有的基地址或者说基offset - - //转置维度交换stride。 -// 
uint16_t stride[IN_DIM]; - uint32_t stride[IN_DIM]; - stride[IN_DIM-1] = 1; - - #pragma clang loop unroll(full) - for(int i = IN_DIM-2 ; i >= 0; --i) - { - stride[i] = stride[i+1] * in_shape_ptr[i+1]; - } - std::swap(stride[TRANSPOSE_DIM1],stride[TRANSPOSE_DIM0]); - - // 输出一维索引 → 输出坐标 - size_t out_coord[MAX_DIM] = {0}; // - size_t tmp = idx; // - - #pragma clang loop unroll(full) - for (int d = OUT_DIM - 1; d >= 0; d--) { - out_coord[d] = tmp % out_shape_ptr[d]; - tmp /= out_shape_ptr[d]; - } +namespace supernpu::tile_isa { + +/** + * @brief 通用 N 维转置(任意轴交换) + * + * 算法核心: + * 1. 对输出张量的每个元素,计算其多维坐标 + * 2. 通过轴映射得到对应的输入坐标 + * 3. 用输入 stride 计算 byte offset + * 4. 用 MGATHER 按 offset 从输入中 gather 数据 + * + * 关键优化: + * - Rank、Axis0、Axis1 是模板参数 → 维度循环编译期完全展开 + * - 每个维度的坐标提取用 tile 操作并行处理 tile 内所有元素 + * - 不是"per-element 串行迭代",而是"per-dimension 并行处理" + * + * @tparam DType 数据类型 + * @tparam Rank 张量维度数(编译期常量,> 1) + * @tparam Axis0 转置轴 0(编译期常量,[0, Rank)) + * @tparam Axis1 转置轴 1(编译期常量,[0, Rank) 且 != Axis0) + * @tparam Elements 总元素数(编译期常量,> 0) + * @tparam TileElements 每个 tile 处理的元素数(默认 512) + * + * @param input 输入数据指针(展平为一维) + * @param output 输出数据指针(展平为一维) + * @param input_shape 输入张量各维度尺寸(运行时数组) + * @param output_shape 输出张量各维度尺寸(运行时数组) + */ +template +void tile_transpose_nd(DType *input, DType *output, std::uint32_t *input_shape, + std::uint32_t *output_shape) { + // ========== 编译期安全检查 ========== + static_assert(Rank > 1, "转置至少需要 2 维"); + static_assert(Axis0 >= 0 && Axis0 < Rank, "Axis0 超出维度范围"); + static_assert(Axis1 >= 0 && Axis1 < Rank, "Axis1 超出维度范围"); + static_assert(Axis0 != Axis1, "转置的两个轴必须不同"); + static_assert(Elements > 0, "元素数必须为正"); + static_assert(TileElements > 0, "TileElements 必须为正"); + // Tile 行必须 32 字节对齐(硬件要求) + static_assert(TileElements * static_cast(sizeof(DType)) % 32 == 0, + "RowMajor Tile 的行必须 32 字节对齐"); + // MGATHER 的 offset tile 使用 uint32 byte offset,不能溢出 + static_assert(static_cast(Elements) * sizeof(DType) <= + 0x100000000ULL, + "MGATHER 的 byte offset 使用 
uint32,不能溢出"); + + // ========== 分块参数 ========== + constexpr int kFullTiles = Elements / TileElements; // 完整 tile 数量 + constexpr int kTailElements = Elements % TileElements; // 尾部剩余元素数 + + // ========== 类型定义 ========== + // 输入/输出全局张量:展平为一维(转置不改变存储,只改变逻辑索引) + using InputGlobal = pto::global_tensor>; + using OutputGlobal = pto::global_tensor>; + + // 数据 tile:存储实际数据 + using DataTile = pto::Tile; + // Offset tile:存储每个元素的 byte offset(用于 MGATHER) + using OffsetTile = pto::Tile; + // 输出迭代器:用于遍历输出张量的 tile + using OutputIterator = global_iterator; + + // ========== 初始化全局张量和迭代器 ========== + InputGlobal input_global(input); + OutputIterator output_iter(output); + + // ========== 处理完整 tile ========== + std::uint32_t output_base = 0; // 当前 tile 在输出中的起始线性索引 + for (int tile_index = 0; tile_index < kFullTiles; ++tile_index) { + auto output_global = output_iter(0, tile_index); + DataTile output_tile; // 存储 gather 回来的数据 + OffsetTile offset_tile; // 存储每个元素的 byte offset + + // 临时 tile(用于坐标提取和 offset 计算) + OffsetTile linear_index; // 线性索引 [output_base, output_base+1, ...] 
+ OffsetTile quotient; // 除法商 + OffsetTile cycle; // 周期数(用于取模) + OffsetTile cycle_base; // 周期基址 + OffsetTile coordinate; // 当前维度的坐标 + OffsetTile contribution; // 当前维度对 offset 的贡献 + + // Step 1: 生成线性索引序列 [output_base, output_base+1, ..., output_base+TileElements-1] + TCI(linear_index, output_base); + + // Step 2: 初始化 offset 累加器为 0 + TEXPANDS(offset_tile, static_cast(0)); + + // Step 3: 对每个维度计算坐标并累加 offset + // 关键:这个循环在标量核心上运行,由于 Rank 是模板参数,编译期完全展开 + // 每次迭代用 tile 操作并行处理 tile 内所有元素的当前维度 + std::uint32_t input_stride = 1; // 输入张量当前维度的 stride(从内层开始) + for (int input_dim = Rank - 1; input_dim >= 0; --input_dim) { + // 轴映射:交换 Axis0 和 Axis1 + // 例如:如果 input_dim == Axis0,则对应的 output_dim == Axis1 + int output_dim = input_dim; + if (input_dim == Axis0) { + output_dim = Axis1; + } else if (input_dim == Axis1) { + output_dim = Axis0; + } -/* - // 输出坐标 → 输入坐标 - size_t in_coord[MAX_DIM] = {0}; - for (size_t i = 0; i < in_dim; i++) { - size_t o = out_dim - in_dim + i; // 从后面对齐 - if (in_shape[i] == 1) { - in_coord[i] = 0; - } else { - in_coord[i] = out_coord[o]; - } + // 计算 output_dim 维度的 stride(标量计算,在标量核心上完成) + // output_stride = output_shape[output_dim+1] * output_shape[output_dim+2] * ... 
* output_shape[Rank-1] + std::uint32_t output_stride = 1; + for (int dim = output_dim + 1; dim < Rank; ++dim) { + output_stride *= output_shape[dim]; + } + + // 提取当前维度的坐标:coordinate = (linear_index / output_stride) % output_shape[output_dim] + // 用 tile 操作并行处理所有元素 + + // quotient = linear_index / output_stride + // 优化:如果 output_stride == 1,直接拷贝(避免除法) + if (output_stride == 1) { + TMOV(quotient, linear_index); + } else { + TDIVS(quotient, linear_index, output_stride); + } + + // 取模实现:a % b = a - (a / b) * b + // cycle = quotient / output_shape[output_dim] + TDIVS(cycle, quotient, output_shape[output_dim]); + // cycle_base = cycle * output_shape[output_dim] + TMULS(cycle_base, cycle, output_shape[output_dim]); + // coordinate = quotient - cycle_base (即 quotient % output_shape[output_dim]) + TSUB(coordinate, quotient, cycle_base); + + // 计算当前维度对 byte offset 的贡献 + // contribution = coordinate * input_stride * sizeof(DType) + // 注意:coordinate 是 output_coord[output_dim],由于轴映射,它等于 input_coord[input_dim] + TMULS(contribution, coordinate, + input_stride * static_cast(sizeof(DType))); + + // 累加到总 offset + TADD(offset_tile, offset_tile, contribution); + + // 更新 input_stride(从内层向外层累积) + input_stride *= input_shape[input_dim]; } -*/ -// uint16_t in_offset = 0; - uint32_t in_offset = 0; - #pragma clang loop unroll(full) - for (int i = 0; i < IN_DIM; i++) { - in_offset += out_coord[i] * stride[i] * sizeof(dtype); + // Step 4: 按 byte offset 从输入中 gather 数据 + MGATHER(output_tile, input_global, offset_tile); + + // Step 5: 写回到输出全局内存 + TSTORE(output_global, output_tile); + + // 更新下一个 tile 的起始索引 + output_base += TileElements; + } + + // ========== 处理尾部元素 ========== + // 如果总元素数不是 TileElements 的整数倍,处理剩余元素 + if constexpr (kTailElements != 0) { + // 尾部 tile 的类型定义(物理尺寸相同,但 valid region 缩小) + using TailDataTile = pto::Tile; + using TailOffsetTile = + pto::Tile; + + auto output_global = output_iter(0, kFullTiles); + TailDataTile output_tile; + TailOffsetTile offset_tile; + + TailOffsetTile 
linear_index; + TailOffsetTile quotient; + TailOffsetTile cycle; + TailOffsetTile cycle_base; + TailOffsetTile coordinate; + TailOffsetTile contribution; + + // 尾部 tile 的处理逻辑与完整 tile 完全一致 + // 区别仅在于 valid region 更小(kTailElements 而非 TileElements) + // tile 操作以 valid region 为迭代域,因此自动只处理有效元素 + TCI(linear_index, output_base); + TEXPANDS(offset_tile, static_cast(0)); + + std::uint32_t input_stride = 1; + for (int input_dim = Rank - 1; input_dim >= 0; --input_dim) { + int output_dim = input_dim; + if (input_dim == Axis0) { + output_dim = Axis1; + } else if (input_dim == Axis1) { + output_dim = Axis0; + } + + std::uint32_t output_stride = 1; + for (int dim = output_dim + 1; dim < Rank; ++dim) { + output_stride *= output_shape[dim]; + } + + if (output_stride == 1) { + TMOV(quotient, linear_index); + } else { + TDIVS(quotient, linear_index, output_stride); + } + + TDIVS(cycle, quotient, output_shape[output_dim]); + TMULS(cycle_base, cycle, output_shape[output_dim]); + TSUB(coordinate, quotient, cycle_base); + TMULS(contribution, coordinate, + input_stride * static_cast(sizeof(DType))); + TADD(offset_tile, offset_tile, contribution); + + input_stride *= input_shape[input_dim]; } - // 赋值 - blkv_get_tile_ptr(out)[index] = in_offset; + MGATHER(output_tile, input_global, offset_tile); + TSTORE(output_global, output_tile); + } } -template -void gen_offset_Impl( - tile_shapeOffset &out, - tile_Inshape &in_shape, - tile_Outshape &out_shape, -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0, - const size_t base, - const size_t total_elements - ) -{ - static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, - "Only static shape supported"); - gen_offset_trans<<>>(out.data(), in_shape.data(), out_shape.data(), base, total_elements); // todo 这部分的tile shape是怎么传递的? -} +/** + * @brief 2D 矩阵转置(高性能路径) + * + * 使用硬件原生 TTRANS 指令,性能最优。 + * 算法:标准分块转置 + * 1. 
将 Rows × Cols 矩阵划分为 TileRows × TileCols 的块 + * 2. 对每个块:TLOAD → TTRANS → TSTORE + * 3. 输入块 TileRows × TileCols,转置后输出块 TileCols × TileRows + * 4. 输出位置为 (col_tile, row_tile)(行列互换) + * + * @tparam DType 数据类型 + * @tparam Rows 输入矩阵行数 + * @tparam Cols 输入矩阵列数 + * @tparam TileRows tile 块行数(默认 16) + * @tparam TileCols tile 块列数(默认 16) + * + * @param input 输入矩阵指针(Rows × Cols) + * @param output 输出矩阵指针(Cols × Rows) + */ +template +void tile_transpose_2d(DType *input, DType *output) { + // ========== 编译期安全检查 ========== + static_assert(Rows > 0 && Cols > 0, "矩阵维度必须为正"); + static_assert(TileRows > 0 && TileCols > 0, "tile 维度必须为正"); + // TTRANS 对行宽有 32 字节对齐要求 + static_assert(TileRows * static_cast(sizeof(DType)) % 32 == 0, + "TTRANS 输出行宽必须 32 字节对齐"); + static_assert(TileCols * static_cast(sizeof(DType)) % 32 == 0, + "TTRANS 输入行宽必须 32 字节对齐"); + + // ========== 分块参数 ========== + constexpr int kRowTiles = Rows / TileRows; // 行方向完整 tile 数 + constexpr int kColTiles = Cols / TileCols; // 列方向完整 tile 数 + constexpr int kTailRows = Rows % TileRows; // 行方向尾部元素数 + constexpr int kTailCols = Cols % TileCols; // 列方向尾部元素数 + + // ========== 类型定义 ========== + using InputGlobal = pto::global_tensor>; + using OutputGlobal = pto::global_tensor>; + + // 输入 tile:TileRows × TileCols + using SrcTile = pto::Tile; + // 输出 tile:TileCols × TileRows(转置后) + using DstTile = pto::Tile; + + using InputIterator = global_iterator; + using OutputIterator = global_iterator; + + InputIterator input_iter(input); + OutputIterator output_iter(output); + // ========== 处理完整 tile 块 ========== + for (int row_tile = 0; row_tile < kRowTiles; ++row_tile) { + for (int col_tile = 0; col_tile < kColTiles; ++col_tile) { + // 输入位置:(row_tile, col_tile) + auto input_global = input_iter(row_tile, col_tile); + // 输出位置:(col_tile, row_tile) - 行列互换 + auto output_global = output_iter(col_tile, row_tile); + SrcTile src_tile; + DstTile dst_tile; -template -void transpose( - dtype *in_ptr, - dtype *out_ptr, - uint32_t *in_shape, - uint32_t 
*out_shape -// const size_t in_dim, -// const size_t out_dim, -// const size_t transpose_dim1, -// const size_t transpose_dim0 -) - { - - const int Mb = gOM / tM; - - - const int rmd_M = gOM % tM; // todo 尾块怎么处理? - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_shapeOut = global_tensor>; - - using gm_InMatShape = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_OutMatShape = global_tensor>; - - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec -// using tile_shapeOffset = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - - using tile_shapeData_rmd = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeOffset_rmd = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - using tile_Inshape = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_Outshape = Tile; // todo 这里的location,一定要是Vec吗?哪怕没有传入Vec - - gm_shapeIn inGm(in_ptr); - - gm_InMatShape InShapeGm(in_shape); - gm_OutMatShape OutShapeGm(out_shape); - - tile_shapeData dataTile; - tile_shapeOffset offsetTile; - - tile_shapeData_rmd dataTile_rmd; - tile_shapeOffset_rmd offsetTile_rmd; - - tile_Inshape InshapeTile; - tile_Outshape OutshapeTile; - - int base = 0;// todo 生成一个标量 - int all_num = gOM; // 总元素数量 - - using itOut = global_iterator; - itOut gOIter(out_ptr); - - TCOPYIN(InshapeTile, InShapeGm); - TCOPYIN(OutshapeTile, OutShapeGm); - - - int total_elements = tM; - for (int i = 0; i < Mb; ++i) { - auto gO = gOIter(0, i); - gen_offset_Impl(offsetTile, InshapeTile, OutshapeTile, base, total_elements); -// printf("end genoffset\n"); - base += total_elements; -// DUMP_TILE("offsetTile", offsetTile, g_dump, 1, tM); - MGATHER(dataTile, inGm, offsetTile); -// printf("end mgather\n"); - TCOPYOUT(gO, dataTile); -// TCOPYOUT(gO, dataTile); + // 从全局内存加载输入 tile + TLOAD(src_tile, input_global); + + // 硬件转置:TileRows × TileCols → TileCols × TileRows +#if SUPERNPU_PTO_TTRANS_NEEDS_TMP + SrcTile tmp_tile; + TTRANS(dst_tile, src_tile, tmp_tile); 
+#else + TTRANS(dst_tile, src_tile); +#endif + + // 写回到全局内存 + TSTORE(output_global, dst_tile); } - if constexpr (rmd_M) { - auto gO = gOIter(0, Mb); - total_elements = rmd_M;//尾片的大小。 - gen_offset_Impl(offsetTile_rmd, InshapeTile, OutshapeTile, base, total_elements); - base += total_elements; - MGATHER(dataTile_rmd, inGm, offsetTile_rmd); - TCOPYOUT(gO, dataTile_rmd); + + // ========== 处理右侧尾部 tile(列方向) ========== + if constexpr (kTailCols != 0) { + // 输入 tile:TileRows × kTailCols(valid region) + using SrcRightTile = + pto::Tile; + // 输出 tile:kTailCols × TileRows(转置后) + using DstBottomTile = + pto::Tile; + + auto input_global = input_iter(row_tile, kColTiles); + auto output_global = output_iter(kColTiles, row_tile); + + SrcRightTile src_tile; + DstBottomTile dst_tile; + + TLOAD(src_tile, input_global); +#if SUPERNPU_PTO_TTRANS_NEEDS_TMP + SrcRightTile tmp_tile; + TTRANS(dst_tile, src_tile, tmp_tile); +#else + TTRANS(dst_tile, src_tile); +#endif + TSTORE(output_global, dst_tile); } + } + + // ========== 处理底部尾部 tile(行方向) ========== + if constexpr (kTailRows != 0) { + for (int col_tile = 0; col_tile < kColTiles; ++col_tile) { + // 输入 tile:kTailRows × TileCols(valid region) + using SrcBottomTile = + pto::Tile; + // 输出 tile:TileCols × kTailRows(转置后) + using DstRightTile = + pto::Tile; + + auto input_global = input_iter(kRowTiles, col_tile); + auto output_global = output_iter(col_tile, kRowTiles); + + SrcBottomTile src_tile; + DstRightTile dst_tile; + + TLOAD(src_tile, input_global); +#if SUPERNPU_PTO_TTRANS_NEEDS_TMP + SrcBottomTile tmp_tile; + TTRANS(dst_tile, src_tile, tmp_tile); +#else + TTRANS(dst_tile, src_tile); +#endif + TSTORE(output_global, dst_tile); + } + + // ========== 处理右下角尾部 tile ========== + if constexpr (kTailCols != 0) { + // 输入 tile:kTailRows × kTailCols(valid region) + using SrcCornerTile = + pto::Tile; + // 输出 tile:kTailCols × kTailRows(转置后) + using DstCornerTile = + pto::Tile; + + auto input_global = input_iter(kRowTiles, kColTiles); + auto 
output_global = output_iter(kColTiles, kRowTiles); + + SrcCornerTile src_tile; + DstCornerTile dst_tile; + + TLOAD(src_tile, input_global); +#if SUPERNPU_PTO_TTRANS_NEEDS_TMP + SrcCornerTile tmp_tile; + TTRANS(dst_tile, src_tile, tmp_tile); +#else + TTRANS(dst_tile, src_tile); +#endif + TSTORE(output_global, dst_tile); + } + } +} + +} // namespace supernpu::tile_isa + +template +void transpose(DType *input, DType *output, std::uint32_t *input_shape, + std::uint32_t *output_shape) { + static_assert(gIM == gOM, "transpose preserves element count"); + static_assert(IN_DIM == OUT_DIM, "transpose rank must be preserved"); + (void)MAX_DIM; + supernpu::tile_isa::tile_transpose_nd< + DType, static_cast(IN_DIM), static_cast(TRANSPOSE_DIM0), + static_cast(TRANSPOSE_DIM1), gOM, tM>(input, output, input_shape, + output_shape); } #endif diff --git a/benchmark-linxisa/kernels/transpose/transpose_vector_007.hpp b/benchmark-linxisa/kernels/transpose/transpose_vector_007.hpp index c9fbe07..254ac86 100644 --- a/benchmark-linxisa/kernels/transpose/transpose_vector_007.hpp +++ b/benchmark-linxisa/kernels/transpose/transpose_vector_007.hpp @@ -1,59 +1,91 @@ +/** + * @file transpose_vector_007_tile.hpp + * @brief 4096×3 矩阵转置为 3×4096 矩阵(Tile ISA 实现,使用 TTRANS) + * + * 输入:4096×3 矩阵 + * 输出:3×4096 矩阵 + * + * 算法:分块 2D 转置 + * - 将 4096×3 分成多个 TileRows×TileCols 的小块 + * - 对每个小块用 TTRANS 转置 + * - 存储到输出的对应位置 + */ + +#ifndef TRANSPOSE_VECTOR_007_TILE_HPP +#define TRANSPOSE_VECTOR_007_TILE_HPP + #include -#include "gemm.hpp" +#include using namespace pto; -//AI/IA = A, placeholder -//like Ttrans -template -void __vec__ transpose_007_impl( - typename tileOutData::TileDType __out__ out, - const typename tileInData::TileDType __in__ in +/** + * @brief 4096×3 → 3×4096 转置(使用 TTRANS) + * + * 分块策略: + * - TileRows = 16, TileCols = 3 + * - 输入块:16×3 + * - 输出块:3×16(TTRANS 后) + * - 共 256 个块(4096/16 = 256) + * + * @tparam dtype 数据类型 + * @param out_ptr 输出指针(3×4096) + * @param in_ptr 输入指针(4096×3) + */ +template 
+void transpose_007_tile( + dtype *out_ptr, + dtype *in_ptr ) { - size_t i = blkv_get_index_x(); // 4096 - size_t j = blkv_get_index_y(); // 3 + constexpr int INPUT_ROWS = 4096; + constexpr int INPUT_COLS = 3; + constexpr int OUTPUT_ROWS = 3; + constexpr int OUTPUT_COLS = 4096; - size_t idx_src = i * tileOutData::ValidRow + j; - size_t idx_dst = i * tileOutData::ColStride + j * tileOutData::RowStride; - blkv_get_tile_ptr(out)[idx_dst] = blkv_get_tile_ptr(in)[idx_src]; -} + // 分块参数 + constexpr int TILE_ROWS = 16; // 输入块行数 + constexpr int TILE_COLS = 3; // 输入块列数 + constexpr int NUM_ROW_TILES = INPUT_ROWS / TILE_ROWS; // 256 + constexpr int NUM_COL_TILES = INPUT_COLS / TILE_COLS; // 1 + // 全局张量定义 + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; -// output = input * A^T + bias ;;; input = [batch_size, in_features] , A = [out_features, in_features] -// A should be row majo + // Tile 定义 + using SrcTile = Tile; + using DstTile = Tile; -// y = x1^T.A.x2 + b, where x1=1xd1, x2=1xd2, A=doxd1xd2 -// in1[DimIn1, 1] in2 [DimIn2, 1] bias[DimOut, 1] weight [DimOut, DimIn1, DimIn2] -template -void transpose_007( - dtype *out_ptr, - dtype *in_ptr -) -{ - const int Mb = 4096 / 4096; + // 迭代器 + using InputIterator = global_iterator; + using OutputIterator = global_iterator; + + InputIterator input_iter(in_ptr); + OutputIterator output_iter(out_ptr); - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_shapeOut = global_tensor>; - using tile_shapeInData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - using tile_shapeOutData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 + // 遍历所有块 + for (int row_tile = 0; row_tile < NUM_ROW_TILES; ++row_tile) { + for (int col_tile = 0; col_tile < NUM_COL_TILES; ++col_tile) { + // 输入位置:(row_tile, col_tile) -> 对应输入矩阵的 [row_tile*16 : (row_tile+1)*16, col_tile*3 : (col_tile+1)*3] + auto input_global = input_iter(row_tile, col_tile); - using itIn = global_iterator; - using itOut = global_iterator; + // 输出位置:(col_tile, row_tile) -> 
对应输出矩阵的 [col_tile*3 : (col_tile+1)*3, row_tile*16 : (row_tile+1)*16] + auto output_global = output_iter(col_tile, row_tile); - tile_shapeInData InDataTile; - tile_shapeOutData OutDataTile; + SrcTile src_tile; + DstTile dst_tile; - itIn gIIter(in_ptr); - itOut gOIter(out_ptr); + // 加载输入块(16×3) + TLOAD(src_tile, input_global); - for (int i = 0; i < Mb; ++i) { - auto gI = gIIter(0, i); - auto gO = gOIter(0, i); - TCOPYIN(InDataTile, gI); - transpose_007_impl<<>>(OutDataTile.data(), InDataTile.data()); - TCOPYOUT(gO, OutDataTile); - } + // 转置:16×3 -> 3×16 + TTRANS(dst_tile, src_tile); + // 存储到输出 + TSTORE(output_global, dst_tile); + } + } } +#endif // TRANSPOSE_VECTOR_007_TILE_HPP diff --git a/benchmark-linxisa/kernels/transpose/transpose_vector_050.hpp b/benchmark-linxisa/kernels/transpose/transpose_vector_050.hpp index d280dd3..10b535f 100644 --- a/benchmark-linxisa/kernels/transpose/transpose_vector_050.hpp +++ b/benchmark-linxisa/kernels/transpose/transpose_vector_050.hpp @@ -1,58 +1,94 @@ +/** + * @file transpose_vector_050_tile.hpp + * @brief 8×64×64 张量转置为 64×8×64 张量(Tile ISA 实现,逐行处理) + * + * 输入:8×64×64 张量(展平为 512×64) + * 输出:64×8×64 张量(展平为 512×64) + * + * 转置关系:交换第 0 维和第 1 维 + * - input[a][b][c] → output[b][a][c] + * - 其中 a∈[0,8), b∈[0,64), c∈[0,64) + * + * 行置换关系: + * - 输入行号 = a*64 + b + * - 输出行号 = b*8 + a + * - 每行 64 个元素 + * + * 算法:逐行复制 + * - 对每个 (a, b) 对,复制 input 行 (a*64+b) 到 output 行 (b*8+a) + * - 共 8×64 = 512 行,每行用 TLOAD + TSTORE 处理 + */ + +#ifndef TRANSPOSE_VECTOR_050_TILE_HPP +#define TRANSPOSE_VECTOR_050_TILE_HPP + #include -#include "gemm.hpp" +#include using namespace pto; - -template -void __vec__ transpose_050_impl( - typename tileData::TileDType __out__ out, - const typename tileData::TileDType __in__ in -) -{ - size_t i = blkv_get_index_x(); // y - size_t j = blkv_get_index_y(); // x 4096*3 - - size_t row_out = j / 8; - size_t col_out = j % 8; - - size_t idx_in = col_out * 64 + row_out; - - size_t idx_src = i * tileData::ColStride + idx_in * 
tileData::RowStride; - size_t idx_dst = i * tileData::ColStride + j * tileData::RowStride; - blkv_get_tile_ptr(out)[idx_dst] = blkv_get_tile_ptr(in)[idx_src]; -} - - +/** + * @brief 8×64×64 → 64×8×64 转置(逐行处理) + * + * 实现方式: + * - 外层循环遍历 a ∈ [0, 8) + * - 内层循环遍历 b ∈ [0, 64) + * - 对每个 (a, b),将 input 行 (a*64+b) 复制到 output 行 (b*8+a) + * + * 每行处理: + * - 用 1×64 的 tile 加载一行 + * - 用 TSTORE 存储到目标位置 + * + * @tparam dtype 数据类型 + * @param out_ptr 输出指针(64×8×64 = 512×64) + * @param in_ptr 输入指针(8×64×64 = 512×64) + */ template -void transpose_050( - dtype *out_ptr, - dtype *in_ptr +void transpose_050_tile( + dtype *out_ptr, + dtype *in_ptr ) -{ - - - using gm_shapeIn = global_tensor>; //将gm中的Tensor先声明为一维数据 - using gm_shapeOut = global_tensor>; - using tile_shapeData = Tile; // todo 尾块怎么处理?是否要作为参数写在这 - - using itIn = global_iterator; - using itOut = global_iterator; - - tile_shapeData InDataTile; - tile_shapeData OutDataTile; - - itIn gIIter(in_ptr); - itOut gOIter(out_ptr); - - - auto gI = gIIter(0, 0); - auto gO = gOIter(0, 0); - TCOPYIN(InDataTile, gI); - transpose_050_impl<<>>(OutDataTile.data(), InDataTile.data()); - TCOPYOUT(gO, OutDataTile); - - +{ + constexpr int DIM0 = 8; // 第 0 维大小 + constexpr int DIM1 = 64; // 第 1 维大小 + constexpr int DIM2 = 64; // 第 2 维大小(每行元素数) + constexpr int TOTAL_ROWS = DIM0 * DIM1; // 512 + + // 全局张量定义 + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + + // 行 tile:1×64 + using RowTile = Tile; + + // 行迭代器 + using RowIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputGlobal output_global(out_ptr); + RowIterator input_row_iter(in_ptr); + RowIterator output_row_iter(out_ptr); + + RowTile row_tile; + + // 逐行处理 + for (int a = 0; a < DIM0; ++a) { + for (int b = 0; b < DIM1; ++b) { + // 源行号:a*64 + b + int src_row = a * DIM1 + b; + + // 目标行号:b*8 + a + int dst_row = b * DIM0 + a; + + // 加载源行 + auto src_global = input_row_iter(src_row, 0); + TLOAD(row_tile, src_global); + + // 存储到目标行 + auto dst_global = 
output_row_iter(dst_row, 0); + TSTORE(dst_global, row_tile); + } + } } - +#endif // TRANSPOSE_VECTOR_050_TILE_HPP diff --git a/benchmark-linxisa/kernels/utils/layout_transform.hpp b/benchmark-linxisa/kernels/utils/layout_transform.hpp index 3e07196..62b40f8 100644 --- a/benchmark-linxisa/kernels/utils/layout_transform.hpp +++ b/benchmark-linxisa/kernels/utils/layout_transform.hpp @@ -1,3 +1,9 @@ +#include + +using namespace pto::blkv; +using pto::is_global_data_v; +using pto::is_tile_data_v; + template void __vec__ gen_offset_ND2ZZ( typename tile_shape::TileDType __out__ out, @@ -98,8 +104,12 @@ void gen_ND2ZZ_offset_Impl( const uint32_t j) { static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, "Only static shape supported"); - // gen_offset_ND2ZZ<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); - gen_offset_ND2ZZ<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); + pto::blkv::blkv_for_1d(tl_tensor::ValidCol, [&] { + gen_offset_ND2ZZ(offset.data(), glb_tensor::ColStride, + glb_tensor::RowStride, + tl_tensor::ValidRow, + tl_tensor::ValidCol, i, j); + }); } template @@ -111,7 +121,9 @@ void gen_ND2NN_offset_Impl( const uint32_t j) { static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, "Only static shape supported"); - // gen_offset_ND2NN<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); - // gen_offset_ND2NN<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); - gen_offset_ND2NN_new<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); -} \ No newline at end of file + pto::blkv::blkv_for_1d(tl_tensor::ValidRow, [&] { + gen_offset_ND2NN_new( + offset.data(), glb_tensor::ColStride, 
glb_tensor::RowStride, + tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); + }); +} diff --git a/benchmark-linxisa/test/common/Makefile.common b/benchmark-linxisa/test/common/Makefile.common index 35caf4d..6b97f6b 100644 --- a/benchmark-linxisa/test/common/Makefile.common +++ b/benchmark-linxisa/test/common/Makefile.common @@ -1,8 +1,11 @@ ROOT := $(shell echo $(CURDIR) | sed -E 's@(.*)/(benchmark-linxisa|benchmark-ptoisa)/.*@\1/\2@') +LINX_ISA_ROOT ?= $(shell cd $(ROOT)/../../.. && pwd) +LINX_SYSROOT ?= $(LINX_ISA_ROOT)/out/libc/musl/install/phase-b +PTO_KERNELS_INCLUDE ?= $(LINX_ISA_ROOT)/workloads/pto_kernels/include TEST_ROOT := $(shell echo $(CURDIR) | sed -E 's@(.*)/(benchmark-linxisa|benchmark-ptoisa)/test/.*@\1/\2/test@') CATEGORY := $(shell echo $(CURDIR) | sed -E 's@.*/(benchmark-linxisa|benchmark-ptoisa)/test/(.*)@\2@') CATEGORY_NAME := $(shell echo $(CATEGORY) | sed -e 's/\//_/g') -OBJ_ROOT := $(shell realpath $(TEST_ROOT)/../output) +OBJ_ROOT := $(abspath $(TEST_ROOT)/../output) CASE_SRC_DIR := $(CATEGORY)/src ELF_DIR := $(OBJ_ROOT)/$(CATEGORY)/elf SRC_DIR := $(shell dirname $(SRC_FILE)) @@ -51,7 +54,7 @@ CXX_VER ?= -std=c++20 else ifeq ($(PLAT), linx) DEFINES += -D__linx ifndef COMPILER_DIR -$(error COMPILER_DIR is not set. Export COMPILER_DIR pointing to the linx_blockisa_llvm_musl toolchain bin directory, e.g. export COMPILER_DIR=/path/to/linx_blockisa_llvm_musl/bin) +$(error COMPILER_DIR is not set. 
Export COMPILER_DIR pointing to the Linx Clang bin directory) endif AS = $(COMPILER_DIR)/clang CC = $(COMPILER_DIR)/clang @@ -59,17 +62,16 @@ CXX = $(COMPILER_DIR)/clang++ LINK = $(COMPILER_DIR)/clang++ DUMP = $(COMPILER_DIR)/llvm-objdump COPY = $(COMPILER_DIR)/llvm-objcopy -CC_O = -c -mlxbc -fenable-matrix -O2 -mllvm -enable-all-vector-as-tilereg=true $(CFLAGS) +LINX_TARGET ?= linx64-unknown-linux-musl +LINX_CFLAGS ?= -mlxbc --target=$(LINX_TARGET) -fenable-matrix -O2 +LINX_LDFLAGS ?= -nostdlib++ -unwindlib=none +ifneq ($(wildcard $(LINX_SYSROOT)/usr/include/bits/alltypes.h),) +LINX_CFLAGS += --sysroot=$(LINX_SYSROOT) -idirafter $(LINX_SYSROOT)/usr/include -D_GNU_SOURCE +CC_LINK += --sysroot=$(LINX_SYSROOT) +endif +CC_O = -c $(LINX_CFLAGS) $(CFLAGS) +CC_LINK += $(LINX_CFLAGS) $(LINX_LDFLAGS) CXX_VER ?= -std=c++20 - ifneq ($(CC_OPT), default) - CC_O += -mllvm -linxv5-enable-HL-Inst-Opt=true \ - -mllvm -linxv5-enable-dim-opt=true \ - -mllvm -linxv5-enable-ldst-bridge=false \ - -mllvm -linxv5-enable-continuous-mem-opt=true \ - -mllvm -linxv5-enable-tile-clock-hand=false \ - -mllvm -linxv5-enable-simt-clock-hand=true \ - -mllvm -enable-misched=false - endif ifeq ($(baremetal), on) CC_LINK += -static -lm -nostartfiles -L $(OBJ_ROOT)/$(COMM_SRC_DIR) -T $(LINK_SCRIPT) @@ -92,7 +94,7 @@ CC_O += -fPIC CC_LINK += -shared endif -INCLUDE += -I$(ROOT)/include -I$(ROOT)/test/common -I$(ROOT)/test/common/src -I$(ROOT)/kernels -I$(ROOT)/models +INCLUDE += -I$(ROOT)/include -I$(ROOT)/test -I$(ROOT)/test/common -I$(ROOT)/test/common/src -I$(ROOT)/kernels -I$(ROOT)/models -I$(PTO_KERNELS_INCLUDE) QEMU ?= /remote/lms60/c00622284/qemu/LinxBlockModel/build/qemu-linx CC_O_ALL = $(CC_O) $(CC_OPTS) diff --git a/benchmark-linxisa/test/common/_start.s b/benchmark-linxisa/test/common/_start.s index 150e10c..654d9bb 100644 --- a/benchmark-linxisa/test/common/_start.s +++ b/benchmark-linxisa/test/common/_start.s @@ -5,7 +5,7 @@ _start: bstart.std call main c.setret 2, ->ra _end: - bstart.aux 
fall + bstart.sys fall addi zero, 0x5e, ->x1 acrc 1 - c.bstop \ No newline at end of file + c.bstop diff --git a/benchmark-linxisa/test/common/src/benchmark_boot_linx.s b/benchmark-linxisa/test/common/src/benchmark_boot_linx.s index 4cae8da..146c0f5 100644 --- a/benchmark-linxisa/test/common/src/benchmark_boot_linx.s +++ b/benchmark-linxisa/test/common/src/benchmark_boot_linx.s @@ -7,7 +7,7 @@ _start: bstart.std call _linx_start c.setret 2, ->ra _end: - bstart.aux fall + bstart.sys fall addi zero, 0x5e, ->x1 acrc 1 - c.bstop \ No newline at end of file + c.bstop diff --git a/benchmark-linxisa/test/common/template_asm.h b/benchmark-linxisa/test/common/template_asm.h index 2f60707..8f1e0d6 100644 --- a/benchmark-linxisa/test/common/template_asm.h +++ b/benchmark-linxisa/test/common/template_asm.h @@ -3,78 +3,66 @@ #include +using namespace pto; + +template +struct pto_v057_tile_alloc { + static constexpr unsigned SizeCode = tile_type_traits::TilesizeCode; + static_assert(SizeCode >= 3, "v0.57 B.OTA allocation must be at least one 128-byte CELL"); + static constexpr unsigned CellCountM1 = (1u << (SizeCode - 3u)) - 1u; +}; + template void MGATHER(tile_shape_out &dst, gm_shape &src, tile_shape_offset &offset) { - asm volatile( - "BSTART.TMA 4, %c[DataType]\n" - "B.DIM zero, %c[VCOL], ->lb0\n" - "B.DIM zero, %c[VROW], ->lb1\n" - "B.IOT [%[s1]], last, ->%[d0]<%c[TileSize]>\n" - "B.IOR [%[s0]], []\n" - : [d0]"=Tr"(dst.data()) - : [s0]"r"(src.data()), - [s1]"Tr"(offset.data()), - [DataType]"i"(type_traits::TypeCode), - [TileSize]"i"(tile_type_traits::TilesizeCode), - [VCOL]"i"(tile_shape_offset::ValidCol), [VROW]"i"(tile_shape_offset::ValidRow) - ); + pto::MGATHER(dst, src, offset); } template void MSCATTER(gm_shape &dst, tile_shape_in &src, tile_shape_offset &offset) { - asm volatile( - "BSTART.TMA 5, %c[SrcType]\n" - "B.DIM zero, %c[VCOL], ->lb0\n" - "B.DIM zero, %c[VROW], ->lb1\n" - "B.IOT [%[s0], %[s1]], last\n" - "B.IOR [%[d0]], []\n" - : - : [d0]"r"(dst.data()), 
[s0]"Tr"(src.data()), - [s1]"Tr"(offset.data()), - [SrcType]"i"(type_traits::TypeCode), - [VCOL]"i"(tile_shape_offset::ValidCol), [VROW]"i"(tile_shape_offset::ValidRow) - ); + pto::MSCATTER(dst, src, offset); } template void TMAX_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 11, %c1\n" + "BSTART.TEPL 0x25, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TMULS_TEPL(tile_shape &dst, tile_shape &src0, typename tile_shape::DType s) { asm volatile( - "BSTART.TEPL 0b0100010, %c1\n" + "BSTART.TEPL 0x2B, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "B.IOR [%7],[]\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "i"(tile_type_traits::TilesizeCode), + "Tr"(src0.raw()), + "i"(pto_v057_tile_alloc::CellCountM1), "r"(s) ); } @@ -82,183 +70,191 @@ void TMULS_TEPL(tile_shape &dst, tile_shape &src0, typename tile_shape::DType s) template void TROWMAX_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1000001, %c1\n" + "BSTART.TEPL 0x47, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), 
"i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TSUB_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 1, %c1\n" + "BSTART.TEPL 0x55, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TEXP_TEPL(tile_shape &dst, tile_shape &src) { asm volatile( - "BSTART.TEPL 18, %c1\n" + "BSTART.TEPL 0x1C, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TMUL_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 2, %c1\n" + "BSTART.TEPL 0x2A, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + 
"Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TROWSUM_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1000000, %c1\n" + "BSTART.TEPL 0x4A, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TADD_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 0, %c1\n" + "BSTART.TEPL 0x01, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TRECIP_TEPL(tile_shape &dst, tile_shape &src) { asm volatile( - "BSTART.TEPL 20, %c1\n" + "BSTART.TEPL 0x39, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCAST_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - 
"BSTART.TEPL 27, %c1\n" + "BSTART.TEPL 0x19, %c1\n" "B.DATR %c2, RNONE\n" "C.B.DIMI %c3, ->LB0\n" "C.B.DIMI %c4, ->LB1\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TEXPANDSCALAR_TEPL(tile_shape &dst, typename tile_shape::DType s) { asm volatile( - "BSTART.TEPL 0b0111011, %c1\n" + "BSTART.TEPL 0x1D, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [], last, ->%0<%c5>\n" + "B.OTA ->%0<%c5>, last, 0\n" "B.IOR [%6],[]\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "i"(tile_type_traits::TilesizeCode), + "i"(pto_v057_tile_alloc::CellCountM1), "r"(s) ); } @@ -266,101 +262,106 @@ void TEXPANDSCALAR_TEPL(tile_shape &dst, typename tile_shape::DType s) { template void TROWEXPAND_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1000100, %c1\n" + "BSTART.TEPL 0x3F, %c1\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLMAX_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1010001, %c1\n" + "BSTART.TEPL 0x15, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], 
last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLSUM_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1010000, %c1\n" + "BSTART.TEPL 0x18, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLEXPANDSUB_TEPL(tile_shape_out &dst, tile_shape_out &src0, tile_shape_in &src1) { asm volatile( - "BSTART.TEPL 0b1010110, %c1\n" + "BSTART.TEPL 0x14, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_out::ValidCol), "i"(tile_shape_out::ValidRow), "i"(tile_shape_out::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLEXPANDMUL_TEPL(tile_shape_out &dst, tile_shape_out &src0, tile_shape_in &src1) { asm volatile( - "BSTART.TEPL 0b1010111, %c1\n" + "BSTART.TEPL 0x13, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" 
- : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_out::ValidCol), "i"(tile_shape_out::ValidRow), "i"(tile_shape_out::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } diff --git a/benchmark-linxisa/test/kernel/control/compile.all b/benchmark-linxisa/test/kernel/control/compile.all index 6ec69f0..b4344c8 100755 --- a/benchmark-linxisa/test/kernel/control/compile.all +++ b/benchmark-linxisa/test/kernel/control/compile.all @@ -5,7 +5,7 @@ for debug in on off; do else debug_define="-DFOR_GFSIM" fi - for num_col in 256 512 1024; do + for num_col in 256 512; do make TESTCASE=hashtable_lookup_simd SUFFIX=_kNum6144_kMaxProbe512_knum_col${num_col}_debug_${debug} EXTRA_DEFINES="-DkNum=6144 -DMAX_PROBE=512 -DNUM_COL=${num_col} ${debug_define}" diss done done diff --git a/benchmark-linxisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh b/benchmark-linxisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh index 2668212..234f7b4 100755 --- a/benchmark-linxisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh +++ b/benchmark-linxisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh @@ -1,5 +1,5 @@ #!/bin/bash -COMPILER_DIR="${COMPILER_DIR:-/remote/lms60/c00622284/janus/linxisa_compiler_v0.55/linx_blockisa_llvm_musl/bin}" +: "${COMPILER_DIR:?COMPILER_DIR must point to the Linx Clang bin directory}" DATA_OBJ_DIR="$1" OUTPUT_DIR="$2" @@ -26,11 +26,11 @@ _binary_${name}_data_end: .equ _binary_${name}_data_size, .-_binary_${name}_data_start EOF - $COMPILER_DIR/clang++ -target linx64v5 -c "$asm_file" -o "$obj_file" + "$COMPILER_DIR/clang++" -mlxbc -c "$asm_file" -o "$obj_file" } build_one "inserted_slot" build_one "lookup_keys" build_one "lookup_values" -echo "Done building data object files" \ No newline at end of file +echo "Done building data 
object files" diff --git a/benchmark-linxisa/test/kernel/control/hkv/data_obj/build_data_obj.sh b/benchmark-linxisa/test/kernel/control/hkv/data_obj/build_data_obj.sh index ec4819c..f0c7423 100755 --- a/benchmark-linxisa/test/kernel/control/hkv/data_obj/build_data_obj.sh +++ b/benchmark-linxisa/test/kernel/control/hkv/data_obj/build_data_obj.sh @@ -1,5 +1,5 @@ #!/bin/bash -COMPILER_DIR="${COMPILER_DIR:-/remote/lms01/j00827727/jcore/compilers/linx_blockisa_llvm_musl0.56.16/bin}" +: "${COMPILER_DIR:?COMPILER_DIR must point to the Linx Clang bin directory}" DATA_OBJ_DIR="$1" OUTPUT_DIR="$2" @@ -28,7 +28,7 @@ _binary_${sym_name}_end: .equ _binary_${sym_name}_size, .-_binary_${sym_name}_start EOF - $COMPILER_DIR/clang++ -target linx64v5 -c "$asm_file" -o "$obj_file" + "$COMPILER_DIR/clang++" -mlxbc -c "$asm_file" -o "$obj_file" } build_one "buckets.bin" diff --git a/benchmark-linxisa/test/kernel/element_wise/gelu/compile.all b/benchmark-linxisa/test/kernel/element_wise/gelu/compile.all index 84d901c..b1f3da1 100644 --- a/benchmark-linxisa/test/kernel/element_wise/gelu/compile.all +++ b/benchmark-linxisa/test/kernel/element_wise/gelu/compile.all @@ -7,7 +7,7 @@ # BF16, exact mode # ============================================ # 3D shape: (24, 8, 1024) -make TESTCASE=gelu DTYPE=__bf16 tMs=2048 gMs=24*8*1024 \ +make TESTCASE=gelu DTYPE=pto_bf16_t tMs=2048 gMs=24*8*1024 \ SHAPE_NAME=24_8_1024 Approximate=false diss # # 2D shape: (128, 1024) diff --git a/benchmark-linxisa/test/kernel/fa/Makefile b/benchmark-linxisa/test/kernel/fa/Makefile index c9b5c73..4b00c5b 100644 --- a/benchmark-linxisa/test/kernel/fa/Makefile +++ b/benchmark-linxisa/test/kernel/fa/Makefile @@ -14,12 +14,16 @@ ifeq ($(TESTCASE), fa_HIF4_HIF4) Tk = 128 X = 2 Y = 4 + qD = 128 + vD = 128 DEFINES += -DTsq=$(Sq) DEFINES += -DTskv=$(Skv) DEFINES += -DTm=$(Tm) DEFINES += -DTk=$(Tk) DEFINES += -DXdim=$(X) DEFINES += -DYdim=$(Y) + DEFINES += -DTqD=$(qD) + DEFINES += -DTvD=$(vD) ifneq ($(MODE), ) DEFINES += 
-D$(MODE) endif @@ -44,6 +48,8 @@ ifeq ($(TESTCASE), fa_2d_unroll) Tk = 128 X = 1 Y = 1 + qD = 128 + vD = 128 ifneq ($(X), ) X = $(X) endif @@ -56,7 +62,9 @@ ifeq ($(TESTCASE), fa_2d_unroll) DEFINES += -DTk=$(Tk) DEFINES += -DXdim=$(X) DEFINES += -DYdim=$(Y) - TARGET = $(ELF_HEAD)/$(TESTCASE)_Sq$(Sq)_Skv$(Skv)_Tm$(Tm)_Tk$(Tk)_X$(X)_Y$(Y).elf + DEFINES += -DTqD=$(qD) + DEFINES += -DTvD=$(vD) + TARGET = $(ELF_HEAD)/$(TESTCASE)_Sq$(Sq)_Skv$(Skv)_qD$(qD)_vD$(vD)_Tm$(Tm)_Tk$(Tk)_X$(X)_Y$(Y).elf endif include ../../common/Makefile.common diff --git a/benchmark-linxisa/test/kernel/fa/compile.all b/benchmark-linxisa/test/kernel/fa/compile.all index 52c7464..cad6bdf 100644 --- a/benchmark-linxisa/test/kernel/fa/compile.all +++ b/benchmark-linxisa/test/kernel/fa/compile.all @@ -5,24 +5,25 @@ # fa_2d_unroll: 2D unrolled flash attention # ============================================ # X=1, Y=2 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=2 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=1 Y=2 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=2 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=2 # X=1, Y=4 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=4 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=1 Y=4 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=4 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=4 # X=2, Y=2 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=2 Y=2 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=2 Y=2 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=2 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=2 # X=2, Y=4 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=2 Y=4 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=2 Y=4 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=4 +make 
TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=4 # ============================================ # fa_HIF4_HIF4: HIF4 quantized flash attention +# Requires block-vector launch migration before it is v0.57-active. # ============================================ -make TESTCASE=fa_HIF4_HIF4 MODE=BF16_NOGATHER Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=1 +# make TESTCASE=fa_HIF4_HIF4 MODE=BF16_NOGATHER Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=1 # make TESTCASE=fa_HIF4_HIF4 MODE=BF16x2 Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=1 # make TESTCASE=fa_HIF4_HIF4 MODE=BF16x2_NOGATHER Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=1 diff --git a/benchmark-linxisa/test/kernel/fa/src/fa_2d_unroll.cpp b/benchmark-linxisa/test/kernel/fa/src/fa_2d_unroll.cpp index d57e0e2..b7024b0 100644 --- a/benchmark-linxisa/test/kernel/fa/src/fa_2d_unroll.cpp +++ b/benchmark-linxisa/test/kernel/fa/src/fa_2d_unroll.cpp @@ -18,8 +18,17 @@ #define Skv Tskv #endif +#ifndef TqD #define qD 128 +#else +#define qD TqD +#endif + +#ifndef TvD #define vD 128 +#else +#define vD TvD +#endif #ifndef Tm #define kTm 128 diff --git a/benchmark-linxisa/test/kernel/gather/compile.all b/benchmark-linxisa/test/kernel/gather/compile.all index f891426..588626f 100755 --- a/benchmark-linxisa/test/kernel/gather/compile.all +++ b/benchmark-linxisa/test/kernel/gather/compile.all @@ -25,4 +25,4 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/outpu # Power-of-2 dimensions make TESTCASE=gather COMPILER_DIR=$COMPILER_DIR DType=__fp32 OType=uint32_t \ gKs=131072 gMs=32 gNs=256 \ - tMs=32 tNs=128 + tMs=32 tNs=32 diff --git a/benchmark-linxisa/test/kernel/matmul/compile.all b/benchmark-linxisa/test/kernel/matmul/compile.all index f48dbfb..3f158c3 100755 --- a/benchmark-linxisa/test/kernel/matmul/compile.all +++ b/benchmark-linxisa/test/kernel/matmul/compile.all @@ -23,39 +23,39 @@ # ============================================ # HIF4_HIF4: MX_NOGATHER variant - 2 configs # 
============================================ -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=256 N=2048 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=512 N=1280 K=4096 tM=128 tN=128 tK=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=256 N=2048 K=2048 tM=32 tN=32 tK=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=512 N=1280 K=4096 tM=32 tN=32 tK=128 # ============================================ # HIF4_HIF4: MX_NOGATHER_REUSEA variant - 2 configs # ============================================ -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=256 N=2048 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=512 N=1280 K=4096 tM=128 tN=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=256 N=2048 K=2048 tM=32 tN=32 tK=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=512 N=1280 K=4096 tM=32 tN=32 tK=128 # ============================================ # A16W4: BF16 x FP4 mixed precision - 3 configs # ============================================ -make TESTCASE=matmul TYPE=A16W4 M=256 N=2048 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=A16W4 M=512 N=1280 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=A16W4 M=512 N=512 K=4096 tM=128 tN=256 tK=128 +make TESTCASE=matmul TYPE=A16W4 M=256 N=2048 K=2048 tM=16 tN=16 tK=128 +make TESTCASE=matmul TYPE=A16W4 M=512 N=1280 K=2048 tM=16 tN=16 tK=128 +make TESTCASE=matmul TYPE=A16W4 M=512 N=512 K=4096 tM=16 tN=16 tK=128 # ============================================ # MASK: Generic matmul variants - 8 configs # ============================================ # FP32 variants -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32 M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEA M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEB M=256 N=256 K=256 tM=64 tN=64 tK=64 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32 
M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEA M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEB M=256 N=256 K=256 tM=32 tN=32 tK=32 # 黄区能过,蓝区过不了 # make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_DYNAMIC M=256 N=256 K=256 tM=64 tN=64 tK=64 # FP16 variants -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16 M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEA M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEB M=256 N=256 K=256 tM=64 tN=64 tK=64 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16 M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEA M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEB M=256 N=256 K=256 tM=32 tN=32 tK=32 # FP8 variants -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8 M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEA M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEB M=256 N=256 K=256 tM=64 tN=64 tK=64 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8 M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEA M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEB M=256 N=256 K=256 tM=32 tN=32 tK=32 # make TESTCASE=matmul TYPE=MASK MODE=MX_FP8 M=256 N=256 K=256 tM=64 tN=64 tK=64 diff --git a/benchmark-linxisa/test/kernel/matmul/src/A16W4.cpp b/benchmark-linxisa/test/kernel/matmul/src/A16W4.cpp index e6af690..b4a2906 100644 --- a/benchmark-linxisa/test/kernel/matmul/src/A16W4.cpp +++ b/benchmark-linxisa/test/kernel/matmul/src/A16W4.cpp @@ -41,7 +41,7 @@ int main() { // bf16*fp4 using fp4_t = __fp4_e2m1x2; - using bf16_t = __bf16; + using bf16_t = pto_bf16_t; static_assert(tilM % 2 == 0); // 暂时假定tile是偶数的,方便取地址,奇数tile实现需要末尾padding 0对齐地址 static_assert(tilN % 2 == 0); static_assert(tilK == 
(128)); @@ -68,4 +68,4 @@ int main() { #endif return 0; -} \ No newline at end of file +} diff --git a/benchmark-linxisa/test/kernel/reduction/reducemax_col/compile.all b/benchmark-linxisa/test/kernel/reduction/reducemax_col/compile.all index d54fa35..8f8a50c 100755 --- a/benchmark-linxisa/test/kernel/reduction/reducemax_col/compile.all +++ b/benchmark-linxisa/test/kernel/reduction/reducemax_col/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=256 tN_s=64 \ +make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=2048 gN_s=64 # Large matrix, __half -#make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=256 tN_s=64 \ +#make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ # gM_s=2048 gN_s=64 diff --git a/benchmark-linxisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp b/benchmark-linxisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp index a9cb6ea..dc767c9 100644 --- a/benchmark-linxisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp +++ b/benchmark-linxisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp @@ -11,28 +11,30 @@ #endif -#ifndef tM -#define tM 32 +#ifndef tMs +#define tMs 32 #endif -#ifndef tN -#define tN 128 +#ifndef tNs +#define tNs 128 #endif +#ifndef gIMs +#define gIMs 256 +#endif - - -#define gIM 256 -#define gIN 256 +#ifndef gINs +#define gINs 256 +#endif // ============================================================================ // main // ============================================================================ int main() { using dtype = DType; - dtype input_buf[gIM*gIN]; + dtype input_buf[gIMs*gINs]; // dtype zero_buf[1*gIN]; - dtype output_buf[1*gIN]; + dtype output_buf[1*gINs]; dtype* input=input_buf; // dtype* zero=zero_buf; @@ -42,8 
+44,7 @@ int main() { // readBinaryFile("/remote/lms01/q50057645/jcore_project/JanusCoreBench/test/ascpp/reducemax_col/src/data_8192x1024.bin", (uint8_t*)input, gIM * gIN * sizeof(dtype)); // readBinaryFile("/remote/lms01/q50057645/jcore_project/JanusCoreBench/test/ascpp/reducemax_col/src/data1x256_zero.bin", (uint8_t*)zero, gIN * sizeof(dtype)); // reducesum_colsum_rand(input, output); - reducemax_col_rand(input, output); + reducemax_col_rand(input, output); // writeBinaryFile("/remote/lms01/q50057645/jcore_project/JanusCoreBench/test/ascpp/reducemax_col/src/result_max.bin", (uint8_t*)output, gIN * sizeof(dtype)); } - diff --git a/benchmark-linxisa/test/kernel/reduction/reducemax_row/compile.all b/benchmark-linxisa/test/kernel/reduction/reducemax_row/compile.all index 92bfeed..8a0e379 100755 --- a/benchmark-linxisa/test/kernel/reduction/reducemax_row/compile.all +++ b/benchmark-linxisa/test/kernel/reduction/reducemax_row/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=512 \ +make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=16 gN_s=8192 # Large matrix, __half (DISABLED: Tile alignment < 32 bytes for _Float16 with Cols=8) -# make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=512 \ +# make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ # gM_s=16 gN_s=8192 diff --git a/benchmark-linxisa/test/kernel/reduction/reducesum_col/compile.all b/benchmark-linxisa/test/kernel/reduction/reducesum_col/compile.all index c9daba1..d3b6098 100755 --- a/benchmark-linxisa/test/kernel/reduction/reducesum_col/compile.all +++ b/benchmark-linxisa/test/kernel/reduction/reducesum_col/compile.all @@ -4,9 +4,9 @@ 
COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=256 tN_s=64 \ +make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=2048 gN_s=64 # Large matrix, __half -make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=256 tN_s=64 \ +make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ gM_s=2048 gN_s=64 diff --git a/benchmark-linxisa/test/kernel/reduction/reducesum_row/compile.all b/benchmark-linxisa/test/kernel/reduction/reducesum_row/compile.all index 4bfab51..8f360f3 100755 --- a/benchmark-linxisa/test/kernel/reduction/reducesum_row/compile.all +++ b/benchmark-linxisa/test/kernel/reduction/reducesum_row/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=512 \ +make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=16 gN_s=8192 # Large matrix, __half (DISABLED: Tile alignment < 32 bytes for _Float16 with Cols=8) -# make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=512 \ +# make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ # gM_s=16 gN_s=8192 diff --git a/benchmark-linxisa/test/kernel/sort/topk/data_obj/build_data_obj.sh b/benchmark-linxisa/test/kernel/sort/topk/data_obj/build_data_obj.sh index 8128f1e..4644e7b 100755 --- a/benchmark-linxisa/test/kernel/sort/topk/data_obj/build_data_obj.sh +++ b/benchmark-linxisa/test/kernel/sort/topk/data_obj/build_data_obj.sh @@ -1,5 +1,5 @@ #!/bin/bash -COMPILER_DIR="${COMPILER_DIR:-/remote/lms01/j00827727/jcore/compilers/linx_blockisa_llvm_musl0.56.16/bin}" +: "${COMPILER_DIR:?COMPILER_DIR must point to the Linx 
Clang bin directory}" DATA_OBJ_DIR="$1" OUTPUT_DIR="$2" @@ -25,10 +25,10 @@ _binary_${name}_data_end: .equ _binary_${name}_data_size, .-_binary_${name}_data_start EOF - $COMPILER_DIR/clang++ -target linx64v5 -c "$asm_file" -o "$obj_file" + "$COMPILER_DIR/clang++" -mlxbc -c "$asm_file" -o "$obj_file" } build_one "input_131072" build_one "top_2048_out" -echo "Done building data object files" \ No newline at end of file +echo "Done building data object files" diff --git a/benchmark-linxisa/test/kernel/sort/topk/topk.cpp b/benchmark-linxisa/test/kernel/sort/topk/topk.cpp index 1f579fd..0856747 100644 --- a/benchmark-linxisa/test/kernel/sort/topk/topk.cpp +++ b/benchmark-linxisa/test/kernel/sort/topk/topk.cpp @@ -42,26 +42,15 @@ int main() { fflush(stdout); #endif - // ------------------------------------------------------------------------- - // Phase 1: SIMT high8 histogram (1 block × 256 lanes, each lane = 1 bucket) - // ------------------------------------------------------------------------- - TileU32 high8HistTile; - TEXPANDSCALAR(high8HistTile, static_cast(0)); - ExtractHigh8Hist_Impl< TileU32 >(high8HistTile, g_input); - - // Copy histogram results out and reduce to global 256-bin histogram - using HistGT = GlobalTensor, Stride<1,1,1,16,1>>; - uint32_t histResult[256]; - HistGT histGlobal(histResult); - TCOPYOUT(histGlobal, high8HistTile); - uint32_t global_high8_hist[256] = {0}; - for (int b = 0; b < 256; b++) { - global_high8_hist[b] = histResult[b]; + for (int i = 0; i < kInputCount; i++) { + uint16_t val = g_input[i]; + uint8_t high8 = static_cast(val >> 8); + global_high8_hist[high8] += 1; } #ifndef FOR_GFSIM - printf("\nPhase 1: high8 histograms built (1 SIMT launch, 256 lanes).\n"); + printf("\nPhase 1: high8 histogram built.\n"); fflush(stdout); #endif @@ -82,21 +71,14 @@ int main() { fflush(stdout); #endif - // ------------------------------------------------------------------------- - // Phase 3: SIMT low8 histogram for kth_bin elements - // 
------------------------------------------------------------------------- - TileU32 low8HistTile; - TEXPANDSCALAR(low8HistTile, static_cast(0)); - ExtractLow8HistForKthBin_Impl< TileU32 >(low8HistTile, g_input, - static_cast(kth_bin)); - - uint32_t low8HistResult[256]; - HistGT low8HistGlobal(low8HistResult); - TCOPYOUT(low8HistGlobal, low8HistTile); - uint32_t global_low8_hist_kth[256] = {0}; - for (int b = 0; b < 256; b++) { - global_low8_hist_kth[b] = low8HistResult[b]; + for (int i = 0; i < kInputCount; i++) { + uint16_t val = g_input[i]; + uint8_t high8 = static_cast(val >> 8); + if (high8 == static_cast(kth_bin)) { + uint8_t low8 = static_cast(val & 0xFF); + global_low8_hist_kth[low8] += 1; + } } // ------------------------------------------------------------------------- diff --git a/benchmark-ptoisa/compile_all.sh b/benchmark-ptoisa/compile_all.sh index e677a82..5622fb5 100755 --- a/benchmark-ptoisa/compile_all.sh +++ b/benchmark-ptoisa/compile_all.sh @@ -4,15 +4,19 @@ # Don't use set -e as some operators may fail to compile SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -export COMPILER_DIR=${COMPILER_DIR:-/path/to/pto-toolchain/bin} +LINX_ISA_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export COMPILER_DIR=${COMPILER_DIR:-$LINX_ISA_ROOT/compiler/llvm/build-linxisa-clang/bin} REPO_ROOT=${REPO_ROOT:-$SCRIPT_DIR} echo "==========================================" echo "[PTO ISA] Starting full compilation" echo "REPO_ROOT: $REPO_ROOT" +echo "COMPILER_DIR: $COMPILER_DIR" echo "==========================================" # Function to compile an operator +FAILURES=0 + compile_operator() { local operator_path=$1 local operator_name=$2 @@ -25,10 +29,15 @@ compile_operator() { if [ ! -d "$operator_path" ]; then echo "Warning: Directory not found: $operator_path" + FAILURES=$((FAILURES + 1)) return 1 fi - cd "$operator_path" + if ! 
cd "$operator_path"; then + echo "✗ $operator_name compilation failed: cannot enter directory" + FAILURES=$((FAILURES + 1)) + return 1 + fi if [ -f "compile.all" ]; then echo "Running compile.all with baremetal=${baremetal:-off}..." @@ -37,9 +46,12 @@ compile_operator() { echo "✓ $operator_name compilation completed" else echo "✗ $operator_name compilation failed" + FAILURES=$((FAILURES + 1)) + return 1 fi else echo "Warning: No compile.all found in $operator_path" + FAILURES=$((FAILURES + 1)) return 1 fi } @@ -67,3 +79,8 @@ echo "" echo "Generated ELF files:" find "$REPO_ROOT/output" -name "*.elf" -type f | wc -l echo "ELF files are located in: $REPO_ROOT/output/" + +if [ "$FAILURES" -ne 0 ]; then + echo "Compilation failures: $FAILURES" + exit 1 +fi diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast.hpp new file mode 100644 index 0000000..75082ee --- /dev/null +++ b/benchmark-ptoisa/kernels/broadcast/broadcast.hpp @@ -0,0 +1,112 @@ +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_HPP + +#include + +#include +#include +#include + +using namespace pto; + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t *in_shape, + const size_t *out_shape) { + constexpr size_t kBCast = gOM / gIM; + constexpr size_t kElementsPerTile = 512; + constexpr size_t kFullTiles = gOM / kElementsPerTile; + constexpr size_t kTail = gOM % kElementsPerTile; + + static_assert(MAX_DIM >= IN_DIM && MAX_DIM >= OUT_DIM, + "MAX_DIM must cover input and output ranks"); + static_assert(gOM % gIM == 0, + "broadcast output element count must be a multiple of input"); + static_assert(kBCast > 0, "broadcast factor must be positive"); + (void)kTileHint; + + size_t inner = 1; + size_t in_dim = IN_DIM; + size_t out_dim = OUT_DIM; + while (in_dim > 0 && out_dim > 0 && + in_shape[in_dim - 1] == out_shape[out_dim - 1]) { + inner *= out_shape[out_dim - 1]; + --in_dim; + --out_dim; + } + + using 
InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + using DataTile = Tile; + using OffsetTile = Tile; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputIterator output_iter(out_ptr); + + auto emit_tile = [&](auto &data_tile, auto &linear, auto &batch, + auto &batch_base, auto &inner_q, auto &inner_base, + auto &inner_idx, auto &inner_bytes, auto &offset, + std::uint32_t base, auto output_addr) { + using LinearTile = std::remove_reference_t; + const auto inner_u32 = static_cast(inner); + const auto group_u32 = static_cast(kBCast * inner); + TCI(linear, base); + + TDIVS(batch, linear, group_u32); + TMULS(batch_base, batch, + static_cast(inner * sizeof(dtype))); + + TDIVS(inner_q, linear, inner_u32); + TMULS(inner_base, inner_q, inner_u32); + TSUB(inner_idx, linear, inner_base); + TMULS(inner_bytes, inner_idx, static_cast(sizeof(dtype))); + + TADD(offset, batch_base, inner_bytes); + MGATHER(data_tile, input_global, offset); + TSTORE(output_addr, data_tile); + }; + + for (size_t tile = 0; tile < kFullTiles; ++tile) { + DataTile data_tile; + OffsetTile linear; + OffsetTile batch; + OffsetTile batch_base; + OffsetTile inner_q; + OffsetTile inner_base; + OffsetTile inner_idx; + OffsetTile inner_bytes; + OffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, + inner_idx, inner_bytes, offset, + static_cast(tile * kElementsPerTile), + output_iter(0, static_cast(tile))); + } + + if constexpr (kTail != 0) { + using TailDataTile = Tile; + using TailOffsetTile = + Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailDataTile data_tile; + TailOffsetTile linear; + TailOffsetTile batch; + TailOffsetTile batch_base; + TailOffsetTile inner_q; + TailOffsetTile inner_base; + TailOffsetTile inner_idx; + TailOffsetTile inner_bytes; + TailOffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, + inner_idx, 
inner_bytes, offset, + static_cast(kFullTiles * kElementsPerTile), + tail_output_iter(0, static_cast(kFullTiles))); + } +} + +#endif // SUPERNPUBENCH_PTOISA_BROADCAST_HPP diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_pto.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_pto.hpp index 1fb7ab6..cfaf699 100644 --- a/benchmark-ptoisa/kernels/broadcast/broadcast_pto.hpp +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_pto.hpp @@ -190,7 +190,7 @@ void gen_offset_pto( // ---------------------------------------------------------------------------- // broadcast: 接口与原 broadcast.hpp 一致 // ---------------------------------------------------------------------------- -template +template void broadcast( dtype *in_ptr, dtype *out_ptr, diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019.hpp new file mode 100644 index 0000000..e6332b6 --- /dev/null +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019.hpp @@ -0,0 +1,99 @@ +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_VEC_019_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_VEC_019_HPP + +#include + +#include +#include +#include + +using namespace pto; + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t *, const size_t *) { + constexpr size_t kBCast = gOM / gIM; + constexpr size_t kElementsPerTile = 512; + constexpr size_t kFullTiles = gOM / kElementsPerTile; + constexpr size_t kTail = gOM % kElementsPerTile; + + static_assert(MAX_DIM >= IN_DIM && MAX_DIM >= OUT_DIM, + "MAX_DIM must cover input and output ranks"); + static_assert(gOM % gIM == 0, + "broadcast output element count must be a multiple of input"); + static_assert(gIM % kInner == 0, + "input element count must be divisible by kInner"); + static_assert(kBCast > 0, "broadcast factor must be positive"); + static_assert(kTileBatch > 0, "kTileBatch must be positive"); + + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + using DataTile = Tile; + using 
OffsetTile = Tile; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_ptr); + OutputIterator output_iter(out_ptr); + + auto emit_tile = [&](auto &data_tile, auto &linear, auto &batch, + auto &batch_base, auto &inner_q, auto &inner_base, + auto &inner, auto &inner_bytes, auto &offset, + std::uint32_t base, auto output_addr) { + using LinearTile = std::remove_reference_t; + TCI(linear, base); + + TDIVS(batch, linear, static_cast(kBCast * kInner)); + TMULS(batch_base, batch, + static_cast(kInner * sizeof(dtype))); + + TDIVS(inner_q, linear, static_cast(kInner)); + TMULS(inner_base, inner_q, static_cast(kInner)); + TSUB(inner, linear, inner_base); + TMULS(inner_bytes, inner, static_cast(sizeof(dtype))); + + TADD(offset, batch_base, inner_bytes); + MGATHER(data_tile, input_global, offset); + TSTORE(output_addr, data_tile); + }; + + for (size_t tile = 0; tile < kFullTiles; ++tile) { + DataTile data_tile; + OffsetTile linear; + OffsetTile batch; + OffsetTile batch_base; + OffsetTile inner_q; + OffsetTile inner_base; + OffsetTile inner; + OffsetTile inner_bytes; + OffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, inner, + inner_bytes, offset, + static_cast(tile * kElementsPerTile), + output_iter(0, static_cast(tile))); + } + + if constexpr (kTail != 0) { + using TailDataTile = Tile; + using TailOffsetTile = + Tile; + TailDataTile data_tile; + TailOffsetTile linear; + TailOffsetTile batch; + TailOffsetTile batch_base; + TailOffsetTile inner_q; + TailOffsetTile inner_base; + TailOffsetTile inner; + TailOffsetTile inner_bytes; + TailOffsetTile offset; + emit_tile(data_tile, linear, batch, batch_base, inner_q, inner_base, inner, + inner_bytes, offset, + static_cast(kFullTiles * kElementsPerTile), + output_iter(0, static_cast(kFullTiles))); + } +} + +#endif diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019_pto.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019_pto.hpp index 908e75f..d1aa674 
100644 --- a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019_pto.hpp +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_019_pto.hpp @@ -21,8 +21,8 @@ // │ │ 二层实现 │ 当前编译器名 TCOPYIN; │ // │ │ │ jcore/TCopyIn.hpp 用 __vec__ 实现 │ // ├──────────┼──────────────────┼──────────────────────────────────────────┤ -// │ TINSERT │ 完全缺失 │ pto_tileop.hpp 中无此 API; │ -// │ │ │ 当前编译器无 TINSERT 实现 │ +// │ TINSERT │ v0.57 supported │ TileOP API lowers to BSTART.FIXP │ +// │ │ │ with FIXP.Function.TINSERT │ // │ │ │ (仅有反向操作 TEXTRACT) │ // ├──────────┼──────────────────┼──────────────────────────────────────────┤ // │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE; │ @@ -79,7 +79,7 @@ // kInner - inner dimension K (e.g. 49, need not be power-of-2) // ===================================================================== -template void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t * /*in_shape*/, const size_t * /*out_shape*/) { @@ -121,7 +121,7 @@ void broadcast(dtype *in_ptr, dtype *out_ptr, // TINSERT × kBCast: 将输入 tile 插入输出 tile 的 N 个列偏移 // 每次 TINSERT 写入 kInner 列, N 次互不重叠, 合起来填满 N*kInner 列 - // [当前编译器] 完全缺失! pto_tileop.hpp 无 TINSERT API + // v0.57 lowers TINSERT through the FIXP tile block. 
#pragma clang loop unroll(full) for (size_t c = 0; c < kBCast; c++) { TINSERT(outTile, inTile, /*indexRow=*/0, /*indexCol=*/(uint16_t)(c * kInner)); diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039.hpp new file mode 100644 index 0000000..a918659 --- /dev/null +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039.hpp @@ -0,0 +1,6 @@ +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_VEC_039_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_VEC_039_HPP + +#include "broadcast_vec_019.hpp" + +#endif diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039_pto.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039_pto.hpp index edc99a8..38894d8 100644 --- a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039_pto.hpp +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_039_pto.hpp @@ -22,8 +22,8 @@ // │ │ 二层实现 │ 当前编译器名 TCOPYIN; │ // │ │ │ jcore/TCopyIn.hpp 用 __vec__ 实现 │ // ├──────────┼──────────────────┼──────────────────────────────────────────┤ -// │ TINSERT │ 完全缺失 │ pto_tileop.hpp 中无此 API; │ -// │ │ │ 当前编译器无 TINSERT 实现 │ +// │ TINSERT │ v0.57 supported │ TileOP API lowers to BSTART.FIXP │ +// │ │ │ with FIXP.Function.TINSERT │ // │ │ │ (仅有反向操作 TEXTRACT) │ // ├──────────┼──────────────────┼──────────────────────────────────────────┤ // │ TSTORE │ API 有(名不同), │ PTO ISA 名 TSTORE; │ @@ -80,7 +80,7 @@ // kInner - inner dimension K, power-of-2 (e.g. 16) // ===================================================================== -template void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t * /*in_shape*/, const size_t * /*out_shape*/) { @@ -122,7 +122,7 @@ void broadcast(dtype *in_ptr, dtype *out_ptr, // TINSERT × kBCast: 将输入 tile 插入输出 tile 的 N 个列偏移 // 每次 TINSERT 写入 kInner 列, N 次互不重叠, 合起来填满 N*kInner 列 - // [当前编译器] 完全缺失! pto_tileop.hpp 无 TINSERT API + // v0.57 lowers TINSERT through the FIXP tile block. 
#pragma clang loop unroll(full) for (size_t c = 0; c < kBCast; c++) { TINSERT(outTile, inTile, /*indexRow=*/0, /*indexCol=*/(uint16_t)(c * kInner)); diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07.hpp new file mode 100644 index 0000000..e4b8cf6 --- /dev/null +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07.hpp @@ -0,0 +1,69 @@ +#ifndef SUPERNPUBENCH_PTOISA_BROADCAST_VEC_07_HPP +#define SUPERNPUBENCH_PTOISA_BROADCAST_VEC_07_HPP + +#include + +#include + +using namespace pto; + +template +void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t *, const size_t *) { + constexpr size_t kN = gIM; + constexpr size_t kC = gOM / gIM; + constexpr size_t kTileCols = kC; + constexpr size_t kMaxRowsByBytes = + (4096 / (kTileCols * sizeof(dtype))) == 0 + ? 1 + : (4096 / (kTileCols * sizeof(dtype))); + constexpr size_t kRowsPerTile = + kTileRows < kMaxRowsByBytes ? kTileRows : kMaxRowsByBytes; + + static_assert(MAX_DIM >= IN_DIM && MAX_DIM >= OUT_DIM, + "MAX_DIM must cover input and output ranks"); + static_assert(gOM % gIM == 0, + "broadcast output element count must be a multiple of input"); + static_assert(kRowsPerTile > 0, "broadcast tile must contain at least one row"); + + using InputGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + using InputTile = Tile; + using OutputTile = Tile; + using InputIterator = global_iterator; + using OutputIterator = global_iterator; + + InputIterator input_iter(in_ptr); + OutputIterator output_iter(out_ptr); + + constexpr size_t kFullTiles = kN / kRowsPerTile; + constexpr size_t kTail = kN % kRowsPerTile; + + for (size_t tile = 0; tile < kFullTiles; ++tile) { + InputTile input_tile; + OutputTile output_tile; + TLOAD(input_tile, input_iter(static_cast(tile), 0)); + TROWEXPAND(output_tile, input_tile); + TSTORE(output_iter(static_cast(tile), 0), output_tile); + } + + if constexpr (kTail != 0) { + using TailInputTile = Tile; + using 
TailOutputTile = Tile; + using TailInputIterator = global_iterator; + using TailOutputIterator = global_iterator; + TailInputIterator tail_input_iter(in_ptr); + TailOutputIterator tail_output_iter(out_ptr); + TailInputTile input_tile; + TailOutputTile output_tile; + TLOAD(input_tile, tail_input_iter(static_cast(kFullTiles), 0)); + TROWEXPAND(output_tile, input_tile); + TSTORE(tail_output_iter(static_cast(kFullTiles), 0), output_tile); + } +} + +#endif // SUPERNPUBENCH_PTOISA_BROADCAST_VEC_07_HPP diff --git a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07_pto.hpp b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07_pto.hpp index ee43b44..550a9f4 100644 --- a/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07_pto.hpp +++ b/benchmark-ptoisa/kernels/broadcast/broadcast_vec_07_pto.hpp @@ -70,7 +70,7 @@ // kTileRows - rows per tile, must be power-of-2 (e.g. 1,2,4,..,64) // ===================================================================== -template void broadcast(dtype *in_ptr, dtype *out_ptr, const size_t * /*in_shape*/, const size_t * /*out_shape*/) { diff --git a/benchmark-ptoisa/kernels/concat/concat_gather.hpp b/benchmark-ptoisa/kernels/concat/concat_gather.hpp index e542559..94c738a 100644 --- a/benchmark-ptoisa/kernels/concat/concat_gather.hpp +++ b/benchmark-ptoisa/kernels/concat/concat_gather.hpp @@ -2,7 +2,6 @@ #define CONCAT_GATHER_KERNEL_HPP #include -#include #include #include @@ -22,7 +21,7 @@ using namespace pto; * @tparam DATA_DIM 数据维度数(编译期常量) * @tparam CONCAT_DIM 拼接维度索引(编译期常量) */ -template void concat_gather( DType *in_ptr, diff --git a/benchmark-ptoisa/kernels/concat/concat_scatter.hpp b/benchmark-ptoisa/kernels/concat/concat_scatter.hpp index 142f130..2846f33 100644 --- a/benchmark-ptoisa/kernels/concat/concat_scatter.hpp +++ b/benchmark-ptoisa/kernels/concat/concat_scatter.hpp @@ -2,7 +2,6 @@ #define CONCAT_SCATTER_KERNEL_HPP #include -#include #include #include @@ -25,7 +24,7 @@ using namespace pto; * @tparam DATA_DIM 数据维度数(编译期常量) * @tparam 
CONCAT_DIM 拼接维度索引(编译期常量) */ -template void concat_scatter( DType *in_ptr, diff --git a/benchmark-ptoisa/kernels/control/hashtable_lookup_simd.hpp b/benchmark-ptoisa/kernels/control/hashtable_lookup_simd.hpp index 19659f3..6f273bc 100644 --- a/benchmark-ptoisa/kernels/control/hashtable_lookup_simd.hpp +++ b/benchmark-ptoisa/kernels/control/hashtable_lookup_simd.hpp @@ -1,6 +1,7 @@ #ifndef HASHTABLE_LOOKUP_SIMD_HPP #define HASHTABLE_LOOKUP_SIMD_HPP +#include #include #include "template_asm.h" // MGATHER #include diff --git a/benchmark-ptoisa/kernels/element_wise/gelu.hpp b/benchmark-ptoisa/kernels/element_wise/gelu.hpp new file mode 100644 index 0000000..76cd3e3 --- /dev/null +++ b/benchmark-ptoisa/kernels/element_wise/gelu.hpp @@ -0,0 +1,77 @@ +#ifndef SUPERNPUBENCH_PTOISA_GELU_HPP +#define SUPERNPUBENCH_PTOISA_GELU_HPP + +#include + +#include + +using namespace pto; + +template +inline void gelu_tile(TileT &dst, TileT &src) { + TileT t; + TileT t2; + TileT p; + TileT tmp; + TileT denom; + TileT recip; + + TMAXS(t, src, -5.75f); + TMINS(t, t, 5.75f); + TMUL(t2, t, t); + + TMULS(p, t2, -3.5123395303315874e-09f); + TADDS(p, p, 2.6452661927578447e-07f); + TMUL(p, p, t2); + TADDS(p, p, -7.9294877650681883e-06f); + TMUL(p, p, t2); + TADDS(p, p, 1.1061238183174282e-04f); + TMUL(p, p, t2); + TADDS(p, p, 6.5189960878342390e-05f); + TMUL(p, p, t2); + TADDS(p, p, -7.2666168212890625e-02f); + TMUL(p, p, t2); + TADDS(p, p, -1.5957698822021484e+00f); + + TMUL(tmp, t, p); + TEXP(tmp, tmp); + TADDS(denom, tmp, 1.0f); + TRECIP(recip, denom); + TMUL(dst, src, recip); +} + +template +void gelu(dtype *in_ptr, dtype *out_ptr, bool = false) { + constexpr int kFullTiles = gM / tM; + constexpr int kTail = gM % tM; + + using Global = global_tensor>; + using TileT = Tile; + using Iterator = global_iterator; + + Iterator input_iter(in_ptr); + Iterator output_iter(out_ptr); + + for (int tile = 0; tile < kFullTiles; ++tile) { + TileT input_tile; + TileT output_tile; + TLOAD(input_tile, 
input_iter(0, tile)); + gelu_tile(output_tile, input_tile); + TSTORE(output_iter(0, tile), output_tile); + } + + if constexpr (kTail != 0) { + using TailTile = + Tile; + using TailIterator = global_iterator; + TailIterator tail_input_iter(in_ptr); + TailIterator tail_output_iter(out_ptr); + TailTile input_tile; + TailTile output_tile; + TLOAD(input_tile, tail_input_iter(0, kFullTiles)); + gelu_tile(output_tile, input_tile); + TSTORE(tail_output_iter(0, kFullTiles), output_tile); + } +} + +#endif // SUPERNPUBENCH_PTOISA_GELU_HPP diff --git a/benchmark-ptoisa/kernels/fa/fa_2d_unroll.hpp b/benchmark-ptoisa/kernels/fa/fa_2d_unroll.hpp index 296e6e9..709e203 100644 --- a/benchmark-ptoisa/kernels/fa/fa_2d_unroll.hpp +++ b/benchmark-ptoisa/kernels/fa/fa_2d_unroll.hpp @@ -1,6 +1,10 @@ #include "fa_utils.h" #include "fa_fp4_utils.h" +using namespace pto; +using namespace pto::blkv; +using pto::type_traits; + #ifndef Xdim #define Xdim 2 #endif @@ -13,6 +17,11 @@ #define __vbuf__ #endif +#ifndef BLKC_ASSIGN_CAST +#define BLKC_ASSIGN_CAST(tile, idx, value) \ + (pto::blkv::blkv_get_tile_ptr(tile)[(idx)] = (value)) +#endif + template void __vec__ new_max_1src( typename tileMax::TileDType __out__ scale, @@ -797,7 +806,6 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype TEXPANDSCALAR(tSum[x], 0); } - tileO_out tPV_out; tileO tO[Xdim], tPV[Xdim]; tileScale tScale[Xdim]; @@ -840,7 +848,8 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype #if Ydim == 1 #pragma clang loop unroll(full) for(int x=0;x<<>>( + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + flashsoftmax_dn_mout_cast_kernel( tScale[x].data(), tNewMax[x].data(), tNewSum[x].data(), @@ -849,16 +858,19 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype tMax[x].data(), tSum[x].data(), scale); + }); } #elif Ydim == 2 #pragma clang loop unroll(full) for(int x=0;x<<>>( + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_2src( 
tScale[x].data(), tNewMax[x].data(), tW[x][0].data(), tW[x][1].data(), tMax[x].data(), scale); + }); // src_exp_2src<<>>( // tExpW[x][0].data(), tExpW[x][1].data(), // tW[x][0].data(), tW[x][1].data(), @@ -870,39 +882,49 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype // tSum[x].data(), // tScale[x].data() // ); - src_exp_2src_with_new_sum<<>>( + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_new_sum( tNewSum[x].data(), tExpW[x][0].data(), tExpW[x][1].data(), tW[x][0].data(), tW[x][1].data(), tNewMax[x].data(), tSum[x].data(), tScale[x].data(), scale); + }); } #elif Ydim == 4 tileSum tLocalSum[Xdim][2]; #pragma clang loop unroll(full) for(int x=0;x<<>>( + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_4src( tScale[x].data(), tNewMax[x].data(), tW[x][0].data(), tW[x][1].data(), tW[x][2].data(), tW[x][3].data(), tMax[x].data(), scale); + }); // src_exp_4src<<>>( // tExpW[x][0].data(), tExpW[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), // tW[x][0].data(), tW[x][1].data(), tW[x][2].data(), tW[x][3].data(), // tNewMax[x].data(), // scale); - src_exp_2src_with_local_sum<<>>(tLocalSum[x][0].data(), tExpW[x][0].data(), tExpW[x][1].data(), + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_local_sum(tLocalSum[x][0].data(), tExpW[x][0].data(), tExpW[x][1].data(), tW[x][0].data(), tW[x][1].data(), tNewMax[x].data(), scale); - src_exp_2src_with_local_sum<<>>(tLocalSum[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), + }); + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_local_sum(tLocalSum[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), tW[x][2].data(), tW[x][3].data(), tNewMax[x].data(), scale); + }); // new_sum_4src<<>>( // tNewSum[x].data(), // tExpW[x][0].data(), tExpW[x][1].data(), tExpW[x][2].data(), tExpW[x][3].data(), // tSum[x].data(), // tScale[x].data() // ); - new_sum_of_2_loc_sum<<>>(tNewSum[x].data(), tLocalSum[x][0].data(), 
tLocalSum[x][1].data(), tSum[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + new_sum_of_2_loc_sum(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tSum[x].data(), tScale[x].data()); + }); } #elif Ydim == 8 tileMax tLocalMax[Xdim][2]; @@ -912,17 +934,25 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype for(int x=0;x<<>>(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + local_max_4src(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + }); } - new_max_of_2_loc_max<<>>(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tMax[x].data()); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_of_2_loc_max(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tMax[x].data()); + }); #pragma clang loop unroll(full) for(int k=0;k<4;k++){ - src_exp_2src_with_local_sum<<>>(tLocalSum[x][k].data(), tExpW[x][2*k].data(), tExpW[x][2*k+1].data(), + pto::blkv::blkv_for_1d(tileW::ValidRow, [&] { + src_exp_2src_with_local_sum(tLocalSum[x][k].data(), tExpW[x][2*k].data(), tExpW[x][2*k+1].data(), tW[x][2*k].data(), tW[x][2*k+1].data(), tNewMax[x].data(), scale); + }); } - new_sum_of_4_loc_sum<<>>(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + new_sum_of_4_loc_sum(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + }); } #elif Ydim == 16 tileMax tLocalMax[Xdim][4]; @@ -931,25 +961,35 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype #pragma clang loop unroll(full) for(int 
x=0;x<<>>(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + local_max_4src(tLocalMax[x][k].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), scale); + }); } - new_max_of_4_loc_max<<>>(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tLocalMax[x][2].data(), tLocalMax[x][3].data(), tMax[x].data()); + pto::blkv::blkv_for_1d(tileMax::ValidRow, [&] { + new_max_of_4_loc_max(tScale[x].data(), tNewMax[x].data(), tLocalMax[x][0].data(), tLocalMax[x][1].data(), tLocalMax[x][2].data(), tLocalMax[x][3].data(), tMax[x].data()); + }); #pragma clang loop unroll(full) for(int k=0;k<4;k++){ - src_exp_4src<<>>( + pto::blkv::blkv_for_2d(tileW::ValidRow, tileW::ValidCol, [&] { + src_exp_4src( tExpW[x][4*k].data(), tExpW[x][4*k+1].data(), tExpW[x][4*k+2].data(), tExpW[x][4*k+3].data(), tW[x][4*k].data(), tW[x][4*k+1].data(), tW[x][4*k+2].data(), tW[x][4*k+3].data(), tNewMax[x].data(), scale); + }); } #pragma clang loop unroll(full) for(int k=0;k<4;k++){ - local_sum_4src<<>>(tLocalSum[x][k].data(), tExpW[x][4*k].data(), tExpW[x][4*k+1].data(), tExpW[x][4*k+2].data(), tExpW[x][4*k+3].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + local_sum_4src(tLocalSum[x][k].data(), tExpW[x][4*k].data(), tExpW[x][4*k+1].data(), tExpW[x][4*k+2].data(), tExpW[x][4*k+3].data()); + }); } - new_sum_of_4_loc_sum<<>>(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileSum::ValidRow, [&] { + new_sum_of_4_loc_sum(tNewSum[x].data(), tLocalSum[x][0].data(), tLocalSum[x][1].data(), tLocalSum[x][2].data(), tLocalSum[x][3].data(), tSum[x].data(), tScale[x].data()); + }); } #else #ifdef _2D_UNROLL @@ -978,6 +1018,7 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* 
k_ptr, dtype tileW_left tW_left[Xdim][Ydim]; #pragma clang loop unroll(full) for(int x=0;x::bits == 4) { @@ -1001,19 +1042,21 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype if(j==0){ #pragma clang loop unroll(full) for(int x=0;x<<>>(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data()); + pto::blkv::blkv_for_1d(tileO::ValidRow, [&] { + global_update(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data()); + }); } } // 更新最大值状态 #pragma clang loop unroll(full) for(int x=0;x::bits == 4) { - normalize_with_last_update_nocast<<>>(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + pto::blkv::blkv_for_2d(tileO::ValidRow, tileO::ValidCol, [&] { + normalize_with_last_update_nocast(tO[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + }); TMOV_NORM(tO_cast[x], tO[x]); } else { - normalize_with_last_update<<>>(tO_cast[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + pto::blkv::blkv_for_2d(tileO::ValidRow, tileO::ValidCol, [&] { + normalize_with_last_update(tO_cast[x].data(), tO[x].data(), tPV[x].data(), tScale[x].data(), tSum[x].data()); + }); } } // 写回全局内存 @@ -1046,4 +1093,6 @@ void flash_attention_2d_unroll(dtype* out_ptr, dtype* q_ptr, dtype* k_ptr, dtype } } +#ifdef _UNALIGN_2D_UNROLL #include "fa_unalign_2d_unroll.hpp" +#endif diff --git a/benchmark-ptoisa/kernels/fa/fa_fp4_utils.h b/benchmark-ptoisa/kernels/fa/fa_fp4_utils.h index d2f776b..ecdbd51 100644 --- a/benchmark-ptoisa/kernels/fa/fa_fp4_utils.h +++ b/benchmark-ptoisa/kernels/fa/fa_fp4_utils.h @@ -1,4 +1,9 @@ +#include + +using namespace pto::blkv; +using pto::type_traits; + template concept is_fp4_tile = requires (T tile) { type_traits::bits == 4; diff --git a/benchmark-ptoisa/kernels/fa/fa_utils.h b/benchmark-ptoisa/kernels/fa/fa_utils.h index 5744a1d..59ee045 100644 --- a/benchmark-ptoisa/kernels/fa/fa_utils.h +++ b/benchmark-ptoisa/kernels/fa/fa_utils.h @@ -1,5 +1,9 @@ // FA 
utility functions shared across different implementations +#include + +using namespace pto::blkv; + template void __vec__ flashsoftmax_dn_mout_cast_kernel( typename tileScale::TileDType __out__ rescale, diff --git a/benchmark-ptoisa/kernels/fa/linx_blkc.h b/benchmark-ptoisa/kernels/fa/linx_blkc.h new file mode 100644 index 0000000..df68bc9 --- /dev/null +++ b/benchmark-ptoisa/kernels/fa/linx_blkc.h @@ -0,0 +1,121 @@ +#ifndef PTO_FA_LINX_BLKC_H +#define PTO_FA_LINX_BLKC_H + +#include + +#ifndef __bf16 +#define __bf16 pto_bf16_t +#endif + +using pto::blkv::blkv_fexp; +using pto::blkv::blkv_fsqrt; +using pto::blkv::blkv_get_index_x; +using pto::blkv::blkv_get_index_y; +using pto::blkv::blkv_get_index_z; +using pto::blkv::blkv_get_tile_ptr; +using pto::blkv::blkv_max; + +using __fp4_e1m2x2 = pto::fp4_e2m1_t; +using __bf16x2 = uint32_t; + +template +inline To linx_cvt_as(From value) { + return static_cast(value); +} + +template +inline void linx_cvt(To &dst, From value) { + dst = linx_cvt_as(value); +} + +template +inline void linx_cvt_package(To &dst, A a, B b) { + const uint16_t hi = + static_cast(pto::lowp_word_from_bf16(pto::bf16_t{static_cast(a)}) & + 0xffffu); + const uint16_t lo = + static_cast(pto::lowp_word_from_bf16(pto::bf16_t{static_cast(b)}) & + 0xffffu); + if constexpr (sizeof(To) >= 4) { + dst = static_cast((static_cast(hi) << 16) | lo); + } else { + dst = static_cast(pto::float_to_fp4_e2m1(static_cast(a))); + } +} + +inline void blkv_bf16_fmax(__bf16 &dst, __bf16 a, __bf16 b) { + dst = a < b ? b : a; +} + +inline __bf16 blkv_bf16_max(__bf16 a, __bf16 b) { return a < b ? 
b : a; } +inline __bf16 blkv_bf16_mul(__bf16 a, __bf16 b) { return a * b; } +inline __bf16 blkv_bf16_div(__bf16 a, __bf16 b) { return a / b; } + +inline void blkv_bf16_fadd(__bf16 &dst, __bf16 a, __bf16 b) { dst = a + b; } +inline void blkv_bf16_fsub(__bf16 &dst, __bf16 a, __bf16 b) { dst = a - b; } +inline void blkv_bf16_fmul(__bf16 &dst, __bf16 a, __bf16 b) { dst = a * b; } +inline void blkv_bf16_fdiv(__bf16 &dst, __bf16 a, __bf16 b) { dst = a / b; } +inline void blkv_bf16_fexp(__bf16 &dst, __bf16 a) { + dst = pto::blkv::blkv_fexp(static_cast(a)); +} + +inline float linx_bf16x2_hi(__bf16x2 value) { + return static_cast(pto::bf16_from_lowp_word((value >> 16) & 0xffffu)); +} + +inline float linx_bf16x2_lo(__bf16x2 value) { + return static_cast(pto::bf16_from_lowp_word(value & 0xffffu)); +} + +inline __bf16x2 linx_pack_bf16x2(float hi, float lo) { + return (pto::lowp_word_from_bf16(pto::bf16_t{hi}) << 16) | + pto::lowp_word_from_bf16(pto::bf16_t{lo}); +} + +inline void blkv_bf16x2_fmax(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(pto::blkv::blkv_max(linx_bf16x2_hi(a), linx_bf16x2_hi(b)), + pto::blkv::blkv_max(linx_bf16x2_lo(a), linx_bf16x2_lo(b))); +} + +inline __bf16x2 blkv_bf16x2_mul(__bf16x2 a, __bf16x2 b) { + return linx_pack_bf16x2(linx_bf16x2_hi(a) * linx_bf16x2_hi(b), + linx_bf16x2_lo(a) * linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_mul(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = blkv_bf16x2_mul(a, b); +} + +inline void blkv_bf16x2_fadd(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(linx_bf16x2_hi(a) + linx_bf16x2_hi(b), + linx_bf16x2_lo(a) + linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_fsub(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = linx_pack_bf16x2(linx_bf16x2_hi(a) - linx_bf16x2_hi(b), + linx_bf16x2_lo(a) - linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_fmul(__bf16x2 &dst, __bf16x2 a, __bf16x2 b) { + dst = blkv_bf16x2_mul(a, b); +} + +inline void blkv_bf16x2_fdiv(__bf16x2 &dst, __bf16x2 a, 
__bf16x2 b) { + dst = linx_pack_bf16x2(linx_bf16x2_hi(a) / linx_bf16x2_hi(b), + linx_bf16x2_lo(a) / linx_bf16x2_lo(b)); +} + +inline void blkv_bf16x2_fmsub(__bf16x2 &dst, __bf16x2 a, __bf16x2 b, + __bf16x2 c) { + dst = linx_pack_bf16x2((linx_bf16x2_hi(a) * linx_bf16x2_hi(b)) - + linx_bf16x2_hi(c), + (linx_bf16x2_lo(a) * linx_bf16x2_lo(b)) - + linx_bf16x2_lo(c)); +} + +inline void blkv_bf16x2_fexp(__bf16x2 &dst, __bf16x2 a) { + dst = linx_pack_bf16x2(pto::blkv::blkv_fexp(linx_bf16x2_hi(a)), + pto::blkv::blkv_fexp(linx_bf16x2_lo(a))); +} + +#endif // PTO_FA_LINX_BLKC_H diff --git a/benchmark-ptoisa/kernels/gather/gather.hpp b/benchmark-ptoisa/kernels/gather/gather.hpp new file mode 100644 index 0000000..a19ec69 --- /dev/null +++ b/benchmark-ptoisa/kernels/gather/gather.hpp @@ -0,0 +1,92 @@ +#ifndef SUPERNPUBENCH_PTOISA_GATHER_HPP +#define SUPERNPUBENCH_PTOISA_GATHER_HPP + +#include + +#include +#include + +using namespace pto; + +template +void gather(dtype *in_data_ptr, otype *in_offset_ptr, dtype *out_ptr) { + constexpr size_t kFullRows = gM / tM; + constexpr size_t kTailRows = gM % tM; + constexpr size_t kFullCols = gN / tN; + constexpr size_t kTailCols = gN % tN; + + using InputGlobal = global_tensor>; + using OffsetGlobal = global_tensor>; + using OutputGlobal = global_tensor>; + + using OffsetTile = + Tile; + using DataTile = Tile; + using OffsetIterator = global_iterator; + using OutputIterator = global_iterator; + + InputGlobal input_global(in_data_ptr); + OffsetIterator offset_iter(in_offset_ptr); + OutputIterator output_iter(out_ptr); + + for (size_t row_tile = 0; row_tile < kFullRows; ++row_tile) { + OffsetTile row_index; + TLOAD(row_index, offset_iter(0, static_cast(row_tile))); + for (size_t col_tile = 0; col_tile < kFullCols; ++col_tile) { + DataTile out_tile; + InputGlobal adjusted_input(in_data_ptr + col_tile * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(output_iter(static_cast(row_tile), + static_cast(col_tile)), + out_tile); + } + 
if constexpr (kTailCols != 0) { + using TailColTile = Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailColTile out_tile; + InputGlobal adjusted_input(in_data_ptr + kFullCols * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(tail_output_iter(static_cast(row_tile), + static_cast(kFullCols)), + out_tile); + } + } + + if constexpr (kTailRows != 0) { + using TailOffsetTile = + Tile; + using TailOffsetIterator = global_iterator; + TailOffsetIterator tail_offset_iter(in_offset_ptr); + TailOffsetTile row_index; + TLOAD(row_index, tail_offset_iter(0, static_cast(kFullRows))); + for (size_t col_tile = 0; col_tile < kFullCols; ++col_tile) { + using TailRowTile = Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailRowTile out_tile; + InputGlobal adjusted_input(in_data_ptr + col_tile * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(tail_output_iter(static_cast(kFullRows), + static_cast(col_tile)), + out_tile); + } + if constexpr (kTailCols != 0) { + using TailTile = Tile; + using TailOutputIterator = global_iterator; + TailOutputIterator tail_output_iter(out_ptr); + TailTile out_tile; + InputGlobal adjusted_input(in_data_ptr + kFullCols * tN); + MGATHER(out_tile, adjusted_input, row_index); + TSTORE(tail_output_iter(static_cast(kFullRows), + static_cast(kFullCols)), + out_tile); + } + } +} + +#endif // SUPERNPUBENCH_PTOISA_GATHER_HPP diff --git a/benchmark-ptoisa/kernels/matmul/matmul.hpp b/benchmark-ptoisa/kernels/matmul/matmul.hpp index a3671c7..14f574e 100644 --- a/benchmark-ptoisa/kernels/matmul/matmul.hpp +++ b/benchmark-ptoisa/kernels/matmul/matmul.hpp @@ -9,7 +9,12 @@ using namespace pto; -template +enum class PadValue { Null }; +enum class LayoutCvtEnum { ND2ZZ, ND2NN }; +template +void blk_tload(Args...); + +template void TCOPYOUT_ACC(GmOut &Gout, TileAcc &tAcc){ using TileAccOut = Tile; TileAccOut tAccOut; @@ -17,7 +22,7 @@ void 
TCOPYOUT_ACC(GmOut &Gout, TileAcc &tAcc){ TCOPYOUT(Gout, tAccOut); } -template +template void TCOPYOUT_ACC_DYNAMIC(GmOut &Gout, TileAcc &tAcc, size_t valid_row, size_t valid_col){ using TileAccOut = Tile; TileAccOut tAccOut(valid_row, valid_col); diff --git a/benchmark-ptoisa/kernels/matmul/matmul_mx.hpp b/benchmark-ptoisa/kernels/matmul/matmul_mx.hpp index b26e87f..d45b12b 100644 --- a/benchmark-ptoisa/kernels/matmul/matmul_mx.hpp +++ b/benchmark-ptoisa/kernels/matmul/matmul_mx.hpp @@ -1399,7 +1399,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { MATMUL(tACC, tA, tB); TCVT(tACCin, tACC);//[tM, tN] 256->1 , 256 -> 2 scaling factor // static_assert(tile_shapeB::ValidCol % (width_factor*128) == 0); // TODO, 暂不考虑padding,假设形状是规整的, 方便处理, taccin*ts_adder=tc_dequant - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_shapeACC::ValidRow, tile_shapeACC::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } k++; @@ -1419,7 +1421,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { MATMUL(tACC, tA, tB); TCVT(tACCin, tACC); - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_shapeACC::ValidRow, tile_shapeACC::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } TCOPYOUT(gACC, tAdder[(k+1)%2]); @@ -1449,7 +1453,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { TCOPYIN(ts, gS); MATMUL(tACC, tA, tB); TCVT(tACCin, tACC); - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_ACCin_tcols::ValidRow, tile_ACCin_tcols::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } k++; 
@@ -1467,7 +1473,9 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { TCOPYIN(ts, gS); MATMUL(tACC, tA, tB); TCVT(tACCin, tACC); - dequant_acc<<>>(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + pto::blkv::blkv_for_2d(tile_shapeACC::ValidRow, tile_shapeACC::ValidCol, [&] { + dequant_acc(tACCin.data(), ts.data(), tAdder[k%2].data(), tC_dequant.data()); + }); tAdder[(k+1)%2] = tC_dequant; } TCOPYOUT(gACC, tAdder[(k+1)%2]); @@ -1518,4 +1526,4 @@ void matmul_mp(float *acc_ptr, dtypeA *a_ptr, dtypeB *b_ptr, float *c_ptr) { } -#endif \ No newline at end of file +#endif diff --git a/benchmark-ptoisa/kernels/reduction/cumsum_colvec.hpp b/benchmark-ptoisa/kernels/reduction/cumsum_colvec.hpp index 7c51237..a8219aa 100644 --- a/benchmark-ptoisa/kernels/reduction/cumsum_colvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/cumsum_colvec.hpp @@ -2,7 +2,6 @@ #define CUMSUMCOL_KERNEL_HPP #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/cumsum_rowvec.hpp b/benchmark-ptoisa/kernels/reduction/cumsum_rowvec.hpp index 38e0d04..9be3b11 100644 --- a/benchmark-ptoisa/kernels/reduction/cumsum_rowvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/cumsum_rowvec.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducemax_colvec.hpp b/benchmark-ptoisa/kernels/reduction/reducemax_colvec.hpp index 4590af3..0c4dc7d 100644 --- a/benchmark-ptoisa/kernels/reduction/reducemax_colvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducemax_colvec.hpp @@ -2,7 +2,6 @@ #define REDUCEMAXCOLVEC_KERNEL_HPP #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp b/benchmark-ptoisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp index fa8c4bb..04fa543 100644 --- a/benchmark-ptoisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp +++ 
b/benchmark-ptoisa/kernels/reduction/reducemax_colvec_unalign_120_8.hpp @@ -2,7 +2,6 @@ #define REDUCEMAXCOLVEC_KERNEL_HPP #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducemax_rowvec.hpp b/benchmark-ptoisa/kernels/reduction/reducemax_rowvec.hpp index 0214ea8..cb5b23e 100644 --- a/benchmark-ptoisa/kernels/reduction/reducemax_rowvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducemax_rowvec.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducemax_rowvec_single_tree.hpp b/benchmark-ptoisa/kernels/reduction/reducemax_rowvec_single_tree.hpp index 4e2ad79..8eae796 100644 --- a/benchmark-ptoisa/kernels/reduction/reducemax_rowvec_single_tree.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducemax_rowvec_single_tree.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reduceprod_colvec.hpp b/benchmark-ptoisa/kernels/reduction/reduceprod_colvec.hpp index 1e6e825..a5b1d4b 100644 --- a/benchmark-ptoisa/kernels/reduction/reduceprod_colvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/reduceprod_colvec.hpp @@ -2,7 +2,6 @@ #define REDUCEPRODCOLVEC_KERNEL_HPP #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reduceprod_rowvec.hpp b/benchmark-ptoisa/kernels/reduction/reduceprod_rowvec.hpp index 2470d91..274021b 100644 --- a/benchmark-ptoisa/kernels/reduction/reduceprod_rowvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/reduceprod_rowvec.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducesum_colvec.hpp b/benchmark-ptoisa/kernels/reduction/reducesum_colvec.hpp index aad42cc..31e54a0 100644 --- a/benchmark-ptoisa/kernels/reduction/reducesum_colvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducesum_colvec.hpp @@ -2,7 +2,6 @@ #define REDUCESUMCOLVEC_KERNEL_HPP #include -#include #include #include @@ 
-58,7 +57,7 @@ void reducesum_colsum_rand( for (int j = 0; j < Nb; ++j) { auto gO = gOIter(0, j); - TEXPANDS(oldSumTile, static_cast(0)); + TEXPANDS(oldSumTile, static_cast(0.0f)); for (int i = 0; i < Mb; ++i) { auto gI = gIIter(i, j); diff --git a/benchmark-ptoisa/kernels/reduction/reducesum_colvec_single_tree.hpp b/benchmark-ptoisa/kernels/reduction/reducesum_colvec_single_tree.hpp index cb48266..19dbd18 100644 --- a/benchmark-ptoisa/kernels/reduction/reducesum_colvec_single_tree.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducesum_colvec_single_tree.hpp @@ -2,7 +2,6 @@ #define REDUCESUMCOLVEC_KERNEL_HPP #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp b/benchmark-ptoisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp index 4e22449..b7015c2 100644 --- a/benchmark-ptoisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducesum_colvec_unalign_120_8.hpp @@ -2,7 +2,6 @@ #define REDUCESUMCOLVEC_KERNEL_HPP #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducesum_rowvec.hpp b/benchmark-ptoisa/kernels/reduction/reducesum_rowvec.hpp index 8d33a96..401f873 100644 --- a/benchmark-ptoisa/kernels/reduction/reducesum_rowvec.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducesum_rowvec.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/reduction/reducesum_rowvec_single_tree.hpp b/benchmark-ptoisa/kernels/reduction/reducesum_rowvec_single_tree.hpp index 58d26f9..60ddc02 100644 --- a/benchmark-ptoisa/kernels/reduction/reducesum_rowvec_single_tree.hpp +++ b/benchmark-ptoisa/kernels/reduction/reducesum_rowvec_single_tree.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include diff --git a/benchmark-ptoisa/kernels/sort/topk.hpp b/benchmark-ptoisa/kernels/sort/topk.hpp index 3bba98b..d9ac6b9 100644 --- a/benchmark-ptoisa/kernels/sort/topk.hpp +++ 
b/benchmark-ptoisa/kernels/sort/topk.hpp @@ -1,9 +1,12 @@ #ifndef TOPK_HPP #define TOPK_HPP +#include #include #include +using namespace pto::blkv; + // ============================================================================ // Constants // ============================================================================ @@ -82,15 +85,18 @@ void __vec__ ExtractLow8HistForKthBin_Vec_RowMajor( template void ExtractHigh8Hist_Impl(tile_shape_out& dst, const uint16_t* src) { - ExtractHigh8Hist_Vec_RowMajor - <<<1, 256, 1>>>(dst.data(), src); + pto::blkv::blkv_for_2d(1, 256, [&] { + ExtractHigh8Hist_Vec_RowMajor(dst.data(), src); + }); } template void ExtractLow8HistForKthBin_Impl(tile_shape_out& dst, const uint16_t* src, uint16_t kth_bin) { - ExtractLow8HistForKthBin_Vec_RowMajor - <<<1, 256, 1>>>(dst.data(), src, kth_bin); + pto::blkv::blkv_for_2d(1, 256, [&] { + ExtractLow8HistForKthBin_Vec_RowMajor(dst.data(), src, + kth_bin); + }); } // ============================================================================ diff --git a/benchmark-ptoisa/kernels/transpose/transpose.hpp b/benchmark-ptoisa/kernels/transpose/transpose.hpp index 32f5a05..f23f44a 100644 --- a/benchmark-ptoisa/kernels/transpose/transpose.hpp +++ b/benchmark-ptoisa/kernels/transpose/transpose.hpp @@ -411,4 +411,18 @@ void tile_transpose_2d(DType *input, DType *output) { } // namespace supernpu::tile_isa +template +void transpose(DType *input, DType *output, std::uint32_t *input_shape, + std::uint32_t *output_shape) { + static_assert(gIM == gOM, "transpose preserves element count"); + static_assert(IN_DIM == OUT_DIM, "transpose rank must be preserved"); + (void)MAX_DIM; + supernpu::tile_isa::tile_transpose_nd< + DType, static_cast(IN_DIM), static_cast(TRANSPOSE_DIM0), + static_cast(TRANSPOSE_DIM1), gOM, tM>(input, output, input_shape, + output_shape); +} + #endif diff --git a/benchmark-ptoisa/kernels/transpose/transpose_vector_007.hpp b/benchmark-ptoisa/kernels/transpose/transpose_vector_007.hpp index 
8fc66e9..09df3a6 100644 --- a/benchmark-ptoisa/kernels/transpose/transpose_vector_007.hpp +++ b/benchmark-ptoisa/kernels/transpose/transpose_vector_007.hpp @@ -15,7 +15,6 @@ #define TRANSPOSE_VECTOR_007_TILE_HPP #include -#include #include using namespace pto; diff --git a/benchmark-ptoisa/kernels/transpose/transpose_vector_050.hpp b/benchmark-ptoisa/kernels/transpose/transpose_vector_050.hpp index 0d543f3..b4e01ca 100644 --- a/benchmark-ptoisa/kernels/transpose/transpose_vector_050.hpp +++ b/benchmark-ptoisa/kernels/transpose/transpose_vector_050.hpp @@ -23,7 +23,6 @@ #define TRANSPOSE_VECTOR_050_TILE_HPP #include -#include #include using namespace pto; diff --git a/benchmark-ptoisa/kernels/utils/layout_transform.hpp b/benchmark-ptoisa/kernels/utils/layout_transform.hpp index 3e07196..62b40f8 100644 --- a/benchmark-ptoisa/kernels/utils/layout_transform.hpp +++ b/benchmark-ptoisa/kernels/utils/layout_transform.hpp @@ -1,3 +1,9 @@ +#include + +using namespace pto::blkv; +using pto::is_global_data_v; +using pto::is_tile_data_v; + template void __vec__ gen_offset_ND2ZZ( typename tile_shape::TileDType __out__ out, @@ -98,8 +104,12 @@ void gen_ND2ZZ_offset_Impl( const uint32_t j) { static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, "Only static shape supported"); - // gen_offset_ND2ZZ<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); - gen_offset_ND2ZZ<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); + pto::blkv::blkv_for_1d(tl_tensor::ValidCol, [&] { + gen_offset_ND2ZZ(offset.data(), glb_tensor::ColStride, + glb_tensor::RowStride, + tl_tensor::ValidRow, + tl_tensor::ValidCol, i, j); + }); } template @@ -111,7 +121,9 @@ void gen_ND2NN_offset_Impl( const uint32_t j) { static_assert(tile_shapeOffset::ValidRow != -1 && tile_shapeOffset::ValidCol != -1, "Only static shape supported"); - // 
gen_offset_ND2NN<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); - // gen_offset_ND2NN<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); - gen_offset_ND2NN_new<<>>(offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); -} \ No newline at end of file + pto::blkv::blkv_for_1d(tl_tensor::ValidRow, [&] { + gen_offset_ND2NN_new( + offset.data(), glb_tensor::ColStride, glb_tensor::RowStride, + tl_tensor::ValidRow, tl_tensor::ValidCol, i, j); + }); +} diff --git a/benchmark-ptoisa/test/common/Makefile.common b/benchmark-ptoisa/test/common/Makefile.common index 35caf4d..6b97f6b 100644 --- a/benchmark-ptoisa/test/common/Makefile.common +++ b/benchmark-ptoisa/test/common/Makefile.common @@ -1,8 +1,11 @@ ROOT := $(shell echo $(CURDIR) | sed -E 's@(.*)/(benchmark-linxisa|benchmark-ptoisa)/.*@\1/\2@') +LINX_ISA_ROOT ?= $(shell cd $(ROOT)/../../.. && pwd) +LINX_SYSROOT ?= $(LINX_ISA_ROOT)/out/libc/musl/install/phase-b +PTO_KERNELS_INCLUDE ?= $(LINX_ISA_ROOT)/workloads/pto_kernels/include TEST_ROOT := $(shell echo $(CURDIR) | sed -E 's@(.*)/(benchmark-linxisa|benchmark-ptoisa)/test/.*@\1/\2/test@') CATEGORY := $(shell echo $(CURDIR) | sed -E 's@.*/(benchmark-linxisa|benchmark-ptoisa)/test/(.*)@\2@') CATEGORY_NAME := $(shell echo $(CATEGORY) | sed -e 's/\//_/g') -OBJ_ROOT := $(shell realpath $(TEST_ROOT)/../output) +OBJ_ROOT := $(abspath $(TEST_ROOT)/../output) CASE_SRC_DIR := $(CATEGORY)/src ELF_DIR := $(OBJ_ROOT)/$(CATEGORY)/elf SRC_DIR := $(shell dirname $(SRC_FILE)) @@ -51,7 +54,7 @@ CXX_VER ?= -std=c++20 else ifeq ($(PLAT), linx) DEFINES += -D__linx ifndef COMPILER_DIR -$(error COMPILER_DIR is not set. Export COMPILER_DIR pointing to the linx_blockisa_llvm_musl toolchain bin directory, e.g. export COMPILER_DIR=/path/to/linx_blockisa_llvm_musl/bin) +$(error COMPILER_DIR is not set. 
Export COMPILER_DIR pointing to the Linx Clang bin directory) endif AS = $(COMPILER_DIR)/clang CC = $(COMPILER_DIR)/clang @@ -59,17 +62,16 @@ CXX = $(COMPILER_DIR)/clang++ LINK = $(COMPILER_DIR)/clang++ DUMP = $(COMPILER_DIR)/llvm-objdump COPY = $(COMPILER_DIR)/llvm-objcopy -CC_O = -c -mlxbc -fenable-matrix -O2 -mllvm -enable-all-vector-as-tilereg=true $(CFLAGS) +LINX_TARGET ?= linx64-unknown-linux-musl +LINX_CFLAGS ?= -mlxbc --target=$(LINX_TARGET) -fenable-matrix -O2 +LINX_LDFLAGS ?= -nostdlib++ -unwindlib=none +ifneq ($(wildcard $(LINX_SYSROOT)/usr/include/bits/alltypes.h),) +LINX_CFLAGS += --sysroot=$(LINX_SYSROOT) -idirafter $(LINX_SYSROOT)/usr/include -D_GNU_SOURCE +CC_LINK += --sysroot=$(LINX_SYSROOT) +endif +CC_O = -c $(LINX_CFLAGS) $(CFLAGS) +CC_LINK += $(LINX_CFLAGS) $(LINX_LDFLAGS) CXX_VER ?= -std=c++20 - ifneq ($(CC_OPT), default) - CC_O += -mllvm -linxv5-enable-HL-Inst-Opt=true \ - -mllvm -linxv5-enable-dim-opt=true \ - -mllvm -linxv5-enable-ldst-bridge=false \ - -mllvm -linxv5-enable-continuous-mem-opt=true \ - -mllvm -linxv5-enable-tile-clock-hand=false \ - -mllvm -linxv5-enable-simt-clock-hand=true \ - -mllvm -enable-misched=false - endif ifeq ($(baremetal), on) CC_LINK += -static -lm -nostartfiles -L $(OBJ_ROOT)/$(COMM_SRC_DIR) -T $(LINK_SCRIPT) @@ -92,7 +94,7 @@ CC_O += -fPIC CC_LINK += -shared endif -INCLUDE += -I$(ROOT)/include -I$(ROOT)/test/common -I$(ROOT)/test/common/src -I$(ROOT)/kernels -I$(ROOT)/models +INCLUDE += -I$(ROOT)/include -I$(ROOT)/test -I$(ROOT)/test/common -I$(ROOT)/test/common/src -I$(ROOT)/kernels -I$(ROOT)/models -I$(PTO_KERNELS_INCLUDE) QEMU ?= /remote/lms60/c00622284/qemu/LinxBlockModel/build/qemu-linx CC_O_ALL = $(CC_O) $(CC_OPTS) diff --git a/benchmark-ptoisa/test/common/_start.s b/benchmark-ptoisa/test/common/_start.s index 150e10c..654d9bb 100644 --- a/benchmark-ptoisa/test/common/_start.s +++ b/benchmark-ptoisa/test/common/_start.s @@ -5,7 +5,7 @@ _start: bstart.std call main c.setret 2, ->ra _end: - bstart.aux fall 
+ bstart.sys fall addi zero, 0x5e, ->x1 acrc 1 - c.bstop \ No newline at end of file + c.bstop diff --git a/benchmark-ptoisa/test/common/src/benchmark_boot_linx.s b/benchmark-ptoisa/test/common/src/benchmark_boot_linx.s index 4cae8da..146c0f5 100644 --- a/benchmark-ptoisa/test/common/src/benchmark_boot_linx.s +++ b/benchmark-ptoisa/test/common/src/benchmark_boot_linx.s @@ -7,7 +7,7 @@ _start: bstart.std call _linx_start c.setret 2, ->ra _end: - bstart.aux fall + bstart.sys fall addi zero, 0x5e, ->x1 acrc 1 - c.bstop \ No newline at end of file + c.bstop diff --git a/benchmark-ptoisa/test/common/template_asm.h b/benchmark-ptoisa/test/common/template_asm.h index 2f60707..8f1e0d6 100644 --- a/benchmark-ptoisa/test/common/template_asm.h +++ b/benchmark-ptoisa/test/common/template_asm.h @@ -3,78 +3,66 @@ #include +using namespace pto; + +template +struct pto_v057_tile_alloc { + static constexpr unsigned SizeCode = tile_type_traits::TilesizeCode; + static_assert(SizeCode >= 3, "v0.57 B.OTA allocation must be at least one 128-byte CELL"); + static constexpr unsigned CellCountM1 = (1u << (SizeCode - 3u)) - 1u; +}; + template void MGATHER(tile_shape_out &dst, gm_shape &src, tile_shape_offset &offset) { - asm volatile( - "BSTART.TMA 4, %c[DataType]\n" - "B.DIM zero, %c[VCOL], ->lb0\n" - "B.DIM zero, %c[VROW], ->lb1\n" - "B.IOT [%[s1]], last, ->%[d0]<%c[TileSize]>\n" - "B.IOR [%[s0]], []\n" - : [d0]"=Tr"(dst.data()) - : [s0]"r"(src.data()), - [s1]"Tr"(offset.data()), - [DataType]"i"(type_traits::TypeCode), - [TileSize]"i"(tile_type_traits::TilesizeCode), - [VCOL]"i"(tile_shape_offset::ValidCol), [VROW]"i"(tile_shape_offset::ValidRow) - ); + pto::MGATHER(dst, src, offset); } template void MSCATTER(gm_shape &dst, tile_shape_in &src, tile_shape_offset &offset) { - asm volatile( - "BSTART.TMA 5, %c[SrcType]\n" - "B.DIM zero, %c[VCOL], ->lb0\n" - "B.DIM zero, %c[VROW], ->lb1\n" - "B.IOT [%[s0], %[s1]], last\n" - "B.IOR [%[d0]], []\n" - : - : [d0]"r"(dst.data()), 
[s0]"Tr"(src.data()), - [s1]"Tr"(offset.data()), - [SrcType]"i"(type_traits::TypeCode), - [VCOL]"i"(tile_shape_offset::ValidCol), [VROW]"i"(tile_shape_offset::ValidRow) - ); + pto::MSCATTER(dst, src, offset); } template void TMAX_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 11, %c1\n" + "BSTART.TEPL 0x25, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TMULS_TEPL(tile_shape &dst, tile_shape &src0, typename tile_shape::DType s) { asm volatile( - "BSTART.TEPL 0b0100010, %c1\n" + "BSTART.TEPL 0x2B, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "B.IOR [%7],[]\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "i"(tile_type_traits::TilesizeCode), + "Tr"(src0.raw()), + "i"(pto_v057_tile_alloc::CellCountM1), "r"(s) ); } @@ -82,183 +70,191 @@ void TMULS_TEPL(tile_shape &dst, tile_shape &src0, typename tile_shape::DType s) template void TROWMAX_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1000001, %c1\n" + "BSTART.TEPL 0x47, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), 
"i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TSUB_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 1, %c1\n" + "BSTART.TEPL 0x55, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TEXP_TEPL(tile_shape &dst, tile_shape &src) { asm volatile( - "BSTART.TEPL 18, %c1\n" + "BSTART.TEPL 0x1C, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TMUL_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 2, %c1\n" + "BSTART.TEPL 0x2A, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + 
"Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TROWSUM_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1000000, %c1\n" + "BSTART.TEPL 0x4A, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TADD_TEPL(tile_shape &dst, tile_shape &src0, tile_shape &src1) { asm volatile( - "BSTART.TEPL 0, %c1\n" + "BSTART.TEPL 0x01, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TRECIP_TEPL(tile_shape &dst, tile_shape &src) { asm volatile( - "BSTART.TEPL 20, %c1\n" + "BSTART.TEPL 0x39, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCAST_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - 
"BSTART.TEPL 27, %c1\n" + "BSTART.TEPL 0x19, %c1\n" "B.DATR %c2, RNONE\n" "C.B.DIMI %c3, ->LB0\n" "C.B.DIMI %c4, ->LB1\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TEXPANDSCALAR_TEPL(tile_shape &dst, typename tile_shape::DType s) { asm volatile( - "BSTART.TEPL 0b0111011, %c1\n" + "BSTART.TEPL 0x1D, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [], last, ->%0<%c5>\n" + "B.OTA ->%0<%c5>, last, 0\n" "B.IOR [%6],[]\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape::ValidCol), "i"(tile_shape::ValidRow), "i"(tile_shape::Cols), - "i"(tile_type_traits::TilesizeCode), + "i"(pto_v057_tile_alloc::CellCountM1), "r"(s) ); } @@ -266,101 +262,106 @@ void TEXPANDSCALAR_TEPL(tile_shape &dst, typename tile_shape::DType s) { template void TROWEXPAND_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1000100, %c1\n" + "BSTART.TEPL 0x3F, %c1\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLMAX_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1010001, %c1\n" + "BSTART.TEPL 0x15, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], 
last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLSUM_TEPL(tile_shape_out &dst, tile_shape_in &src) { asm volatile( - "BSTART.TEPL 0b1010000, %c1\n" + "BSTART.TEPL 0x18, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5], last, ->%0<%c6>\n" + "B.ITP [%5], 0\n" + "B.OTA ->%0<%c6>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_in::ValidCol), "i"(tile_shape_in::ValidRow), "i"(tile_shape_in::Cols), - "Tr"(src.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLEXPANDSUB_TEPL(tile_shape_out &dst, tile_shape_out &src0, tile_shape_in &src1) { asm volatile( - "BSTART.TEPL 0b1010110, %c1\n" + "BSTART.TEPL 0x14, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" - : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_out::ValidCol), "i"(tile_shape_out::ValidRow), "i"(tile_shape_out::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } template void TCOLEXPANDMUL_TEPL(tile_shape_out &dst, tile_shape_out &src0, tile_shape_in &src1) { asm volatile( - "BSTART.TEPL 0b1010111, %c1\n" + "BSTART.TEPL 0x13, %c1\n" "B.DATR Null\n" "C.B.DIMI %c2, ->LB0\n" "C.B.DIMI %c3, ->LB1\n" "C.B.DIMI %c4, ->LB2\n" - "B.IOT [%5, %6], last, ->%0<%c7>\n" + "B.ITP [%5, %6], 0\n" + "B.OTA ->%0<%c7>, last, 0\n" "" 
- : "=Tr"(dst.data()) + : "=Tr"(dst.raw()) : "i"(type_traits::TypeCode), "i"(tile_shape_out::ValidCol), "i"(tile_shape_out::ValidRow), "i"(tile_shape_out::Cols), - "Tr"(src0.data()), - "Tr"(src1.data()), - "i"(tile_type_traits::TilesizeCode) + "Tr"(src0.raw()), + "Tr"(src1.raw()), + "i"(pto_v057_tile_alloc::CellCountM1) ); } diff --git a/benchmark-ptoisa/test/kernel/control/compile.all b/benchmark-ptoisa/test/kernel/control/compile.all index 6ec69f0..b4344c8 100755 --- a/benchmark-ptoisa/test/kernel/control/compile.all +++ b/benchmark-ptoisa/test/kernel/control/compile.all @@ -5,7 +5,7 @@ for debug in on off; do else debug_define="-DFOR_GFSIM" fi - for num_col in 256 512 1024; do + for num_col in 256 512; do make TESTCASE=hashtable_lookup_simd SUFFIX=_kNum6144_kMaxProbe512_knum_col${num_col}_debug_${debug} EXTRA_DEFINES="-DkNum=6144 -DMAX_PROBE=512 -DNUM_COL=${num_col} ${debug_define}" diss done done diff --git a/benchmark-ptoisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh b/benchmark-ptoisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh index 2668212..234f7b4 100755 --- a/benchmark-ptoisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh +++ b/benchmark-ptoisa/test/kernel/control/hashtable_lookup_simd/data_obj/build_data_obj.sh @@ -1,5 +1,5 @@ #!/bin/bash -COMPILER_DIR="${COMPILER_DIR:-/remote/lms60/c00622284/janus/linxisa_compiler_v0.55/linx_blockisa_llvm_musl/bin}" +: "${COMPILER_DIR:?COMPILER_DIR must point to the Linx Clang bin directory}" DATA_OBJ_DIR="$1" OUTPUT_DIR="$2" @@ -26,11 +26,11 @@ _binary_${name}_data_end: .equ _binary_${name}_data_size, .-_binary_${name}_data_start EOF - $COMPILER_DIR/clang++ -target linx64v5 -c "$asm_file" -o "$obj_file" + "$COMPILER_DIR/clang++" -mlxbc -c "$asm_file" -o "$obj_file" } build_one "inserted_slot" build_one "lookup_keys" build_one "lookup_values" -echo "Done building data object files" \ No newline at end of file +echo "Done building data object files" 
diff --git a/benchmark-ptoisa/test/kernel/control/hkv/data_obj/build_data_obj.sh b/benchmark-ptoisa/test/kernel/control/hkv/data_obj/build_data_obj.sh index ec4819c..f0c7423 100755 --- a/benchmark-ptoisa/test/kernel/control/hkv/data_obj/build_data_obj.sh +++ b/benchmark-ptoisa/test/kernel/control/hkv/data_obj/build_data_obj.sh @@ -1,5 +1,5 @@ #!/bin/bash -COMPILER_DIR="${COMPILER_DIR:-/remote/lms01/j00827727/jcore/compilers/linx_blockisa_llvm_musl0.56.16/bin}" +: "${COMPILER_DIR:?COMPILER_DIR must point to the Linx Clang bin directory}" DATA_OBJ_DIR="$1" OUTPUT_DIR="$2" @@ -28,7 +28,7 @@ _binary_${sym_name}_end: .equ _binary_${sym_name}_size, .-_binary_${sym_name}_start EOF - $COMPILER_DIR/clang++ -target linx64v5 -c "$asm_file" -o "$obj_file" + "$COMPILER_DIR/clang++" -mlxbc -c "$asm_file" -o "$obj_file" } build_one "buckets.bin" diff --git a/benchmark-ptoisa/test/kernel/element_wise/gelu/compile.all b/benchmark-ptoisa/test/kernel/element_wise/gelu/compile.all index 84d901c..b1f3da1 100644 --- a/benchmark-ptoisa/test/kernel/element_wise/gelu/compile.all +++ b/benchmark-ptoisa/test/kernel/element_wise/gelu/compile.all @@ -7,7 +7,7 @@ # BF16, exact mode # ============================================ # 3D shape: (24, 8, 1024) -make TESTCASE=gelu DTYPE=__bf16 tMs=2048 gMs=24*8*1024 \ +make TESTCASE=gelu DTYPE=pto_bf16_t tMs=2048 gMs=24*8*1024 \ SHAPE_NAME=24_8_1024 Approximate=false diss # # 2D shape: (128, 1024) diff --git a/benchmark-ptoisa/test/kernel/fa/Makefile b/benchmark-ptoisa/test/kernel/fa/Makefile index c9b5c73..4b00c5b 100644 --- a/benchmark-ptoisa/test/kernel/fa/Makefile +++ b/benchmark-ptoisa/test/kernel/fa/Makefile @@ -14,12 +14,16 @@ ifeq ($(TESTCASE), fa_HIF4_HIF4) Tk = 128 X = 2 Y = 4 + qD = 128 + vD = 128 DEFINES += -DTsq=$(Sq) DEFINES += -DTskv=$(Skv) DEFINES += -DTm=$(Tm) DEFINES += -DTk=$(Tk) DEFINES += -DXdim=$(X) DEFINES += -DYdim=$(Y) + DEFINES += -DTqD=$(qD) + DEFINES += -DTvD=$(vD) ifneq ($(MODE), ) DEFINES += -D$(MODE) endif @@ -44,6 +48,8 
@@ ifeq ($(TESTCASE), fa_2d_unroll) Tk = 128 X = 1 Y = 1 + qD = 128 + vD = 128 ifneq ($(X), ) X = $(X) endif @@ -56,7 +62,9 @@ ifeq ($(TESTCASE), fa_2d_unroll) DEFINES += -DTk=$(Tk) DEFINES += -DXdim=$(X) DEFINES += -DYdim=$(Y) - TARGET = $(ELF_HEAD)/$(TESTCASE)_Sq$(Sq)_Skv$(Skv)_Tm$(Tm)_Tk$(Tk)_X$(X)_Y$(Y).elf + DEFINES += -DTqD=$(qD) + DEFINES += -DTvD=$(vD) + TARGET = $(ELF_HEAD)/$(TESTCASE)_Sq$(Sq)_Skv$(Skv)_qD$(qD)_vD$(vD)_Tm$(Tm)_Tk$(Tk)_X$(X)_Y$(Y).elf endif include ../../common/Makefile.common diff --git a/benchmark-ptoisa/test/kernel/fa/compile.all b/benchmark-ptoisa/test/kernel/fa/compile.all index 52c7464..cad6bdf 100644 --- a/benchmark-ptoisa/test/kernel/fa/compile.all +++ b/benchmark-ptoisa/test/kernel/fa/compile.all @@ -5,24 +5,25 @@ # fa_2d_unroll: 2D unrolled flash attention # ============================================ # X=1, Y=2 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=2 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=1 Y=2 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=2 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=2 # X=1, Y=4 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=4 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=1 Y=4 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=4 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=4 # X=2, Y=2 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=2 Y=2 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=2 Y=2 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=2 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=2 # X=2, Y=4 -make TESTCASE=fa_2d_unroll Sq=256 Skv=512 Tm=128 Tk=128 X=2 Y=4 -make TESTCASE=fa_2d_unroll Sq=512 Skv=512 Tm=128 Tk=128 X=2 Y=4 +make TESTCASE=fa_2d_unroll Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=2 Y=4 +make TESTCASE=fa_2d_unroll Sq=512 Skv=512 qD=32 vD=32 Tm=32 
Tk=32 X=2 Y=4 # ============================================ # fa_HIF4_HIF4: HIF4 quantized flash attention +# Requires block-vector launch migration before it is v0.57-active. # ============================================ -make TESTCASE=fa_HIF4_HIF4 MODE=BF16_NOGATHER Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=1 +# make TESTCASE=fa_HIF4_HIF4 MODE=BF16_NOGATHER Sq=256 Skv=512 qD=32 vD=32 Tm=32 Tk=32 X=1 Y=1 # make TESTCASE=fa_HIF4_HIF4 MODE=BF16x2 Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=1 # make TESTCASE=fa_HIF4_HIF4 MODE=BF16x2_NOGATHER Sq=256 Skv=512 Tm=128 Tk=128 X=1 Y=1 diff --git a/benchmark-ptoisa/test/kernel/fa/src/fa_2d_unroll.cpp b/benchmark-ptoisa/test/kernel/fa/src/fa_2d_unroll.cpp index d57e0e2..b7024b0 100644 --- a/benchmark-ptoisa/test/kernel/fa/src/fa_2d_unroll.cpp +++ b/benchmark-ptoisa/test/kernel/fa/src/fa_2d_unroll.cpp @@ -18,8 +18,17 @@ #define Skv Tskv #endif +#ifndef TqD #define qD 128 +#else +#define qD TqD +#endif + +#ifndef TvD #define vD 128 +#else +#define vD TvD +#endif #ifndef Tm #define kTm 128 diff --git a/benchmark-ptoisa/test/kernel/gather/compile.all b/benchmark-ptoisa/test/kernel/gather/compile.all index f891426..588626f 100755 --- a/benchmark-ptoisa/test/kernel/gather/compile.all +++ b/benchmark-ptoisa/test/kernel/gather/compile.all @@ -25,4 +25,4 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/outpu # Power-of-2 dimensions make TESTCASE=gather COMPILER_DIR=$COMPILER_DIR DType=__fp32 OType=uint32_t \ gKs=131072 gMs=32 gNs=256 \ - tMs=32 tNs=128 + tMs=32 tNs=32 diff --git a/benchmark-ptoisa/test/kernel/matmul/compile.all b/benchmark-ptoisa/test/kernel/matmul/compile.all index f48dbfb..3f158c3 100755 --- a/benchmark-ptoisa/test/kernel/matmul/compile.all +++ b/benchmark-ptoisa/test/kernel/matmul/compile.all @@ -23,39 +23,39 @@ # ============================================ # HIF4_HIF4: MX_NOGATHER variant - 2 configs # ============================================ -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=256 
N=2048 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=512 N=1280 K=4096 tM=128 tN=128 tK=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=256 N=2048 K=2048 tM=32 tN=32 tK=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER M=512 N=1280 K=4096 tM=32 tN=32 tK=128 # ============================================ # HIF4_HIF4: MX_NOGATHER_REUSEA variant - 2 configs # ============================================ -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=256 N=2048 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=512 N=1280 K=4096 tM=128 tN=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=256 N=2048 K=2048 tM=32 tN=32 tK=128 +make TESTCASE=matmul TYPE=HIF4_HIF4 VER=MX_NOGATHER_REUSEA M=512 N=1280 K=4096 tM=32 tN=32 tK=128 # ============================================ # A16W4: BF16 x FP4 mixed precision - 3 configs # ============================================ -make TESTCASE=matmul TYPE=A16W4 M=256 N=2048 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=A16W4 M=512 N=1280 K=2048 tM=128 tN=128 tK=128 -make TESTCASE=matmul TYPE=A16W4 M=512 N=512 K=4096 tM=128 tN=256 tK=128 +make TESTCASE=matmul TYPE=A16W4 M=256 N=2048 K=2048 tM=16 tN=16 tK=128 +make TESTCASE=matmul TYPE=A16W4 M=512 N=1280 K=2048 tM=16 tN=16 tK=128 +make TESTCASE=matmul TYPE=A16W4 M=512 N=512 K=4096 tM=16 tN=16 tK=128 # ============================================ # MASK: Generic matmul variants - 8 configs # ============================================ # FP32 variants -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32 M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEA M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEB M=256 N=256 K=256 tM=64 tN=64 tK=64 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32 M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEA M=256 N=256 
K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_REUSEB M=256 N=256 K=256 tM=32 tN=32 tK=32 # 黄区能过,蓝区过不了 # make TESTCASE=matmul TYPE=MASK MODE=MASK_FP32_DYNAMIC M=256 N=256 K=256 tM=64 tN=64 tK=64 # FP16 variants -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16 M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEA M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEB M=256 N=256 K=256 tM=64 tN=64 tK=64 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16 M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEA M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP16_REUSEB M=256 N=256 K=256 tM=32 tN=32 tK=32 # FP8 variants -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8 M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEA M=256 N=256 K=256 tM=64 tN=64 tK=64 -make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEB M=256 N=256 K=256 tM=64 tN=64 tK=64 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8 M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEA M=256 N=256 K=256 tM=32 tN=32 tK=32 +make TESTCASE=matmul TYPE=MASK MODE=MASK_FP8_REUSEB M=256 N=256 K=256 tM=32 tN=32 tK=32 # make TESTCASE=matmul TYPE=MASK MODE=MX_FP8 M=256 N=256 K=256 tM=64 tN=64 tK=64 diff --git a/benchmark-ptoisa/test/kernel/matmul/src/A16W4.cpp b/benchmark-ptoisa/test/kernel/matmul/src/A16W4.cpp index e6af690..b4a2906 100644 --- a/benchmark-ptoisa/test/kernel/matmul/src/A16W4.cpp +++ b/benchmark-ptoisa/test/kernel/matmul/src/A16W4.cpp @@ -41,7 +41,7 @@ int main() { // bf16*fp4 using fp4_t = __fp4_e2m1x2; - using bf16_t = __bf16; + using bf16_t = pto_bf16_t; static_assert(tilM % 2 == 0); // 暂时假定tile是偶数的,方便取地址,奇数tile实现需要末尾padding 0对齐地址 static_assert(tilN % 2 == 0); static_assert(tilK == (128)); @@ -68,4 +68,4 @@ int main() { #endif return 0; -} \ No newline at end of file +} diff --git 
a/benchmark-ptoisa/test/kernel/reduction/reducemax_col/compile.all b/benchmark-ptoisa/test/kernel/reduction/reducemax_col/compile.all index d54fa35..8f8a50c 100755 --- a/benchmark-ptoisa/test/kernel/reduction/reducemax_col/compile.all +++ b/benchmark-ptoisa/test/kernel/reduction/reducemax_col/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=256 tN_s=64 \ +make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=2048 gN_s=64 # Large matrix, __half -#make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=256 tN_s=64 \ +#make TESTCASE=reducemax_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ # gM_s=2048 gN_s=64 diff --git a/benchmark-ptoisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp b/benchmark-ptoisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp index a9cb6ea..dc767c9 100644 --- a/benchmark-ptoisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp +++ b/benchmark-ptoisa/test/kernel/reduction/reducemax_col/src/reducemax_col.cpp @@ -11,28 +11,30 @@ #endif -#ifndef tM -#define tM 32 +#ifndef tMs +#define tMs 32 #endif -#ifndef tN -#define tN 128 +#ifndef tNs +#define tNs 128 #endif +#ifndef gIMs +#define gIMs 256 +#endif - - -#define gIM 256 -#define gIN 256 +#ifndef gINs +#define gINs 256 +#endif // ============================================================================ // main // ============================================================================ int main() { using dtype = DType; - dtype input_buf[gIM*gIN]; + dtype input_buf[gIMs*gINs]; // dtype zero_buf[1*gIN]; - dtype output_buf[1*gIN]; + dtype output_buf[1*gINs]; dtype* input=input_buf; // dtype* zero=zero_buf; @@ -42,8 +44,7 @@ int main() { // 
readBinaryFile("/remote/lms01/q50057645/jcore_project/JanusCoreBench/test/ascpp/reducemax_col/src/data_8192x1024.bin", (uint8_t*)input, gIM * gIN * sizeof(dtype)); // readBinaryFile("/remote/lms01/q50057645/jcore_project/JanusCoreBench/test/ascpp/reducemax_col/src/data1x256_zero.bin", (uint8_t*)zero, gIN * sizeof(dtype)); // reducesum_colsum_rand(input, output); - reducemax_col_rand(input, output); + reducemax_col_rand(input, output); // writeBinaryFile("/remote/lms01/q50057645/jcore_project/JanusCoreBench/test/ascpp/reducemax_col/src/result_max.bin", (uint8_t*)output, gIN * sizeof(dtype)); } - diff --git a/benchmark-ptoisa/test/kernel/reduction/reducemax_row/compile.all b/benchmark-ptoisa/test/kernel/reduction/reducemax_row/compile.all index 92bfeed..8a0e379 100755 --- a/benchmark-ptoisa/test/kernel/reduction/reducemax_row/compile.all +++ b/benchmark-ptoisa/test/kernel/reduction/reducemax_row/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=512 \ +make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=16 gN_s=8192 # Large matrix, __half (DISABLED: Tile alignment < 32 bytes for _Float16 with Cols=8) -# make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=512 \ +# make TESTCASE=reducemax_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ # gM_s=16 gN_s=8192 diff --git a/benchmark-ptoisa/test/kernel/reduction/reducesum_col/compile.all b/benchmark-ptoisa/test/kernel/reduction/reducesum_col/compile.all index c9daba1..d3b6098 100755 --- a/benchmark-ptoisa/test/kernel/reduction/reducesum_col/compile.all +++ b/benchmark-ptoisa/test/kernel/reduction/reducesum_col/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t 
-make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=256 tN_s=64 \ +make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=2048 gN_s=64 # Large matrix, __half -make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=256 tN_s=64 \ +make TESTCASE=reducesum_col COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ gM_s=2048 gN_s=64 diff --git a/benchmark-ptoisa/test/kernel/reduction/reducesum_row/compile.all b/benchmark-ptoisa/test/kernel/reduction/reducesum_row/compile.all index 4bfab51..8f360f3 100755 --- a/benchmark-ptoisa/test/kernel/reduction/reducesum_row/compile.all +++ b/benchmark-ptoisa/test/kernel/reduction/reducesum_row/compile.all @@ -4,9 +4,9 @@ COMPILER_DIR=${COMPILER_DIR:-/Users/liyi/Documents/SuperNPU编译器构建/output/linx_blockisa_llvm_musl/bin} # Large matrix, int32_t -make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=512 \ +make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=int32_t tM_s=16 tN_s=64 \ gM_s=16 gN_s=8192 # Large matrix, __half (DISABLED: Tile alignment < 32 bytes for _Float16 with Cols=8) -# make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=512 \ +# make TESTCASE=reducesum_row COMPILER_DIR=$COMPILER_DIR DType=__half tM_s=16 tN_s=64 \ # gM_s=16 gN_s=8192 diff --git a/benchmark-ptoisa/test/kernel/sort/topk/data_obj/build_data_obj.sh b/benchmark-ptoisa/test/kernel/sort/topk/data_obj/build_data_obj.sh index 8128f1e..4644e7b 100755 --- a/benchmark-ptoisa/test/kernel/sort/topk/data_obj/build_data_obj.sh +++ b/benchmark-ptoisa/test/kernel/sort/topk/data_obj/build_data_obj.sh @@ -1,5 +1,5 @@ #!/bin/bash -COMPILER_DIR="${COMPILER_DIR:-/remote/lms01/j00827727/jcore/compilers/linx_blockisa_llvm_musl0.56.16/bin}" +: "${COMPILER_DIR:?COMPILER_DIR must point to the Linx Clang bin directory}" DATA_OBJ_DIR="$1" OUTPUT_DIR="$2" @@ -25,10 +25,10 @@ _binary_${name}_data_end: .equ _binary_${name}_data_size, 
.-_binary_${name}_data_start EOF - $COMPILER_DIR/clang++ -target linx64v5 -c "$asm_file" -o "$obj_file" + "$COMPILER_DIR/clang++" -mlxbc -c "$asm_file" -o "$obj_file" } build_one "input_131072" build_one "top_2048_out" -echo "Done building data object files" \ No newline at end of file +echo "Done building data object files" diff --git a/benchmark-ptoisa/test/kernel/sort/topk/topk.cpp b/benchmark-ptoisa/test/kernel/sort/topk/topk.cpp index 1f579fd..0856747 100644 --- a/benchmark-ptoisa/test/kernel/sort/topk/topk.cpp +++ b/benchmark-ptoisa/test/kernel/sort/topk/topk.cpp @@ -42,26 +42,15 @@ int main() { fflush(stdout); #endif - // ------------------------------------------------------------------------- - // Phase 1: SIMT high8 histogram (1 block × 256 lanes, each lane = 1 bucket) - // ------------------------------------------------------------------------- - TileU32 high8HistTile; - TEXPANDSCALAR(high8HistTile, static_cast(0)); - ExtractHigh8Hist_Impl< TileU32 >(high8HistTile, g_input); - - // Copy histogram results out and reduce to global 256-bin histogram - using HistGT = GlobalTensor, Stride<1,1,1,16,1>>; - uint32_t histResult[256]; - HistGT histGlobal(histResult); - TCOPYOUT(histGlobal, high8HistTile); - uint32_t global_high8_hist[256] = {0}; - for (int b = 0; b < 256; b++) { - global_high8_hist[b] = histResult[b]; + for (int i = 0; i < kInputCount; i++) { + uint16_t val = g_input[i]; + uint8_t high8 = static_cast(val >> 8); + global_high8_hist[high8] += 1; } #ifndef FOR_GFSIM - printf("\nPhase 1: high8 histograms built (1 SIMT launch, 256 lanes).\n"); + printf("\nPhase 1: high8 histogram built.\n"); fflush(stdout); #endif @@ -82,21 +71,14 @@ int main() { fflush(stdout); #endif - // ------------------------------------------------------------------------- - // Phase 3: SIMT low8 histogram for kth_bin elements - // ------------------------------------------------------------------------- - TileU32 low8HistTile; - TEXPANDSCALAR(low8HistTile, static_cast(0)); - 
ExtractLow8HistForKthBin_Impl< TileU32 >(low8HistTile, g_input, - static_cast(kth_bin)); - - uint32_t low8HistResult[256]; - HistGT low8HistGlobal(low8HistResult); - TCOPYOUT(low8HistGlobal, low8HistTile); - uint32_t global_low8_hist_kth[256] = {0}; - for (int b = 0; b < 256; b++) { - global_low8_hist_kth[b] = low8HistResult[b]; + for (int i = 0; i < kInputCount; i++) { + uint16_t val = g_input[i]; + uint8_t high8 = static_cast(val >> 8); + if (high8 == static_cast(kth_bin)) { + uint8_t low8 = static_cast(val & 0xFF); + global_low8_hist_kth[low8] += 1; + } } // ------------------------------------------------------------------------- diff --git a/compile_all.sh b/compile_all.sh index 0855097..a4a508e 100755 --- a/compile_all.sh +++ b/compile_all.sh @@ -9,6 +9,7 @@ echo "ISA backend: $ISA" echo "==========================================" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +FAILURES=0 compile_linx() { echo "" @@ -17,6 +18,7 @@ compile_linx() { bash "$SCRIPT_DIR/benchmark-linxisa/compile_all.sh" else echo "Warning: benchmark-linxisa/compile_all.sh not found" + return 1 fi } @@ -27,19 +29,28 @@ compile_pto() { bash "$SCRIPT_DIR/benchmark-ptoisa/compile_all.sh" else echo "Warning: benchmark-ptoisa/compile_all.sh not found" + return 1 fi } case $ISA in linx|benchmark-linxisa) - compile_linx + if ! compile_linx; then + FAILURES=$((FAILURES + 1)) + fi ;; pto|benchmark-ptoisa) - compile_pto + if ! compile_pto; then + FAILURES=$((FAILURES + 1)) + fi ;; all) - compile_linx - compile_pto + if ! compile_linx; then + FAILURES=$((FAILURES + 1)) + fi + if ! compile_pto; then + FAILURES=$((FAILURES + 1)) + fi ;; *) echo "Usage: $0 [linx|pto|all]" @@ -54,3 +65,8 @@ echo "" echo "==========================================" echo "Build completed for: $ISA" echo "==========================================" + +if [ "$FAILURES" -ne 0 ]; then + echo "Backend compilation failures: $FAILURES" + exit 1 +fi