diff --git a/tests/ap/matmul/all_tuning_configs.h b/tests/ap/matmul/all_tuning_configs.h index dca41fa..dfa70d1 100644 --- a/tests/ap/matmul/all_tuning_configs.h +++ b/tests/ap/matmul/all_tuning_configs.h @@ -6,483 +6,334 @@ namespace ap { -constexpr int kNumConfigsHalf = 23; -constexpr int kNumConfigsFloat = 13; - -template struct SwizzleWrapper { - using Type = - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; -}; - -// template -// struct SwizzleWrapper { -// using Type = -// cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle; -// }; - -#define AP_AUTOTUNE_half(func, stream, ...) \ - { \ - using FuncType = decltype(func<0>); \ - static int selected_config_id = -1; \ - static std::vector> matmul_functions = { \ - func<0>, func<1>, func<2>, func<3>, func<4>, func<5>, \ - func<6>, func<7>, func<8>, func<9>, func<10>, func<11>, \ - func<12>, func<13>, func<14>, func<15>, func<16>, func<17>, \ - func<18>, func<19>, func<20>, func<21>, func<22>}; \ - if (selected_config_id == -1) { \ - selected_config_id = \ - ap::ProfileBestConfig(matmul_functions, stream, ##__VA_ARGS__); \ - } \ - matmul_functions[selected_config_id](__VA_ARGS__); \ - } - -#define AP_AUTOTUNE_nv_bfloat16(func, stream, ...) \ - AP_AUTOTUNE_half(func, stream, __VA_ARGS__) - -#define AP_AUTOTUNE_float(func, stream, ...) \ - { \ - using FuncType = decltype(func<0>); \ - static int selected_config_id = -1; \ - static std::vector> matmul_functions = { \ - func<0>, func<1>, func<2>, func<3>, func<4>, func<5>, func<6>, \ - func<7>, func<8>, func<9>, func<10>, func<11>, func<12>}; \ - if (selected_config_id == -1) { \ - selected_config_id = \ - ap::ProfileBestConfig(matmul_functions, stream, ##__VA_ARGS__); \ - } \ - matmul_functions[selected_config_id](__VA_ARGS__); \ - } - -template -struct GemmTuningConfigs { +template struct ConfigsInfo { + static constexpr int kNumTotals = 23; +}; + +template <> struct ConfigsInfo { static constexpr int kNumTotals = 13; }; + +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 128, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 2; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = Id; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 128, 64>; using WShape = cutlass::gemm::GemmShape<32, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 1; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 128, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 2; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 64, 64>; using WShape = cutlass::gemm::GemmShape<64, 32, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 3; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 32>; using WShape = cutlass::gemm::GemmShape<64, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 4; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 5; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 64, 32>; using WShape = cutlass::gemm::GemmShape<64, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 6; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 64, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 7; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 128, 32>; using WShape = cutlass::gemm::GemmShape<64, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 8; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 128, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 9; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 32, 64>; using WShape = cutlass::gemm::GemmShape<32, 32, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 10; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 32>; using WShape = cutlass::gemm::GemmShape<64, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 11; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 12; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 64, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 13; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 64, 32>; using WShape = cutlass::gemm::GemmShape<64, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 14; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<32, 64, 64>; using WShape = cutlass::gemm::GemmShape<16, 32, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 5; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 15; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 64, 64>; using WShape = cutlass::gemm::GemmShape<32, 32, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 5; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 16; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 32>; using WShape = cutlass::gemm::GemmShape<64, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 5; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 17; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 64>; using WShape = cutlass::gemm::GemmShape<64, 64, 64>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 5; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 18; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 128, 32>; using WShape = cutlass::gemm::GemmShape<32, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 6; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 19; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 64, 32>; using WShape = cutlass::gemm::GemmShape<64, 32, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 6; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 20; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 32, 32>; using WShape = cutlass::gemm::GemmShape<32, 32, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 7; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 21; }; -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 64, 32>; using WShape = cutlass::gemm::GemmShape<32, 32, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 16>; static constexpr int kNumStages = 10; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 22; }; // Specialization for float -template -struct GemmTuningConfigs { +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 64, 16>; using WShape = cutlass::gemm::GemmShape<32, 32, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = Id; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 64, 32>; using WShape = cutlass::gemm::GemmShape<32, 32, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 1; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 128, 32>; using WShape = cutlass::gemm::GemmShape<32, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 2; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 256, 16>; using WShape = cutlass::gemm::GemmShape<32, 64, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 3; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 256, 32>; using WShape = cutlass::gemm::GemmShape<32, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 4; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 64, 32>; using WShape = cutlass::gemm::GemmShape<64, 32, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 5; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 16>; using WShape = cutlass::gemm::GemmShape<32, 64, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 6; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 32>; using WShape = cutlass::gemm::GemmShape<32, 64, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 7; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 64, 16>; using WShape = cutlass::gemm::GemmShape<64, 32, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 8; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<256, 64, 32>; using WShape = cutlass::gemm::GemmShape<64, 32, 32>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 3; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 9; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<64, 128, 16>; using WShape = cutlass::gemm::GemmShape<32, 64, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 10; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 64, 16>; using WShape = cutlass::gemm::GemmShape<64, 32, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 11; }; -template -struct GemmTuningConfigs { +template <> struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<128, 128, 16>; using WShape = cutlass::gemm::GemmShape<32, 64, 16>; using IShape = cutlass::gemm::GemmShape<16, 8, 8>; static constexpr int kNumStages = 4; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = 12; }; diff --git a/tests/ap/matmul/autotune.h b/tests/ap/matmul/autotune.h new file mode 100644 index 0000000..376116b --- /dev/null +++ b/tests/ap/matmul/autotune.h @@ -0,0 +1,70 @@ +#pragma once + +#include "all_tuning_configs.h" +#include "default_config_id.h" +#include "profile.h" + +namespace ap { + +template +auto GenerateFuncList(std::integer_sequence) { + using FuncPtr = decltype(&Runner::template Apply<0, ST>); + return std::vector{&Runner::template Apply...}; +} + +template +int RunWithAutotune(cudaStream_t stream, int config_id, Args &&...args) { +#if AP_ENABLE_AUTOTUNE + int selected_config_id = config_id; + + using FuncPtr = decltype(&Runner::template Apply<0, SwizzleType::kCommon>); + constexpr int N = ap::ConfigsInfo::kNumTotals; + + static std::vector matmul_functions; + static std::vector streamk_functions; + + if (matmul_functions.empty()) { + matmul_functions = GenerateFuncList( + std::make_integer_sequence{}); + } + + if constexpr (EnableStreamK) { + if (streamk_functions.empty()) { + streamk_functions = GenerateFuncList( + std::make_integer_sequence{}); + } + } + + if (selected_config_id == -1) { + selected_config_id = ap::ProfileBestConfig(matmul_functions, stream, + std::forward(args)...); + if constexpr (EnableStreamK) { + std::vector mixed_functions = { + matmul_functions[selected_config_id], + streamk_functions[selected_config_id]}; + int mixed_config_id = ap::ProfileBestConfig(mixed_functions, stream, + std::forward(args)...); + selected_config_id = (mixed_config_id == 0) ? selected_config_id + : (selected_config_id + N); + } + } else { + if constexpr (EnableStreamK) { + if (selected_config_id < N) { + matmul_functions[selected_config_id](std::forward(args)...); + } else { + streamk_functions[selected_config_id - N](std::forward(args)...); + } + } else { + matmul_functions[selected_config_id](std::forward(args)...); + } + } + + return selected_config_id; +#else + Runner::template Apply( + std::forward(args)...); + return -1; +#endif +} + +} // namespace ap diff --git a/tests/ap/matmul/cutlass_matmul.cuh b/tests/ap/matmul/cutlass_matmul.cuh index 1ba8556..faa57b6 100644 --- a/tests/ap/matmul/cutlass_matmul.cuh +++ b/tests/ap/matmul/cutlass_matmul.cuh @@ -79,8 +79,7 @@ template + SwizzleType ST = DefaultConfig::kSwizzleType> void CutlassMatmul(const GemmEpilogueParams& params) { using ElementAccumulator = typename CutlassDataType::Type; // <- data type of accumulator using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations @@ -107,12 +106,12 @@ void CutlassMatmul(const GemmEpilogueParams& params) { ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - typename GemmTuningConfigs::TShape, - typename GemmTuningConfigs::WShape, - typename GemmTuningConfigs::IShape, + typename GemmTuningConfigs::TShape, + typename GemmTuningConfigs::WShape, + typename GemmTuningConfigs::IShape, EpilogueOutputOp, - typename GemmTuningConfigs::SwizzleThreadBlock, - GemmTuningConfigs::kNumStages, + typename ThreadBlockSwizzle::Type, + GemmTuningConfigs::kNumStages, 128 / cutlass::sizeof_bits::value, // AlignA 128 / cutlass::sizeof_bits::value, // AlignB typename GemmOperation::Type // Operation performed by GEMM @@ -170,11 +169,10 @@ void CutlassMatmul(const GemmEpilogueParams& params) { template class UnaryFunctor, - bool TransposeA = false, - bool TransposeB = false, + int AlignA = 128 / cutlass::sizeof_bits::value, + int AlignB = 128 / cutlass::sizeof_bits::value, int ConfigId = DefaultConfig::kConfigId, - int SwizzleFactor = DefaultConfig::kSwizzleFactor, - bool Batched = DefaultConfig::kBatched> + SwizzleType ST = DefaultConfig::kSwizzleType> void CutlassMatmulAddUnary(const GemmEpilogueParams& params, const typename UnaryFunctor::Arguments& unary_args) { using ElementAccumulator = typename CutlassDataType::Type; // <- data type of accumulator using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations @@ -197,23 +195,23 @@ void CutlassMatmulAddUnary(const GemmEpilogueParams& params, const typename Unar using GemmFunc = cutlass::gemm::device::GemmUniversal< ElementInputA, - typename MatrixLayout::Type, + cutlass::layout::RowMajor, ElementInputB, - typename MatrixLayout::Type, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - typename GemmTuningConfigs::TShape, - typename GemmTuningConfigs::WShape, - typename GemmTuningConfigs::IShape, + typename GemmTuningConfigs::TShape, + typename GemmTuningConfigs::WShape, + typename GemmTuningConfigs::IShape, EpilogueOutputOp, - typename GemmTuningConfigs::SwizzleThreadBlock, - GemmTuningConfigs::kNumStages, - 128 / cutlass::sizeof_bits::value, // AlignA - 128 / cutlass::sizeof_bits::value, // AlignB - typename GemmOperation::Type // Operation performed by GEMM + typename ThreadBlockSwizzle::Type, + GemmTuningConfigs::kNumStages, + AlignA, + AlignB, + typename GemmOperation::Type >; CHECK_CUTLASS(SetMaxDynamicSharedMemorySize()); @@ -265,137 +263,13 @@ void CutlassMatmulAddUnary(const GemmEpilogueParams& params, const typename Unar #endif } -template -void CutlassMatmulAddBroadcast(const GemmBroadcastEpilogueParams& params) { - using ElementAccumulator = typename CutlassDataType::Type; // <- data type of accumulator - using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations - using ElementInputA = typename CutlassDataType::Type; // <- data type of elements in input matrix A - using ElementInputB = typename CutlassDataType::Type; // <- data type of elements in input matrix B - using ElementOutputC = typename CutlassDataType::Type;// <- data type of elements in output matrix D - using ElementOutputZ = ElementOutputC; - using ElementOutputT = ElementOutputC; - - // Epilogue operation as LinearCombinationBiasElementwise: - // Y = GEMM(AB, C) - // T[i, j] = BinaryOp(Y[i, j], Broadcast[i]) - // Z[i, j] = Elementwise(T[i, j]) - using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombinationBiasElementwise< - ElementOutputC, - ElementAccumulator, - ElementComputeEpilogue, - ElementOutputZ, - ElementOutputT, - 128 / cutlass::sizeof_bits::value - >; - - // Epilogue operation as LinearCombinationResidualBlock: - // Y = GEMM(AB, C1) - // UnaryOp(BinaryOp2(BinaryOp1(ActivationOp(Y), residual1), residual2)) - // using EpilogueOp = cutlass::epilogue::thread::LinearCombinationResidualBlock< - // ElementOutput, // Element type for output matrix - // ElementAccumulator, // Element type from internal accumulation - // ElementCompute, // Element type from internal accumulation - // ElementC, // Element type for C1/C2/D matrix operands - // AlignmentC, // Memory access granularity of C and D matrix in units of elements - // cutlass::epilogue::thread::Identity, // Activation - // cutlass::plus, // Binary operation 1 - // cutlass::epilogue::thread::Identity, // Unary operation - // cutlass::plus // Binary operation 2 - // >; - - using GemmFunc = cutlass::gemm::device::GemmUniversalWithBroadcast< - ElementInputA, - cutlass::layout::RowMajor, - ElementInputB, - cutlass::layout::RowMajor, - ElementOutputC, - cutlass::layout::RowMajor, - ElementAccumulator, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, - typename GemmTuningConfigs::TShape, - typename GemmTuningConfigs::WShape, - typename GemmTuningConfigs::IShape, - EpilogueOutputOp, - typename GemmTuningConfigs::SwizzleThreadBlock, - GemmTuningConfigs::kNumStages, - 128 / cutlass::sizeof_bits::value, // AlignA - 128 / cutlass::sizeof_bits::value, // AlignB - typename GemmOperation::Type // Operation performed by GEMM - >; - - CHECK_CUTLASS(SetMaxDynamicSharedMemorySize()); - - /// Arguments - cutlass::gemm::GemmCoord problem_size{params.m, params.n, params.k}; - - const ElementInputA *input = reinterpret_cast(params.input); - const ElementInputB *weight = reinterpret_cast(params.weight); - const ElementOutputC *bias = reinterpret_cast(params.bias); - ElementOutputZ *output = reinterpret_cast(params.output); - ElementOutputC *broadcast = reinterpret_cast(params.broadcast); - ElementOutputT *broadcast_out = reinterpret_cast(params.broadcast_out); - - const int64_t batch_stride_Broadcast = params.need_broadcast ? problem_size.m() : problem_size.m() * problem_size.n(); - const int64_t ldr_broadcast = params.need_broadcast ? 0 : problem_size.n(); - - ElementComputeEpilogue alpha = static_cast(1); - ElementComputeEpilogue beta = static_cast(1); - - typename GemmFunc::Arguments arguments{ - GetGemmMode(params.batch_count), - problem_size, // <- problem size of matrix multiplication - params.batch_count, // <- batch_count or k-dimension split factor - {alpha, beta}, // <- epilogue params, alpha, beta - input, // <- input, ptr_A, A, shape={M, K} - weight, // <- input, ptr_B, B, shape={K, N} - bias, // <- input, ptr_C, shape={M, N} or {1, N} - output, // <- output, ptr_D, Z, shape={M, N} - broadcast, // <- input, ptr_Vector, Broadcast, shape={M, 1} - broadcast_out, // <- output, ptr_Tensor, T - params.shape_args.batch_stride_A, - params.shape_args.batch_stride_B, - params.shape_args.batch_stride_C, - params.shape_args.batch_stride_D, - batch_stride_Broadcast, // <- batch_stride_Vector, need broadcast - problem_size.m() * problem_size.n(), // <- batch_stride_Tensor - params.shape_args.lda, - params.shape_args.ldb, - params.shape_args.ldc_bias, - params.shape_args.ldd, - ldr_broadcast, // <- ldr, must be zero - problem_size.n() // <- ldt - }; - - size_t workspace_size = GemmFunc::get_workspace_size(arguments); - void* workspace = workspace_size > 0 ? GetWorkspace(workspace_size) : nullptr; - - GemmFunc device_gemm; - - CHECK_CUTLASS(device_gemm.can_implement(arguments)); - CHECK_CUTLASS(device_gemm.initialize(arguments, workspace, params.stream)); - - // - // Run the GEMM - // - CHECK_CUTLASS(device_gemm(params.stream)); -#if AP_ENABLE_DEBUG - CHECK_CUDA(cudaStreamSynchronize(params.stream)); -#endif -} - template class VariadicFunctor, int AlignA = 128 / cutlass::sizeof_bits::value, int AlignB = 128 / cutlass::sizeof_bits::value, int ConfigId = DefaultConfig::kConfigId, - int SwizzleFactor = DefaultConfig::kSwizzleFactor, - bool Batched = DefaultConfig::kBatched> + SwizzleType ST = DefaultConfig::kSwizzleType> void CutlassMatmulAddVariadic(const GemmEpilogueParams& params, const typename VariadicFunctor::Arguments& variadic_args) { using ElementAccumulator = typename CutlassDataType::Type; // <- data type of accumulator using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations @@ -425,12 +299,12 @@ void CutlassMatmulAddVariadic(const GemmEpilogueParams& params, const typename V ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - typename GemmTuningConfigs::TShape, - typename GemmTuningConfigs::WShape, - typename GemmTuningConfigs::IShape, + typename GemmTuningConfigs::TShape, + typename GemmTuningConfigs::WShape, + typename GemmTuningConfigs::IShape, EpilogueOutputOp, - typename GemmTuningConfigs::SwizzleThreadBlock, - GemmTuningConfigs::kNumStages, + typename ThreadBlockSwizzle::Type, + GemmTuningConfigs::kNumStages, AlignA, AlignB, typename GemmOperation::Type diff --git a/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h b/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h index d9b4894..9020dd2 100644 --- a/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h +++ b/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h @@ -1,16 +1,17 @@ /*! \file \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - The epilogue rearranges the result of a matrix product through shared memory to match canonical - tensor layouts in global memory. Epilogues support conversion and reduction operations. + The epilogue rearranges the result of a matrix product through shared memory + to match canonical tensor layouts in global memory. Epilogues support + conversion and reduction operations. */ #pragma once +#include "cutlass/array.h" #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" -#include "cutlass/array.h" #include "cutlass/gemm/gemm.h" @@ -22,267 +23,164 @@ #include "cutlass/layout/permute.h" - namespace cutlass { namespace epilogue { namespace threadblock { /// Defines sensible defaults for epilogues for SimtOps. -template < - typename Shape, - typename WarpMmaSimt, - typename ElementOutput, - typename OutputOp, - int ElementsPerAccess, - bool ScatterD = false, - typename PermuteDLayout = layout::NoPermute, - conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity, - int Rank = 4 -> +template struct DefaultEpilogueWithVariadicSimt { static conv::StrideSupport const kStrideSupport = StrideSupport; static int const kRank = Rank; - static bool const UseCUDAStore = platform::is_same::value; + static bool const UseCUDAStore = + platform::is_same::value; /// Use defaults related to the existing epilogue - using Base = DefaultEpilogueSimt< - Shape, - WarpMmaSimt, - OutputOp, - ElementsPerAccess - >; - - using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< - typename Base::OutputTileThreadMap, - ElementOutput, - ScatterD, - PermuteDLayout, - UseCUDAStore - >; - - using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv< - typename Base::OutputTileThreadMap, - ElementOutput, - ScatterD, - PermuteDLayout, - UseCUDAStore, - kRank - >; + using Base = + DefaultEpilogueSimt; + + using PackedOutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIterator< + typename Base::OutputTileThreadMap, ElementOutput, ScatterD, + PermuteDLayout, UseCUDAStore>; + + using StridedOutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIteratorConv< + typename Base::OutputTileThreadMap, ElementOutput, ScatterD, + PermuteDLayout, UseCUDAStore, kRank>; // // Stores the result z = (y = GEMM(A, B, C), variadic) // - using OutputTileIterator = typename platform::conditional::type; + using OutputTileIterator = typename platform::conditional< + StrideSupport == cutlass::conv::StrideSupport::kUnity, + PackedOutputTileIterator, StridedOutputTileIterator>::type; // // Define the epilogue // using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic< - Shape, - WarpMmaSimt, - Base::kPartitionsK, - OutputTileIterator, - typename Base::AccumulatorFragmentIterator, - typename Base::WarpTileIterator, - typename Base::SharedLoadIterator, - OutputOp, - typename Base::Padding - >; + Shape, WarpMmaSimt, Base::kPartitionsK, OutputTileIterator, + typename Base::AccumulatorFragmentIterator, + typename Base::WarpTileIterator, typename Base::SharedLoadIterator, + OutputOp, typename Base::Padding>; }; /// Defines sensible defaults for strided dgrad epilogues for SimtOps. -template < - typename Shape, - typename WarpMmaSimt, - typename ElementOutput, - typename OutputOp, - int ElementsPerAccess, - bool ScatterD = false, - typename PermuteDLayout = layout::NoPermute -> +template struct DefaultEpilogueWithVariadicSimtStridedDgrad { /// Use defaults related to the existing epilogue - using Base = DefaultEpilogueSimtStridedDgrad< - Shape, - WarpMmaSimt, - OutputOp, - ElementsPerAccess - >; + using Base = DefaultEpilogueSimtStridedDgrad; // // Stores the result z = (y = GEMM(A, B, C), variadic) // - using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad< - typename Base::OutputTileThreadMap, - ElementOutput - >; + using OutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad< + typename Base::OutputTileThreadMap, ElementOutput>; // // Define the epilogue // using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic< - Shape, - WarpMmaSimt, - Base::kPartitionsK, - OutputTileIterator, - typename Base::AccumulatorFragmentIterator, - typename Base::WarpTileIterator, - typename Base::SharedLoadIterator, - OutputOp, - typename Base::Padding - >; + Shape, WarpMmaSimt, Base::kPartitionsK, OutputTileIterator, + typename Base::AccumulatorFragmentIterator, + typename Base::WarpTileIterator, typename Base::SharedLoadIterator, + OutputOp, typename Base::Padding>; }; /// Defines sensible defaults for epilogues for TensorOps. -template < - typename Shape, - typename WarpMmaTensorOp, - int PartitionsK, - typename ElementOutput, - typename OutputOp, - int ElementsPerAccess, - bool ScatterD = false, - typename PermuteDLayout = layout::NoPermute -> +template struct DefaultEpilogueWithVariadicTensorOp { /// Use defaults related to the existing epilogue - using Base = DefaultEpilogueTensorOp< - Shape, - WarpMmaTensorOp, - PartitionsK, - OutputOp, - ElementsPerAccess - >; + using Base = DefaultEpilogueTensorOp; // // Stores the result z = (y = GEMM(A, B, C), variadic) // - using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< - typename Base::OutputTileThreadMap, - ElementOutput, - ScatterD, - PermuteDLayout - >; + using OutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIterator< + typename Base::OutputTileThreadMap, ElementOutput, ScatterD, + PermuteDLayout>; // // Define the epilogue // using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic< - Shape, - WarpMmaTensorOp, - PartitionsK, - OutputTileIterator, - typename Base::AccumulatorFragmentIterator, - typename Base::WarpTileIterator, - typename Base::SharedLoadIterator, - OutputOp, - typename Base::Padding, - Base::kFragmentsPerIteration - >; + Shape, WarpMmaTensorOp, PartitionsK, OutputTileIterator, + typename Base::AccumulatorFragmentIterator, + typename Base::WarpTileIterator, typename Base::SharedLoadIterator, + OutputOp, typename Base::Padding, Base::kFragmentsPerIteration>; }; -#if 0 /// Defines sensible defaults for streamk epilogues for TensorOps. -template < - typename Shape, - typename WarpMmaTensorOp, - int PartitionsK, - typename ElementOutput, - typename OutputOp, - int ElementsPerAccess, - bool ScatterD = false, - typename PermuteDLayout = layout::NoPermute -> +template struct DefaultStreamkEpilogueWithVariadicTensorOp { /// Use defaults related to the existing epilogue - using Base = DefaultEpilogueTensorOp< - Shape, - WarpMmaTensorOp, - PartitionsK, - OutputOp, - ElementsPerAccess - >; + using Base = DefaultEpilogueTensorOp; // // Stores the result z = (y = GEMM(A, B, C), variadic) // - using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< - typename Base::OutputTileThreadMap, - ElementOutput, - ScatterD, - PermuteDLayout - >; + using OutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIterator< + typename Base::OutputTileThreadMap, ElementOutput, ScatterD, + PermuteDLayout>; // // Define the epilogue // using Epilogue = cutlass::epilogue::threadblock::EpilogueStreamkWithVariadic< - Shape, - WarpMmaTensorOp, - PartitionsK, - OutputTileIterator, - typename Base::AccumulatorFragmentIterator, - typename Base::WarpTileIterator, - typename Base::SharedLoadIterator, - OutputOp, - typename Base::Padding, - Base::kFragmentsPerIteration - >; + Shape, WarpMmaTensorOp, PartitionsK, OutputTileIterator, + typename Base::AccumulatorFragmentIterator, + typename Base::WarpTileIterator, typename Base::SharedLoadIterator, + OutputOp, typename Base::Padding, Base::kFragmentsPerIteration>; }; -#endif /// Defines sensible defaults for epilogues for VoltaTensorOps. -template < - typename Shape, - typename WarpMmaTensorOp, - int PartitionsK, - typename ElementOutput, - typename OutputOp, - int ElementsPerAccess -> +template struct DefaultEpilogueWithVariadicVoltaTensorOp { /// Use defaults related to the existing epilogue - using Base = DefaultEpilogueVoltaTensorOp< - Shape, - WarpMmaTensorOp, - PartitionsK, - OutputOp, - ElementsPerAccess - >; + using Base = DefaultEpilogueVoltaTensorOp; // // Stores the result z = (y = GEMM(A, B, C), variadic) // - using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< - typename Base::OutputTileThreadMap, - ElementOutput - >; + using OutputTileIterator = + cutlass::epilogue::threadblock::PredicatedTileIterator< + typename Base::OutputTileThreadMap, ElementOutput>; // // Define the epilogue // using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic< - Shape, - WarpMmaTensorOp, - PartitionsK, - OutputTileIterator, - typename Base::AccumulatorFragmentIterator, - typename Base::WarpTileIterator, - typename Base::SharedLoadIterator, - OutputOp, - typename Base::Padding - >; + Shape, WarpMmaTensorOp, PartitionsK, OutputTileIterator, + typename Base::AccumulatorFragmentIterator, + typename Base::WarpTileIterator, typename Base::SharedLoadIterator, + OutputOp, typename Base::Padding>; }; - } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h b/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h index b98db0e..8aa21a8 100644 --- a/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h +++ b/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h @@ -8,194 +8,163 @@ #include "cutlass/cutlass.h" -#include "cutlass/gemm/kernel/gemm_universal.h" #include "cutlass/gemm/kernel/default_gemm_universal.h" +#include "cutlass/gemm/kernel/gemm_universal.h" #include "cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h" #include "cutlass_patch/epilogue/threadblock/epilogue_with_variadic.h" - namespace cutlass { namespace gemm { namespace kernel { - template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - typename LayoutA_, - /// Complex elementwise transformation on A operand - ComplexTransform TransformA, - /// Access granularity of A matrix in units of elements - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - typename LayoutB_, - /// Complex elementwise transformation on B operand - ComplexTransform TransformB, - /// Access granularity of B matrix in units of elements - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Layout type for C and D matrix operands - typename LayoutC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Operator class tag - typename OperatorClass, - /// Tag indicating architecture to tune for - typename ArchTag, - /// Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape, - /// Warp-level tile size (concept: GemmShape) - typename InstructionShape, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithVariadicOp' - typename EpilogueOutputOp, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle, - /// Number of stages used in the pipelined mainloop - int Stages, - /// Operation performed by GEMM - typename Operator, - /// - typename Enable = void -> + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator - must satisfy concept of + /// 'EpilogueWithVariadicOp' + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// + typename Enable = void> struct DefaultGemmWithVariadic { using GemmBase = typename DefaultGemmUniversal< - ElementA_, LayoutA_, TransformA, kAlignmentA, - ElementB_, LayoutB_, TransformB, kAlignmentB, - ElementC_, LayoutC_, ElementAccumulator, - OperatorClass, - ArchTag, - ThreadblockShape, - WarpShape, - InstructionShape, - EpilogueOutputOp, - ThreadblockSwizzle, - Stages, - Operator - >::GemmKernel; + ElementA_, LayoutA_, TransformA, kAlignmentA, ElementB_, LayoutB_, + TransformB, kAlignmentB, ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, Operator>::GemmKernel; // Define epilogue - using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithVariadicTensorOp< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementC_, - EpilogueOutputOp, - GemmBase::Epilogue::kElementsPerAccess - >::Epilogue; - - // Compose the GEMM kernel - using GemmKernel = GemmUniversal< - typename GemmBase::Mma, - Epilogue, - ThreadblockSwizzle - >; + using Epilogue = typename cutlass::epilogue::threadblock:: + DefaultEpilogueWithVariadicTensorOp< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, ElementC_, EpilogueOutputOp, + GemmBase::Epilogue::kElementsPerAccess>::Epilogue; + + /// Universal kernel without StreamkFeature member type + template + class SelectBase : public kernel::GemmUniversal {}; + + /// Universal kernel with StreamkFeature member type + template + class SelectBase + : public kernel::GemmUniversalStreamk {}; + + /// Select kernel by ThreadblockSwizzle's support for StreamkFeature + using GemmKernel = SelectBase; }; - /// Partial specialization: ArchTag = cutlass::arch::Sm70 /// /// template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - typename LayoutA_, - /// Complex elementwise transformation on A operand - ComplexTransform TransformA, - /// Access granularity of A matrix in units of elements - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - typename LayoutB_, - /// Complex elementwise transformation on B operand - ComplexTransform TransformB, - /// Access granularity of B matrix in units of elements - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Layout type for C and D matrix operands - typename LayoutC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Operator class tag - typename OperatorClass, - /// Threadblock-level tile size (concept: GemmShape) - typename ThreadblockShape, - /// Warp-level tile size (concept: GemmShape) - typename WarpShape, - /// Warp-level tile size (concept: GemmShape) - typename InstructionShape, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithVariadicOp' - typename EpilogueOutputOp, - /// Threadblock-level swizzling operator - typename ThreadblockSwizzle, - /// Number of stages used in the pipelined mainloop - int Stages, - /// Operation performed by GEMM - typename Operator, - /// - typename Enable -> + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator - must satisfy concept of + /// 'EpilogueWithVariadicOp' + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// + typename Enable> struct DefaultGemmWithVariadic< - ElementA_, LayoutA_, TransformA, kAlignmentA, - ElementB_, LayoutB_, TransformB, kAlignmentB, - ElementC_, LayoutC_, - ElementAccumulator, - OperatorClass, - cutlass::arch::Sm70, - ThreadblockShape, - WarpShape, - InstructionShape, - EpilogueOutputOp, - ThreadblockSwizzle, - Stages, - Operator, - Enable - > { + ElementA_, LayoutA_, TransformA, kAlignmentA, ElementB_, LayoutB_, + TransformB, kAlignmentB, ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, cutlass::arch::Sm70, ThreadblockShape, WarpShape, + InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, Operator, + Enable> { using GemmBase = typename DefaultGemmUniversal< - ElementA_, LayoutA_, TransformA, kAlignmentA, - ElementB_, LayoutB_, TransformB, kAlignmentB, - ElementC_, LayoutC_, ElementAccumulator, - OperatorClass, - cutlass::arch::Sm70, - ThreadblockShape, - WarpShape, - InstructionShape, - EpilogueOutputOp, - ThreadblockSwizzle, - Stages, - Operator - >::GemmKernel; + ElementA_, LayoutA_, TransformA, kAlignmentA, ElementB_, LayoutB_, + TransformB, kAlignmentB, ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, cutlass::arch::Sm70, ThreadblockShape, WarpShape, + InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, + Operator>::GemmKernel; // Define epilogue - using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithVariadicVoltaTensorOp< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementC_, - EpilogueOutputOp, - GemmBase::Epilogue::kElementsPerAccess - >::Epilogue; + using Epilogue = typename cutlass::epilogue::threadblock:: + DefaultEpilogueWithVariadicVoltaTensorOp< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, ElementC_, EpilogueOutputOp, + GemmBase::Epilogue::kElementsPerAccess>::Epilogue; // Compose the GEMM kernel - using GemmKernel = GemmUniversal< - typename GemmBase::Mma, - Epilogue, - ThreadblockSwizzle - >; + using GemmKernel = + GemmUniversal; }; - -} // namespace kernel -} // namespace gemm -} // namespace cutlass +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/tests/ap/matmul/default_config_id.h b/tests/ap/matmul/default_config_id.h index d2d6afa..55702d1 100644 --- a/tests/ap/matmul/default_config_id.h +++ b/tests/ap/matmul/default_config_id.h @@ -1,13 +1,32 @@ #pragma once #include "all_tuning_configs.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" namespace ap { +enum SwizzleType { kCommon = 0, kStreamK, kBatched }; + +template struct ThreadBlockSwizzle { + using Type = + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; +}; + +template +struct ThreadBlockSwizzle { + using Type = cutlass::gemm::threadblock::ThreadblockSwizzleStreamK; +}; + +template +struct ThreadBlockSwizzle { + using Type = + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle; +}; + struct DefaultConfig { - static constexpr int kConfigId = 0; + static constexpr int kConfigId = 8; static constexpr int kSwizzleFactor = 1; - static constexpr bool kBatched = false; + static constexpr SwizzleType kSwizzleType = SwizzleType::kCommon; }; } // namespace ap diff --git a/tests/ap/matmul/generate_configs.py b/tests/ap/matmul/generate_configs.py index ff6c4aa..9391129 100644 --- a/tests/ap/matmul/generate_configs.py +++ b/tests/ap/matmul/generate_configs.py @@ -6,89 +6,62 @@ namespace ap { -constexpr int kNumConfigsHalf = ${num_configs_fp16}; -constexpr int kNumConfigsFloat = ${num_configs_fp32}; - -template struct SwizzleWrapper { - using Type = - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; +template +struct ConfigsInfo { + static constexpr int kNumTotals = ${num_configs_fp16}; }; -// template -// struct SwizzleWrapper { -// using Type = -// cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle; -// }; -""" - -autotune_wrapper_template = """ -#define AP_AUTOTUNE_${datatype}(func, stream, ...) { \\ - using FuncType = decltype(func<0>); \\ - static int selected_config_id = -1; \\ - static std::vector> \\ - matmul_functions = { \\ - ${repeat_functions} \\ - }; \\ - if (selected_config_id == -1) { \\ - selected_config_id = ap::ProfileBestConfig(matmul_functions, stream, ##__VA_ARGS__); \\ - } \\ - matmul_functions[selected_config_id](__VA_ARGS__); \\ -} +template <> +struct ConfigsInfo { + static constexpr int kNumTotals = ${num_configs_fp32}; +}; """ fp16_config_template_0 = """ -template +template struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<${tshape}>; using WShape = cutlass::gemm::GemmShape<${wshape}>; using IShape = cutlass::gemm::GemmShape<${ishape}>; static constexpr int kNumStages = ${stages}; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = Id; }; """ fp16_config_template = """ -template -struct GemmTuningConfigs { +template +struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<${tshape}>; using WShape = cutlass::gemm::GemmShape<${wshape}>; using IShape = cutlass::gemm::GemmShape<${ishape}>; static constexpr int kNumStages = ${stages}; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = ${config_id}; }; """ fp32_config_template_0 = """ // Specialization for float -template -struct GemmTuningConfigs { +template +struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<${tshape}>; using WShape = cutlass::gemm::GemmShape<${wshape}>; using IShape = cutlass::gemm::GemmShape<${ishape}>; static constexpr int kNumStages = ${stages}; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = Id; }; """ fp32_config_template = """ -template -struct GemmTuningConfigs { +template <> +struct GemmTuningConfigs { using TShape = cutlass::gemm::GemmShape<${tshape}>; using WShape = cutlass::gemm::GemmShape<${wshape}>; using IShape = cutlass::gemm::GemmShape<${ishape}>; static constexpr int kNumStages = ${stages}; - using SwizzleThreadBlock = - typename SwizzleWrapper::Type; static constexpr int kId = ${config_id}; }; """ @@ -194,16 +167,6 @@ def all_configs_sm80_fp32(): return all_tuning_configs_fp32 -def generate_autotune_wrapper(datatype, num_configs): - repeat_func_strs = [] - for i in range(num_configs): - repeat_func_strs.append(f"func<{i}>") - code_str = autotune_wrapper_template.replace("${datatype}", datatype).replace( - "${repeat_functions}", ", \\\n ".join(repeat_func_strs) - ) - return code_str - - def get_configs(all_configs_list, level=3): consigs_list = [] for i in range(len(all_configs_list)): @@ -261,14 +224,8 @@ def main(): head_code_str = head_template.replace( "${num_configs_fp16}", str(num_fp16_configs) ).replace("${num_configs_fp32}", str(num_fp32_configs)) - fp16_autotune_wrapper_code_str = generate_autotune_wrapper("half", num_fp16_configs) - fp32_autotune_wrapper_code_str = generate_autotune_wrapper( - "float", num_fp32_configs - ) with open("all_tuning_configs.h", "w") as f: f.write(head_code_str) - f.write(fp16_autotune_wrapper_code_str) - f.write(fp32_autotune_wrapper_code_str) f.write(fp16_configs_code_str) f.write(fp32_configs_code_str) f.write(tail_code_str) diff --git a/tests/ap/matmul/profile.h b/tests/ap/matmul/profile.h index 72ac822..3baedd9 100644 --- a/tests/ap/matmul/profile.h +++ b/tests/ap/matmul/profile.h @@ -1,3 +1,5 @@ +#pragma once + #include "matmul.h" #include #include diff --git a/tests/ap/matmul/tests/Makefile b/tests/ap/matmul/tests/Makefile index 5942dca..ed865b4 100644 --- a/tests/ap/matmul/tests/Makefile +++ b/tests/ap/matmul/tests/Makefile @@ -6,13 +6,14 @@ BUILD_DIR := ${TESTS_DIR}/build DEBUG := 0 TUNING := 1 -PROFILE := 0 +PROFILE := 1 USE_HALF := 1 LIB_NAME := matmul_kernel LIB := ${BUILD_DIR}/lib${LIB_NAME}.so TARGET := ${BUILD_DIR}/test_main TARGET_OBJ:= ${BUILD_DIR}/test_main_matmul_binary.o +#TARGET_OBJ:= ${BUILD_DIR}/test_main_matmul_dlopen.o # COMPILER OPTIONS NVCC := nvcc @@ -21,16 +22,18 @@ NVCCFLAGS := -std=c++17 -O3 NVCCFLAGS += -Xcompiler=-fPIC -arch=sm_80 --expt-relaxed-constexpr NVCCFLAGS += -I ${CUTLASS_DIR}/include -I ${CUTLASS_DIR}/tools/util/include NVCCFLAGS += -I ${CUDA_ROOT}/include -I ${SOURCE_DIR} -I ${TESTS_DIR} -NVCCFLAGS += -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=${DEBUG} -DCUTLASS_EPILOGUE_ENABLE_VECTORIZE=1 +NVCCFLAGS += -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=${DEBUG} -DCUTLASS_EPILOGUE_ENABLE_VECTORIZE=0 NVCCFLAGS += -DAP_USE_FLOAT16=${USE_HALF} -DAP_ENABLE_DEBUG=${DEBUG} -DAP_ENABLE_PROFILE=${PROFILE} -DAP_ENABLE_AUTOTUNE=${TUNING} LDFLAGS := -L./build -l${LIB_NAME} -lcuda -lcudart -# SRCS = native_kernel.cu matmul_unary_kernel.cu matmul_binary_kernel.cu kernel.cu profile.cu +# SRCS = native_kernel.cu matmul_unary_kernel.cu matmul_binary_kernel.cu LIB_SRCS := $(wildcard ${TESTS_DIR}/*.cu) +#LIB_SRCS := ${TESTS_DIR}/matmul_unary_kernel.cu ${TESTS_DIR}/test_util.cu LIB_OBJS := $(patsubst ${TESTS_DIR}/%.cu, ${BUILD_DIR}/%.o, ${LIB_SRCS}) TEST_SRCS := $(wildcard ${TESTS_DIR}/test_*.cc) +#TEST_SRCS := ${TESTS_DIR}/test_main_matmul_unary.cc TEST_OBJS := $(patsubst ${TESTS_DIR}/%.cc, ${BUILD_DIR}/%.o, ${TEST_SRCS}) # Compile diff --git a/tests/ap/matmul/tests/epilogue_op.h b/tests/ap/matmul/tests/epilogue_op.h index 214fba6..bdfdd88 100644 --- a/tests/ap/matmul/tests/epilogue_op.h +++ b/tests/ap/matmul/tests/epilogue_op.h @@ -5,6 +5,14 @@ namespace ap { +struct KernelUtils { +#if AP_USE_FLOAT16 + using Type = half; +#else + using Type = float; +#endif +}; + // Unary Epilogue template struct IdentityFunctor { struct Arguments {}; @@ -39,37 +47,38 @@ template struct VariadicEpilogueFunctor { }; // __forceinline__ __host__ __device__ int64_t - // Load(const Arguments &args, const MatrixCoord &coord) const { + // CalcOffset(const Arguments &args, const MatrixCoord &coord) const { // int64_t offset = coord.batch * args.in0_shape[1] * args.in0_shape[2] + // coord.row * args.in0_shape[2] + coord.column; // return offset; // } // __forceinline__ __host__ __device__ int64_t - // Load(const Arguments &args, const MatrixCoord &coord) const { + // CalcOffset(const Arguments &args, const MatrixCoord &coord) const { // int64_t offset = coord.batch * args.in0_strides[0] + // coord.row * args.in0_strides[1] + coord.column; // return offset; // } // __forceinline__ __host__ __device__ int64_t - // Load(const Arguments &args, const MatrixCoord &coord) const { - // int64_t offset = coord.batch * 65536 * 32 + coord.row * 32 + coord.column; - // return offset; + // CalcOffset(const Arguments &args, const MatrixCoord &coord) const { + // int64_t offset = coord.batch * 65536 * 32 + coord.row * 32 + + // coord.column; return offset; // } __forceinline__ __host__ __device__ int64_t - Load(const Arguments &args, const MatrixCoord &coord) const { + CalcOffset(const Arguments &args, const MatrixCoord &coord) const { int64_t offset = coord.column; return offset; } __forceinline__ __host__ __device__ T operator()(T x, const Arguments &args, const MatrixCoord &coord) const { - int64_t offset = Load(args, coord); - T y = static_cast(args.in0_ptr[offset]); - T out = x + y; - return out; + //int64_t offset = CalcOffset(args, coord); + //T y = static_cast(args.in0_ptr[offset]); + //T out = x + y; + //return out; + return x; } template diff --git a/tests/ap/matmul/tests/kernel.cu b/tests/ap/matmul/tests/kernel.cu deleted file mode 100644 index 687cce3..0000000 --- a/tests/ap/matmul/tests/kernel.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include "cutlass_matmul.cuh" -#include "epilogue_op.h" -#include "profile.h" -#include - -namespace ap { - -void MatmulAddBroadcastKernel(cudaStream_t *stream, const void *input, - const void *weight, const void *bias, - void *broadcast, void *broadcast_out, - void *output, - const std::vector &input_shape, - const std::vector &weight_shape, - const std::vector &bias_shape, - bool need_broadcast) { - GemmBroadcastEpilogueParams params(*stream, input, weight, bias, broadcast, - broadcast_out, output, input_shape, - weight_shape, bias_shape, need_broadcast); - - CutlassMatmulAddBroadcast(params); -} - -} // namespace ap diff --git a/tests/ap/matmul/tests/kernel.h b/tests/ap/matmul/tests/kernel.h index 4f10f87..de19d33 100644 --- a/tests/ap/matmul/tests/kernel.h +++ b/tests/ap/matmul/tests/kernel.h @@ -18,23 +18,16 @@ void MatmulAddUnaryKernel(cudaStream_t *stream, const void *input, const std::vector &bias_shape, bool transpose_b); -void MatmulAddBroadcastKernel(cudaStream_t *stream, const void *input, - const void *weight, const void *bias, - void *broadcast, void *broadcast_out, - void *output, - const std::vector &input_shape, - const std::vector &weight_shape, - const std::vector &bias_shape, - bool need_broadcast); - void MatmulAddBinaryKernel( cudaStream_t *stream, const void *input, const void *weight, const void *bias, void *output, const std::vector &epilogue_ins, + const std::vector &epilogue_outs, const std::vector &input_shape, const std::vector &weight_shape, const std::vector &bias_shape, - const std::vector> &epilogue_shapes); + const std::vector> &epilogue_in_shapes, + const std::vector> &epilogue_out_shapes); } // namespace ap diff --git a/tests/ap/matmul/tests/matmul_binary_kernel.cu b/tests/ap/matmul/tests/matmul_binary_kernel.cu index b090f33..5b3d067 100644 --- a/tests/ap/matmul/tests/matmul_binary_kernel.cu +++ b/tests/ap/matmul/tests/matmul_binary_kernel.cu @@ -1,47 +1,44 @@ +#include "autotune.h" #include "cutlass_matmul.cuh" #include "default_config_id.h" #include "epilogue_op.h" -#include "profile.h" #include namespace ap { -template -static void RunMatmulAddBinaryKernel(const GemmEpilogueParams ¶ms) { -#if AP_USE_FLOAT16 - using ElementT = half; - using ElementComputeT = float; -#else - using ElementT = float; - using ElementComputeT = float; -#endif +struct MatmulAddBinaryRunner { + template + static void Apply(const GemmEpilogueParams ¶ms) { + using ElementT = KernelUtils::Type; + using ElementComputeT = float; - typename VariadicEpilogueFunctor::Arguments variadic_args; - if (params.epilogue_in_ptrs.size() > 0U) { - std::vector epilogue_in0_shape = params.epilogue_in_shapes[0]; - size_t begin = 3 - epilogue_in0_shape.size(); - int64_t stride = 1; - for (int i = epilogue_in0_shape.size() - 1; i >= 0; --i) { - variadic_args.in0_shape[begin + i] = epilogue_in0_shape[i]; - variadic_args.in0_strides[begin + i] = stride; - // std::cout << "stride[" << begin + i << "]=" << stride << std::endl; - stride *= epilogue_in0_shape[i]; + typename VariadicEpilogueFunctor::Arguments variadic_args; + if (params.epilogue_in_ptrs.size() > 0U) { + std::vector epilogue_in0_shape = params.epilogue_in_shapes[0]; + size_t begin = 3 - epilogue_in0_shape.size(); + int64_t stride = 1; + for (int i = epilogue_in0_shape.size() - 1; i >= 0; --i) { + variadic_args.in0_shape[begin + i] = epilogue_in0_shape[i]; + variadic_args.in0_strides[begin + i] = stride; + // std::cout << "stride[" << begin + i << "]=" << stride << std::endl; + stride *= epilogue_in0_shape[i]; + } + for (size_t i = 0; i < begin; ++i) { + variadic_args.in0_shape[i] = 1; + variadic_args.in0_strides[i] = stride; + // std::cout << "stride[" << i << "]=" << stride << std::endl; + } + variadic_args.in0_ptr = + reinterpret_cast(params.epilogue_in_ptrs[0]); } - for (size_t i = 0; i < begin; ++i) { - variadic_args.in0_shape[i] = 1; - variadic_args.in0_strides[i] = stride; - // std::cout << "stride[" << i << "]=" << stride << std::endl; - } - variadic_args.in0_ptr = - reinterpret_cast(params.epilogue_in_ptrs[0]); - } - constexpr int AlignA = 128 / cutlass::sizeof_bits::value; - constexpr int AlignB = 128 / cutlass::sizeof_bits::value; - CutlassMatmulAddVariadic(params, - variadic_args); -} + constexpr int AlignA = 128 / cutlass::sizeof_bits::value; + constexpr int AlignB = 128 / cutlass::sizeof_bits::value; + CutlassMatmulAddVariadic(params, + variadic_args); + } +}; void MatmulAddBinaryKernel( cudaStream_t *stream, const void *input, const void *weight, @@ -55,17 +52,13 @@ void MatmulAddBinaryKernel( const std::vector> &epilogue_out_shapes) { GemmEpilogueParams params(*stream, input, weight, bias, output, input_shape, weight_shape, bias_shape); - params.SetEpilogueAndShapes(epilogue_ins, epilogue_in_shapes, epilogue_outs, epilogue_out_shapes); + params.SetEpilogueAndShapes(epilogue_ins, epilogue_in_shapes, epilogue_outs, + epilogue_out_shapes); -#if AP_ENABLE_AUTOTUNE -#if AP_USE_FLOAT16 - AP_AUTOTUNE_half(RunMatmulAddBinaryKernel, *stream, params); -#else - AP_AUTOTUNE_float(RunMatmulAddBinaryKernel, *stream, params); -#endif -#else - RunMatmulAddBinaryKernel(params); -#endif + static int selected_config_id = -1; + selected_config_id = + RunWithAutotune( + *stream, selected_config_id, params); } } // namespace ap diff --git a/tests/ap/matmul/tests/matmul_kernel.cu b/tests/ap/matmul/tests/matmul_kernel.cu index 4cefe04..23d4b63 100644 --- a/tests/ap/matmul/tests/matmul_kernel.cu +++ b/tests/ap/matmul/tests/matmul_kernel.cu @@ -1,29 +1,21 @@ +#include "autotune.h" #include "cutlass_matmul.cuh" #include "default_config_id.h" #include "epilogue_op.h" -#include "profile.h" #include namespace ap { -template -static void RunMatmulKernel(const GemmEpilogueParams ¶ms) { -#if AP_USE_FLOAT16 - using ElementT = half; - using ElementComputeT = float; -#else - using ElementT = float; - using ElementComputeT = float; -#endif +struct MatmulRunner { + template + static void Apply(const GemmEpilogueParams ¶ms) { + using ElementT = KernelUtils::Type; + using ElementComputeT = float; - if (params.transpose_b) { - CutlassMatmul( - params); - } else { CutlassMatmul( params); } -} +}; void MatmulKernel(cudaStream_t *stream, const void *input, const void *weight, void *output, const std::vector &input_shape, @@ -32,15 +24,9 @@ void MatmulKernel(cudaStream_t *stream, const void *input, const void *weight, input_shape, weight_shape, std::vector{}, false, transpose_b); -#if AP_ENABLE_AUTOTUNE -#if AP_USE_FLOAT16 - AP_AUTOTUNE_half(RunMatmulKernel, *stream, params); -#else - AP_AUTOTUNE_float(RunMatmulKernel, *stream, params); -#endif -#else - RunMatmulKernel(params); -#endif + static int selected_config_id = -1; + selected_config_id = RunWithAutotune( + *stream, selected_config_id, params); } } // namespace ap diff --git a/tests/ap/matmul/tests/matmul_unary_kernel.cu b/tests/ap/matmul/tests/matmul_unary_kernel.cu index 4cce75c..d7d4e02 100644 --- a/tests/ap/matmul/tests/matmul_unary_kernel.cu +++ b/tests/ap/matmul/tests/matmul_unary_kernel.cu @@ -1,7 +1,7 @@ +#include "autotune.h" #include "cutlass_matmul.cuh" #include "default_config_id.h" #include "epilogue_op.h" -#include "profile.h" #include namespace ap { @@ -10,27 +10,23 @@ template // using UnaryEpilogueFunctor = ScaleFunctor; using UnaryEpilogueFunctor = IdentityFunctor; -template -static void RunMatmulAddUnaryKernel(const GemmEpilogueParams ¶ms) { -#if AP_USE_FLOAT16 - using ElementT = half; - using ElementComputeT = float; -#else - using ElementT = float; - using ElementComputeT = float; -#endif +struct MatmulAddUnaryRunner { + template + static void Apply(const GemmEpilogueParams ¶ms) { + using ElementT = KernelUtils::Type; + using ElementComputeT = float; - // typename UnaryEpilogueFunctor::Arguments unary_args{0.1}; - typename UnaryEpilogueFunctor::Arguments unary_args; + // typename UnaryEpilogueFunctor::Arguments + // unary_args{0.1}; + typename UnaryEpilogueFunctor::Arguments unary_args; - if (params.transpose_b) { + constexpr int AlignA = 128 / cutlass::sizeof_bits::value; + constexpr int AlignB = 128 / cutlass::sizeof_bits::value; CutlassMatmulAddUnary(params, unary_args); - } else { - CutlassMatmulAddUnary(params, unary_args); + AlignA, AlignB, TuningConfigId, ST>(params, + unary_args); } -} +}; void MatmulAddUnaryKernel(cudaStream_t *stream, const void *input, const void *weight, const void *bias, void *output, @@ -41,15 +37,10 @@ void MatmulAddUnaryKernel(cudaStream_t *stream, const void *input, GemmEpilogueParams params(*stream, input, weight, bias, output, input_shape, weight_shape, bias_shape, false, transpose_b); -#if AP_ENABLE_AUTOTUNE -#if AP_USE_FLOAT16 - AP_AUTOTUNE_half(RunMatmulAddUnaryKernel, *stream, params); -#else - AP_AUTOTUNE_float(RunMatmulAddUnaryKernel, *stream, params); -#endif -#else - RunMatmulAddUnaryKernel(params); -#endif + static int selected_config_id = -1; + selected_config_id = + RunWithAutotune( + *stream, selected_config_id, params); } } // namespace ap diff --git a/tests/ap/matmul/tests/run.sh b/tests/ap/matmul/tests/run.sh index 58e4cb8..400dbff 100755 --- a/tests/ap/matmul/tests/run.sh +++ b/tests/ap/matmul/tests/run.sh @@ -1,15 +1,19 @@ #!/bin/bash -export CUDA_VISIBLE_DEVICES="7" +export CUDA_VISIBLE_DEVICES="6" #export LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/work/abstract_pass/Athena/tests/ap/matmul/tests/build:$LD_LIBRARY_PATH -export PATH=/opt/nvidia/nsight-systems/2023.4.1/bin:$PATH +#export PATH=/opt/nvidia/nsight-systems/2023.4.1/bin:$PATH +export PATH=/opt/nvidia/nsight-systems/2025.1.1/bin:$PATH -nsys_args="nsys profile --stats true -w true -t cuda,nvtx,osrt,cudnn,cublas --capture-range=cudaProfilerApi -x true --force-overwrite true -o cutlass_matmul" +AP_WORKSPACE_PATH=/work/abstract_pass/Athena/tests/ap/ap_workspace/5855040424717010002 +export LD_LIBRARY_PATH=${AP_WORKSPACE_PATH}/main:${AP_WORKSPACE_PATH}/api_wrapper:$LD_LIBRARY_PATH +#nsys_args="nsys profile --stats true -w true -t cuda,nvtx,osrt,cudnn,cublas --capture-range=cudaProfilerApi -x true --force-overwrite true -o cutlass_matmul" +nsys_args="nsys profile --stats true -w true -t cuda,nvtx --gpu-metrics-devices=cuda-visible --gpuctxsw=true --capture-range=cudaProfilerApi -x true --force-overwrite true -o cutlass_matmul" #AP_LIB_DIR=/work/abstract_pass/Athena/tests/ap/ap_workspace/15859713568798564682/main #export LD_LIBRARY_PATH=${AP_LIB_DIR}:$LD_LIBRARY_PATH ldd ./build/test_main -${nsys_args} ./build/test_main +${nsys_args} ./build/test_main 1 4096 7168 16384 diff --git a/tests/ap/matmul/tests/test_main_matmul_binary.cc b/tests/ap/matmul/tests/test_main_matmul_binary.cc index 674871d..1c8c9ed 100644 --- a/tests/ap/matmul/tests/test_main_matmul_binary.cc +++ b/tests/ap/matmul/tests/test_main_matmul_binary.cc @@ -4,17 +4,17 @@ #include "test_util.h" template -void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n, - int k, bool add_bias, bool has_epilogue) { - bool transpose_b = false; +void TestMain(cudaStream_t stream, int batch_count, int m, int n, int k, + bool add_bias, bool has_epilogue) { + bool random = true; std::vector input_shape{batch_count, m, k}; std::vector weight_shape{k, n}; std::vector bias_shape; std::vector output_shape{batch_count, m, n}; - T *input = AllocateAndInit(stream, input_shape, false, 1.); - T *weight = AllocateAndInit(stream, weight_shape, false, 1.); + T *input = AllocateAndInit(stream, input_shape, random, 1.); + T *weight = AllocateAndInit(stream, weight_shape, random, 1.); T *bias = nullptr; if (add_bias) { @@ -25,11 +25,13 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n, for (size_t i = 0; i < bias_ref.size(); ++i) { bias_ref[i] = static_cast(1000 * (i % 11)); } - bias = AllocateAndInit(stream, bias_shape, false, 0., bias_ref); + bias = AllocateAndInit(stream, bias_shape, random, 0., bias_ref); } std::vector epilogue_ins; - std::vector> epilogue_shapes; + std::vector epilogue_outs; + std::vector> epilogue_in_shapes; + std::vector> epilogue_out_shapes; T *another = nullptr; if (has_epilogue) { @@ -43,7 +45,7 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n, another = AllocateAndInit(stream, another_shape, false, 0., another_ref); epilogue_ins = {another}; - epilogue_shapes = {another_shape}; + epilogue_in_shapes = {another_shape}; } T *output = AllocateAndInit(stream, output_shape, false, 0.); @@ -53,8 +55,9 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n, cudaMemsetAsync(output, 0, sizeof(T) * Product(output_shape), stream)); KERNEL_PROFILE(ap::MatmulAddBinaryKernel( - &stream, input, weight, bias, output, epilogue_ins, input_shape, - weight_shape, bias_shape, epilogue_shapes)); + &stream, input, weight, bias, output, epilogue_ins, epilogue_outs, + input_shape, weight_shape, bias_shape, epilogue_in_shapes, + epilogue_out_shapes)); Print(stream, reinterpret_cast(output), batch_count, m, n); @@ -67,6 +70,7 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n, } int main(int argc, const char *argv[]) { + std::cout << "TestName: test_main_matmul_binary" << std::endl; ProblemSizeArgs args = ParseArgs(argc, argv); cudaStream_t stream; @@ -76,11 +80,11 @@ int main(int argc, const char *argv[]) { bool has_epilogue = true; #if AP_USE_FLOAT16 - TestMatmulAddBinary(stream, args.batch_count, args.m, args.n, args.k, - add_bias, has_epilogue); + TestMain(stream, args.batch_count, args.m, args.n, args.k, add_bias, + has_epilogue); #else - TestMatmulAddBinary(stream, args.batch_count, args.m, args.n, args.k, - add_bias, has_epilogue); + TestMain(stream, args.batch_count, args.m, args.n, args.k, add_bias, + has_epilogue); #endif CHECK_CUDA(cudaStreamDestroy(stream)); diff --git a/tests/ap/matmul/tests/test_main_matmul_broadcast.cc b/tests/ap/matmul/tests/test_main_matmul_broadcast.cc deleted file mode 100644 index bf51615..0000000 --- a/tests/ap/matmul/tests/test_main_matmul_broadcast.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include - -#include "kernel.h" -#include "test_util.h" - -template void TestMatmulAddBroadcast(cudaStream_t stream) { - int batch_count = 1; - int m = 256; - int n = 512; - int k = 256; - bool need_broadcast = false; - - std::cout << "we are running for problem: [" << m << ", " << n << ", " << k - << "]" << std::endl; - - std::vector input_shape{batch_count, m, k}; - std::vector weight_shape{k, n}; - std::vector bias_shape{n}; - std::vector output_shape{batch_count, m, n}; - - T *input = AllocateAndInit(stream, input_shape, false, 1.); - T *weight = AllocateAndInit(stream, weight_shape, false, 1.); - - std::vector bias_ref; - bias_ref.resize(Product(bias_shape)); - for (size_t i = 0; i < bias_ref.size(); ++i) { - bias_ref[i] = static_cast(1000 * (i % 10)); - } - T *bias = AllocateAndInit(stream, bias_shape, false, 0., bias_ref); - - std::vector broadcast_shape = need_broadcast ? {m} : {m, n}; - std::vector broadcast_ref; - broadcast_ref.resize(Product(broadcast_shape)); - if (need_broadcast) { - for (size_t i = 0; i < broadcast_ref.size(); ++i) { - broadcast_ref[i] = static_cast(10000 * (i % 5)); - } - } else { - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - broadcast_ref[i * n + j] = static_cast(10000 * (i % 5)); - } - } - } - T *broadcast = - AllocateAndInit(stream, broadcast_shape, false, 0., broadcast_ref); - - T *output = AllocateAndInit(stream, output_shape, false, 0.); - T *broadcast_out = AllocateAndInit(stream, broadcast_shape, false, 0.); - CHECK_CUDA(cudaStreamSynchronize(stream)); - - CHECK_CUDA( - cudaMemsetAsync(output, 0, sizeof(T) * Product(output_shape), stream)); - CHECK_CUDA(cudaMemsetAsync(broadcast_out, 0, - sizeof(T) * Product(broadcast_shape), stream)); - ap::MatmulAddBroadcastKernel(&stream, input, weight, bias, broadcast, - broadcast_out, output, input_shape, weight_shape, - bias_shape, need_broadcast); - - Print(stream, reinterpret_cast(output), batch_count, m, n); - - cudaFree(input); - cudaFree(weight); - cudaFree(bias); - cudaFree(output); - cudaFree(broadcast); - cudaFree(broadcast_out); -} - -int main(int argc, const char *arg[]) { - cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); - - TestMatmulAddBroadcast(stream); - - CHECK_CUDA(cudaStreamDestroy(stream)); - return 0; -} diff --git a/tests/ap/matmul/tests/test_main_matmul_dlopen.cc b/tests/ap/matmul/tests/test_main_matmul_dlopen.cc new file mode 100644 index 0000000..efbded5 --- /dev/null +++ b/tests/ap/matmul/tests/test_main_matmul_dlopen.cc @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "test_util.h" + +typedef void (*MatmulFunc)(void *, void *, void *, void *, int64_t, int64_t, + int64_t, int64_t, void *); +typedef void (*ApiWrapperFunc)(void *, void *, void **); + +struct DlHandle { + explicit DlHandle(const std::string &main_so_path, + const std::string &api_wrapper_so_path) { + std::cout << "main_so_path: " << main_so_path << std::endl; + std::cout << "api_wrapper_so_path: " << api_wrapper_so_path << std::endl; + main_handle = dlopen(main_so_path.c_str(), RTLD_LAZY); + if (!main_handle) { + std::cerr << "Cannot open library: " << dlerror() << std::endl; + } + api_wrapper_handle = dlopen(api_wrapper_so_path.c_str(), RTLD_LAZY); + if (!api_wrapper_handle) { + std::cerr << "Cannot open library: " << dlerror() << std::endl; + } + } + + template void Call(Args &&...args) { + MatmulFunc func = (MatmulFunc)dlsym(main_handle, "MatmulVariadicKernel"); + const char *func_error = dlerror(); + if (func_error) { + std::cerr << "Cannot load symbol 'MatmulVariadicKernel': " << func_error + << std::endl; + } else { + func(std::forward(args)...); + } + } + + void CallWithApiWrapper(void **args) { + void *main_func = dlsym(main_handle, "MatmulVariadicKernel"); + const char *main_error = dlerror(); + if (main_error) { + std::cerr << "Cannot load symbol 'MatmulVariadicKernel': " << main_error + << std::endl; + } + + ApiWrapperFunc api_wrapper_func = + (ApiWrapperFunc)dlsym(api_wrapper_handle, "MatmulVariadicKernel"); + const char *api_wrapper_error = dlerror(); + if (api_wrapper_error) { + std::cerr << "Cannot load symbol 'MatmulVariadicKernel': " + << api_wrapper_error << std::endl; + } + + void *ret; + api_wrapper_func(ret, main_func, args); + } + + ~DlHandle() { + dlclose(main_handle); + dlclose(api_wrapper_handle); + } + + void *main_handle; + void *api_wrapper_handle; +}; + +template +void TestMain(const std::string &main_so_path, + const std::string &api_wrapper_so_path, cudaStream_t stream, + int batch_count, int m, int n, int k, bool add_bias) { + bool random = true; + + std::vector input_shape{batch_count, m, k}; + std::vector weight_shape{k, n}; + std::vector bias_shape; + std::vector output_shape{batch_count, m, n}; + + T *input = AllocateAndInit(stream, input_shape, random, 10.); + T *weight = AllocateAndInit(stream, weight_shape, random, 10.); + + T *bias = nullptr; + if (add_bias) { + // bias_shape = {n}; + bias_shape = {batch_count, m, n}; + std::vector bias_ref; + bias_ref.resize(Product(bias_shape)); + for (size_t i = 0; i < bias_ref.size(); ++i) { + bias_ref[i] = static_cast(1000 * (i % 11)); + } + bias = AllocateAndInit(stream, bias_shape, random, 0., bias_ref); + } + + T *output0 = AllocateAndInit(stream, output_shape, false, 0.); + T *output1 = AllocateAndInit(stream, output_shape, false, 0.); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + CHECK_CUDA( + cudaMemsetAsync(output0, 0, sizeof(T) * Product(output_shape), stream)); + CHECK_CUDA( + cudaMemsetAsync(output1, 0, sizeof(T) * Product(output_shape), stream)); + + void *stream_ptr = &stream; + int64_t input0_dim0 = batch_count; + int64_t input0_dim1 = m; + int64_t input0_dim2 = k; + int64_t input1_dim1 = n; + T *in_ptr_0 = bias; + + DlHandle handle(main_so_path, api_wrapper_so_path); + // KERNEL_PROFILE(handle.Call(stream_ptr, input, weight, output, input0_dim0, + // input0_dim1, input0_dim2, input1_dim1, + // in_ptr_0)); + + ap::GpuTimer gpu_timer(true); + for (int i = 0; i < 1010; ++i) { + if (i == 10) { + CHECK_CUDA(cudaStreamSynchronize(stream)); + gpu_timer.Start(stream); + } + std::vector args; + args.push_back(&stream_ptr); + args.push_back(&input); + args.push_back(&weight); + if (i % 2 == 0) { + args.push_back(&output0); + } else { + args.push_back(&output1); + } + args.push_back(&input0_dim0); + args.push_back(&input0_dim1); + args.push_back(&input0_dim2); + args.push_back(&input1_dim1); + args.push_back(&in_ptr_0); + // KERNEL_PROFILE(handle.CallWithApiWrapper(args.data())); + handle.CallWithApiWrapper(args.data()); + } + gpu_timer.Stop(stream); + + Print(stream, reinterpret_cast(output0), batch_count, m, n); + + cudaFree(input); + cudaFree(weight); + if (add_bias) { + cudaFree(bias); + } + cudaFree(output0); + cudaFree(output1); +} + +int main(int argc, const char *argv[]) { + std::cout << "TestName: test_main_matmul_dlopen" << std::endl; + ProblemSizeArgs args = ParseArgs(argc, argv); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + std::string main_so_path = "libmatmul_variadic_kernel.so"; + std::string api_wrapper_so_path = "api_wrapper.so"; + bool add_bias = false; + +#if AP_USE_FLOAT16 + TestMain(main_so_path, api_wrapper_so_path, stream, args.batch_count, + args.m, args.n, args.k, add_bias); +#else + TestMain(main_so_path, api_wrapper_so_path, stream, args.batch_count, + args.m, args.n, args.k, add_bias); +#endif + + CHECK_CUDA(cudaStreamDestroy(stream)); + return 0; +} diff --git a/tests/ap/matmul/tests/test_main_matmul_unary.cc b/tests/ap/matmul/tests/test_main_matmul_unary.cc index c9a5c74..5999cfe 100644 --- a/tests/ap/matmul/tests/test_main_matmul_unary.cc +++ b/tests/ap/matmul/tests/test_main_matmul_unary.cc @@ -4,8 +4,9 @@ #include "test_util.h" template -void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n, - int k, bool add_bias) { +void TestMain(cudaStream_t stream, int batch_count, int m, int n, int k, + bool add_bias) { + bool random = true; bool transpose_b = false; std::vector input_shape{batch_count, m, k}; @@ -13,8 +14,8 @@ void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n, std::vector bias_shape; std::vector output_shape{batch_count, m, n}; - T *input = AllocateAndInit(stream, input_shape, false, 1.); - T *weight = AllocateAndInit(stream, weight_shape, false, 1.); + T *input = AllocateAndInit(stream, input_shape, random, 1.); + T *weight = AllocateAndInit(stream, weight_shape, random, 1.); T *bias = nullptr; if (add_bias) { @@ -24,7 +25,7 @@ void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n, for (size_t i = 0; i < bias_ref.size(); ++i) { bias_ref[i] = static_cast(1000 * (i % 11)); } - bias = AllocateAndInit(stream, bias_shape, false, 0., bias_ref); + bias = AllocateAndInit(stream, bias_shape, random, 0., bias_ref); } T *output = AllocateAndInit(stream, output_shape, false, 0.); @@ -48,19 +49,18 @@ void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n, } int main(int argc, const char *argv[]) { + std::cout << "TestName: test_main_matmul_unary" << std::endl; ProblemSizeArgs args = ParseArgs(argc, argv); cudaStream_t stream; CHECK_CUDA(cudaStreamCreate(&stream)); - bool add_bias = true; + bool add_bias = false; #if AP_USE_FLOAT16 - TestMatmulAddUnary(stream, args.batch_count, args.m, args.n, args.k, - add_bias); + TestMain(stream, args.batch_count, args.m, args.n, args.k, add_bias); #else - TestMatmulAddUnary(stream, args.batch_count, args.m, args.n, args.k, - add_bias); + TestMain(stream, args.batch_count, args.m, args.n, args.k, add_bias); #endif CHECK_CUDA(cudaStreamDestroy(stream)); diff --git a/tests/ap/matmul_variadic_tpl.py b/tests/ap/matmul_variadic_tpl.py index 74819db..f2eb9e3 100644 --- a/tests/ap/matmul_variadic_tpl.py +++ b/tests/ap/matmul_variadic_tpl.py @@ -198,9 +198,9 @@ def make_project( #include #include +#include "autotune.h" #include "cutlass_matmul.cuh" #include "math_function.h" -#include "profile.h" namespace ap { @@ -212,28 +212,29 @@ def make_project( // Note: need to support vectorized operation __forceinline__ __host__ __device__ - T operator()(T x, const Arguments& args, const MatrixCoord& coord) const { - T out; - ${AP_EPILOGUE_COMPUTATION_STATEMENTS} - return out; + T operator()(T x, const Arguments& args) const { + return x; } }; -template -static void RunMatmulWithVariadicKernel(const GemmEpilogueParams ¶ms, ${AP_KERNEL_ARGS_DECLARE}) { - using ElementT = ${output_dtype}; - using ElementComputeT = float; +struct MatmulWithVariadicRunner { + template + static void Apply(const GemmEpilogueParams ¶ms, ${AP_KERNEL_ARGS_DECLARE}) { + using ElementT = ${output_dtype}; + using ElementComputeT = float; - typename VariadicEpilogueFunctor::Arguments epilogue_args; + typename VariadicEpilogueFunctor::Arguments epilogue_args; - ${AP_EPILOGUE_ARGUMENTS_INIT} + ${AP_EPILOGUE_ARGUMENTS_INIT} - constexpr int AlignA = Alignment::kValue; - constexpr int AlignB = Alignment::kValue; + constexpr int AlignA = Alignment::kValue; + constexpr int AlignB = Alignment::kValue; + //std::cout << "AlignA: " << AlignA << ", AlignB: " << AlignB << std::endl; - CutlassMatmulAddVariadic(params, epilogue_args); -} + CutlassMatmulAddUnary(params, epilogue_args); + } +}; } // namespace ap @@ -250,11 +251,8 @@ def make_project( ap::GemmEpilogueParams params( *cuda_stream_ptr, ${input0}, ${input1}, nullptr, ${output}, ${input0}_shape, ${input1}_shape, std::vector{}); -#if AP_ENABLE_AUTOTUNE - AP_AUTOTUNE_${output_dtype}(ap::RunMatmulWithVariadicKernel, *cuda_stream_ptr, params, ${AP_KERNEL_ARGS_CALL}); -#else - ap::RunMatmulWithVariadicKernel(params, ${AP_KERNEL_ARGS_CALL}); -#endif + static int selected_config_id = -1; + selected_config_id = ap::RunWithAutotune<${output_dtype}, ap::MatmulWithVariadicRunner>(*cuda_stream_ptr, selected_config_id, params, ${AP_KERNEL_ARGS_CALL}); } } """ @@ -290,7 +288,7 @@ def make_project( ) .replace( "${AP_EPILOGUE_ARGUMENTS_INIT}", - self.get_epilogue_arguments_init_str("epilogue_args", indent=" "), + self.get_epilogue_arguments_init_str("epilogue_args", indent=" "), ) .replace("${kernel_name}", self.kernel_name) .replace("${input0}", self.get_kernel_arg_id_var_name(input0_karg)) @@ -313,7 +311,7 @@ def make_project( compile_cmd + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0" ) - compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=1 -DAP_ENABLE_DEBUG=0" + compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=1" compile_cmd = ( compile_cmd + f" --shared {self.library_name}.cu -o lib{self.library_name}.so" diff --git a/tests/ap/paddle-tests/test_pcc.py b/tests/ap/paddle-tests/test_pcc.py index 29ec020..5a47cd3 100644 --- a/tests/ap/paddle-tests/test_pcc.py +++ b/tests/ap/paddle-tests/test_pcc.py @@ -32,8 +32,8 @@ def __init__(self, fn): super().__init__() self.fn = fn - def forward(self, x, y, b): - out = self.fn(x, y, b) + def forward(self, x, y, b, r): + out = self.fn(x, y, b, r) return out @@ -43,34 +43,41 @@ def setUp(self): self.prepare_data() def prepare_data(self): - self.dtype = "bfloat16" + self.dtype = "float16" - self.x_shape = [4, 65536, 128] + self.x_shape = [1, 4096, 16384] + #self.x_shape = [1, 4096, 2048] self.x = paddle.randn(self.x_shape, dtype=self.dtype) self.x.stop_gradient = False - self.y_shape = [128, 32] + self.y_shape = [16384, 7168] + #self.y_shape = [2048, 7168] self.y = paddle.randn(self.y_shape, dtype=self.dtype) self.y.stop_gradient = False - self.b_shape = [32] + self.b_shape = [1, 4096, 7168] self.b = paddle.randn(self.b_shape, dtype=self.dtype) self.b.stop_gradient = False + self.r = paddle.randn(self.b_shape, dtype=self.dtype) + self.r.stop_gradient = False + def run_with_dy2st(self, profile): - def matmul_add_relu(x, y, b): + def matmul_add_relu(x, y, b, r): out = paddle.matmul(x, y) - return paddle.nn.functional.relu(out + b) + #return paddle.nn.functional.relu(out + b) + return out + b #+ r net = CINNSubGraphNet(matmul_add_relu) input_spec = [ InputSpec(shape=self.x_shape, dtype=self.dtype), InputSpec(shape=self.y_shape, dtype=self.dtype), InputSpec(shape=self.b_shape, dtype=self.dtype), + InputSpec(shape=self.b_shape, dtype=self.dtype), ] net = utils.apply_to_static(net, False, input_spec) net.eval() - out = utils.run_with_profile(profile, net, self.x, self.y, self.b) + out = utils.run_with_profile(profile, net, self.x, self.y, self.b, self.r) return out def run_with_pcc(self, profile): @@ -83,25 +90,27 @@ def run_with_pcc(self, profile): def matmul_add_relu( x: pct.Tensor([B, M, K], DType), y: pct.Tensor([K, N], DType), - b: pct.Tensor([N], DType), + b: pct.Tensor([B, M, N], DType), + #r: pct.Tensor([B, M, N], DType), ): def epilogue(out): - return paddle.nn.functional.relu(out + b) + #return paddle.nn.functional.relu(out + b) + return out + b #+ r out = paddle.matmul(x, y) return epilogue(out) parent_dir = os.path.dirname(os.path.abspath(__file__)) fused_matmul = pcc.compile(matmul_add_relu, ap_path=parent_dir) - out = fused_matmul(self.x, self.y, self.b) + out = utils.run_with_profile(profile, fused_matmul, self.x, self.y, self.b)#, self.r) return out def test_matmul_add_relu(self): - profile = False + profile = True ap_out = self.run_with_pcc(profile=profile) - dy2st_out = self.run_with_dy2st(profile=profile) - if not profile: - utils.check_result(self.dtype, ap_out, dy2st_out) + #dy2st_out = self.run_with_dy2st(profile=profile) + #if not profile: + # utils.check_result(self.dtype, ap_out, dy2st_out) if __name__ == "__main__":