diff --git a/CMakeLists.txt b/CMakeLists.txt index 582ada3..eca64e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,30 +1,51 @@ -cmake_minimum_required(VERSION 3.7 FATAL_ERROR) -project (HPTT C CXX) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +project (HPTT VERSION 1.0.0 LANGUAGES C CXX) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(ENABLE_IBM OFF) +set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY ON) + +option(ENABLE_IBM OFF) + +if(NOT CMAKE_BUILD_TYPE) + set (CMAKE_BUILD_TYPE Release) +endif() + +include(CheckCXXCompilerFlag) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") set(ENABLE_IBM ON) endif() -if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qopenmp -xhost) -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(ENABLE_IBM) - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp) - else() - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp -march=native -mtune=native) + +if(DEFINED MARCH_FLAGS) + set(HPTT_ARCH_FLAGS ${MARCH_FLAGS}) +else() + if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + set(HPTT_ARCH_FLAGS -xhost) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(NOT ENABLE_IBM) + set(HPTT_ARCH_FLAGS -march=native) # -mtune=native + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(HPTT_ARCH_FLAGS -march=native) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") + set(HPTT_ARCH_FLAGS -silent -w -Mnovect) + # elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") + # set(HPTT_ARCH_FLAGS -qsmp=omp) + #elseif(CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(HPTT_ARCH_FLAGS -mcpu=native) endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp -march=native) -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -silent -w -Mnovect) -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL") - set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp) +endif() + +check_cxx_compiler_flag("${HPTT_ARCH_FLAGS}" __COMPILER_SUPPORTS_MARCH) +if(__COMPILER_SUPPORTS_MARCH) + set(HPTT_CXX_FLAGS "${HPTT_ARCH_FLAGS}") endif() if(ENABLE_AVX) @@ -35,26 +56,80 @@ elseif(ENABLE_IBM) set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -mtune=native -DHPTT_ARCH_IBM -maltivec -mabi=altivec) endif() +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "8.2") + set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -O2) +endif() + set(HPTT_SRCS src/hptt.cpp src/plan.cpp src/transpose.cpp src/utils.cpp) -add_library(hptt STATIC ${HPTT_SRCS}) -target_compile_features(hptt PUBLIC cxx_std_11) -target_include_directories(hptt PUBLIC ${PROJECT_SOURCE_DIR}/include) -#target_compile_definitions(hptt PRIVATE ${HPTT_CXX_COMPILE_DEFS}) -target_compile_options(hptt PUBLIC ${HPTT_CXX_FLAGS}) +add_library(hptt ${HPTT_SRCS}) + +add_library(hptt::hptt ALIAS hptt) + +target_include_directories(hptt + PUBLIC + $ + $ +) + +target_compile_options(hptt PRIVATE ${HPTT_CXX_FLAGS}) + +if(ENABLE_OPENMP) + find_package(OpenMP REQUIRED) + target_link_libraries(hptt PUBLIC OpenMP::OpenMP_CXX) +endif() + +set_target_properties(hptt PROPERTIES EXPORT_NAME hptt) + +# Install + +include(GNUInstallDirs) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/hptt) install(TARGETS hptt - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) - -set(HPTT_INCLUDES - include/compute_node.h - include/hptt_types.h - include/hptt.h - include/macros.h - include/plan.h - include/utils.h - include/transpose.h) - -install(FILES ${HPTT_INCLUDES} - DESTINATION ${CMAKE_INSTALL_PREFIX}/include) + EXPORT hptt-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} +) + + +install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hptt) + +#Export the targets to a script +install(EXPORT hptt-targets + FILE + hptt-targets.cmake + NAMESPACE + hptt:: + DESTINATION + ${INSTALL_CONFIGDIR} +) + +#Create a ConfigVersion.cmake file +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config-version.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) + +configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/hptt-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config.cmake + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} + PATH_VARS CMAKE_INSTALL_INCLUDEDIR +) + +#Install the config, configversion and custom find modules +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/hptt-config-version.cmake + DESTINATION ${INSTALL_CONFIGDIR} +) + + +export(EXPORT hptt-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/hptt-targets.cmake + NAMESPACE hptt::) + +#Register package in user's package registry +export(PACKAGE hptt) diff --git a/cmake/hptt-config.cmake.in b/cmake/hptt-config.cmake.in new file mode 100644 index 0000000..673e714 --- /dev/null +++ b/cmake/hptt-config.cmake.in @@ -0,0 +1,18 @@ + +@PACKAGE_INIT@ + +set(ENABLE_OPENMP @ENABLE_OPENMP@) + +if(ENABLE_OPENMP) + # include( CMakeFindDependencyMacro ) + find_package(OpenMP REQUIRED) +endif() + +if(NOT TARGET hptt::hptt) + include("${CMAKE_CURRENT_LIST_DIR}/hptt-targets.cmake") +endif() + +set(HPTT_FOUND TRUE) +set(HPTT_LIBRARIES hptt::hptt) +set(HPTT_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") + diff --git a/include/compute_node.h b/include/compute_node.h index b777857..a778f7c 100644 --- a/include/compute_node.h +++ b/include/compute_node.h @@ -15,11 +15,11 @@ class ComputeNode delete next; } - size_t start; //!< start index for at the current loop - size_t end; //!< end index for at the current loop - size_t inc; //!< increment for at the current loop - size_t lda; //!< stride of A w.r.t. the loop index - size_t ldb; //!< stride of B w.r.t. the loop index + int start; //!< start index for at the current loop + int end; //!< end index for at the current loop + int inc; //!< increment for at the current loop + int lda; //!< stride of A w.r.t. the loop index + int ldb; //!< stride of B w.r.t. the loop index ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called) }; diff --git a/include/hptt_types.h b/include/hptt_types.h index 170288e..ebc5796 100644 --- a/include/hptt_types.h +++ b/include/hptt_types.h @@ -1,7 +1,6 @@ #pragma once #include -#include #define REGISTER_BITS 256 // AVX #ifdef HPTT_ARCH_ARM diff --git a/include/plan.h b/include/plan.h index 2ff260d..27e6b3c 100644 --- a/include/plan.h +++ b/include/plan.h @@ -17,7 +17,7 @@ class ComputeNode; class Plan { public: - Plan() : rootNodes_(nullptr), numTasks_(0) { } + Plan() : numTasks_(0), rootNodes_(nullptr) { } Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop); diff --git a/include/transpose.h b/include/transpose.h index 82f0239..90dabae 100644 --- a/include/transpose.h +++ b/include/transpose.h @@ -252,12 +252,12 @@ class Transpose floatType alpha_; //!< scaling factor for A floatType beta_; //!< scaling factor for B int dim_; //!< dimension of the tensor - std::vector sizeA_; //!< size of A + std::vector sizeA_; //!< size of A std::vector perm_; //!< permutation - std::vector outerSizeA_; //!< outer sizes of A - std::vector outerSizeB_; //!< outer sizes of B - std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) - std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) + std::vector outerSizeA_; //!< outer sizes of A + std::vector outerSizeB_; //!< outer sizes of B + std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) + std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) std::vector threadIds_; //!< OpenMP threadIds of the threads involed in the transposition int numThreads_; int selectedParallelStrategyId_; diff --git a/include/utils.h b/include/utils.h index a85b27c..3937f7b 100644 --- a/include/utils.h +++ b/include/utils.h @@ -16,6 +16,7 @@ template static floatType conj(floatType x){ return std::conj(x); } + template<> float conj(float x){ return x; @@ -66,7 +67,7 @@ void getPrimeFactors( int n, std::list &primeFactors ); template int findPos(t value, const std::vector &array) { - for(int i=0;i < array.size() ; ++i) + for(size_t i = 0; i < array.size(); ++i) if( array[i] == value ) return i; return -1; diff --git a/src/hptt.cpp b/src/hptt.cpp index 82d4e73..c3cafe0 100644 --- a/src/hptt.cpp +++ b/src/hptt.cpp @@ -180,8 +180,10 @@ void cTensorTranspose( const int *perm, const int dim, const float _Complex beta, float _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { + const hptt::FloatComplex* calpha = reinterpret_cast(&alpha); + const hptt::FloatComplex* cbeta = reinterpret_cast(&beta); auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + (const hptt::FloatComplex*) A, *calpha, (hptt::FloatComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } @@ -191,8 +193,10 @@ void zTensorTranspose( const int *perm, const int dim, const double _Complex beta, double _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { + const hptt::DoubleComplex* calpha = reinterpret_cast(&alpha); + const hptt::DoubleComplex* cbeta = reinterpret_cast(&beta); auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + (const hptt::DoubleComplex*) A, *calpha, (hptt::DoubleComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } diff --git a/src/plan.cpp b/src/plan.cpp index 7c5b9bc..d1d5d13 100644 --- a/src/plan.cpp +++ b/src/plan.cpp @@ -6,7 +6,7 @@ namespace hptt { - Plan::Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop) : rootNodes_(nullptr), loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop) { + Plan::Plan(std::vectorloopOrder, std::vectornumThreadsAtLoop) : loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop), rootNodes_(nullptr) { numTasks_ = 1; for(auto nt : numThreadsAtLoop) numTasks_ *= nt; diff --git a/src/transpose.cpp b/src/transpose.cpp index f77cd5b..03ec97e 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -704,11 +704,11 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r } else { if( useStreamingStores) if( conjA ) -#pragma vector nontemporal +// #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) B[i] = alpha * conj(A[i]); else -#pragma vector nontemporal +// #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) B[i] = alpha * A[i]; else @@ -742,12 +742,12 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r beta_(beta), dim_(-1), numThreads_(numThreads), - masterPlan_(nullptr), - selectionMethod_(selectionMethod), - maxAutotuningCandidates_(-1), selectedParallelStrategyId_(-1), selectedLoopOrderId_(-1), - conjA_(false) + conjA_(false), + masterPlan_(nullptr), + selectionMethod_(selectionMethod), + maxAutotuningCandidates_(-1) { #ifdef _OPENMP omp_init_lock(&writelock); @@ -793,12 +793,6 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r alpha_(other.alpha_), beta_(other.beta_), dim_(other.dim_), - numThreads_(other.numThreads_), - masterPlan_(other.masterPlan_), - selectionMethod_(other.selectionMethod_), - selectedParallelStrategyId_(other.selectedParallelStrategyId_), - selectedLoopOrderId_(other.selectedLoopOrderId_), - maxAutotuningCandidates_(other.maxAutotuningCandidates_), sizeA_(other.sizeA_), perm_(other.perm_), outerSizeA_(other.outerSizeA_), @@ -806,7 +800,14 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r lda_(other.lda_), ldb_(other.ldb_), threadIds_(other.threadIds_), - conjA_(other.conjA_) + numThreads_(other.numThreads_), + selectedParallelStrategyId_(other.selectedParallelStrategyId_), + selectedLoopOrderId_(other.selectedLoopOrderId_), + conjA_(other.conjA_), + masterPlan_(other.masterPlan_), + selectionMethod_(other.selectionMethod_), + maxAutotuningCandidates_(other.maxAutotuningCandidates_) + { #ifdef _OPENMP omp_init_lock(&writelock); @@ -834,19 +835,21 @@ void Transpose::executeEstimate(const Plan *plan) noexcept #ifdef _OPENMP #pragma omp parallel for num_threads(numThreads_) if(numThreads_ > 1) #endif + + const floatType* __restrict__ Bnext__ = B_; for( int taskId = 0; taskId < numTasks; taskId++) if ( perm_[0] != 0 ) { auto rootNode = plan->getRootNode_const( taskId ); if( std::abs(beta_) < getZeroThreshold() ) { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); } else { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode); } } else { auto rootNode = plan->getRootNode_const( taskId ); @@ -880,7 +883,7 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, ) } else { if( useStreamingStores) -#pragma vector nontemporal +// #pragma vector nontemporal HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) @@ -918,7 +921,7 @@ static void axpy_2D( const floatType* __restrict__ A, const int lda, if( useStreamingStores) HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) -_Pragma("vector nontemporal") +// _Pragma("vector nontemporal") for(int32_t i = 0; i < n0; i++) if( conjA ) B[i + j * ldb] = alpha * conj(A[i + j * lda]); @@ -1007,17 +1010,18 @@ void Transpose::execute_expert() noexcept } const int numTasks = masterPlan_->getNumTasks(); - const int numThreads = numThreads_; + // const int numThreads = numThreads_; getStartEnd(numTasks, myStart, myEnd); + const floatType* __restrict__ Bnext__ = B_; HPTT_DUPLICATE(spawnThreads, for( int taskId = myStart; taskId < myEnd; taskId++) if ( perm_[0] != 0 ) { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_, B_, Bnext__, alpha_, beta_, rootNode); else - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_, B_, Bnext__, alpha_, beta_, rootNode); } else { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) @@ -1259,7 +1263,7 @@ void Transpose::getBestParallelismStrategy ( std::vector &bestPa float lb2 = getLoadBalance(strat2); // printVector(strat2,"strat2"); // printf("strat2: %f\n",getLoadBalance(strat2)); - if( lb1 > 0.8 && lb2 < 0.85 || lb1 >lb2 && lb1 > 0.75 ) + if( (lb1 > 0.8 && lb2 < 0.85) || (lb1 > lb2 && lb1 > 0.75) ) { std::copy(strat1.begin(), strat1.end(), bestParallelismStrategy.begin()); return; @@ -1514,11 +1518,11 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const } // compact arrays (remove -1) for(int i=0;i < dim ; ++i) - if( sizeA_[i] == -1 ) + if( (int)sizeA_[i] == -1 ) { int j=i+1; for(;j < dim ; ++j) - if( sizeA_[j] != -1 ) + if( (int)sizeA_[j] != -1 ) break; if( j < dim ) std::swap(sizeA_[i], sizeA_[j]); @@ -1614,8 +1618,8 @@ void Transpose::fuseIndices() int toMerge = i; perm.push_back(perm_[i]); while(i+1 < dim_ && perm_[i] + 1 == perm_[i+1] - && (sizeA_[perm_[i]] == outerSizeA_[perm_[i]]) - && (sizeA_[perm_[i]] == outerSizeB_[i]) ){ + && ((int)sizeA_[perm_[i]] == outerSizeA_[perm_[i]]) + && ((int)sizeA_[perm_[i]] == outerSizeB_[i]) ){ #ifdef DEBUG fprintf(stderr,"[HPTT] MERGING indices %d and %d\n",perm_[i], perm_[i+1]); #endif @@ -1641,11 +1645,11 @@ void Transpose::fuseIndices() perm_ = perm; // remove gaps in the perm, if requried (e.g., perm=3,1,0 -> 2,1,0) int currentValue = 0; - for(int i=0;i < perm_.size(); ++i){ + for(size_t i = 0;i < perm_.size(); ++i){ //find smallest element in perm_ and rename it to currentValue int minValue = 1000000; int minPos = -1; - for(int pos=0; pos < perm_.size(); ++pos){ + for(int pos = 0; pos < (int)perm_.size(); ++pos){ if ( perm_[pos] >= currentValue && perm_[pos] < minValue) { minValue = perm_[pos]; minPos = pos; @@ -1917,10 +1921,10 @@ void Transpose::createPlans( std::vector > &pla // heuristics, search the space with a growing rectangle (from best to worst, // see line marked with ***) bool done = false; - for( int start= 0; start< std::max( parallelismStrategies.size(), loopOrders.size() ) && !done; start++ ) - for( int i = 0; i < parallelismStrategies.size() && !done; i++) + for( size_t start= 0; start< std::max( parallelismStrategies.size(), loopOrders.size() ) && !done; start++ ) + for( size_t i = 0; i < parallelismStrategies.size() && !done; i++) { - for( int j = 0; j < loopOrders.size() && !done; j++) + for( size_t j = 0; j < loopOrders.size() && !done; j++) { if( i > start || j > start || (i != start && j != start) ) continue; //these are already done *** @@ -1975,10 +1979,10 @@ void Transpose::createPlans( std::vector > &pla } } plans.push_back(plan); - if( selectionMethod_ == ESTIMATE || - selectionMethod_ == MEASURE && plans.size() > 200 || - selectionMethod_ == PATIENT && plans.size() > 400 || - selectionMethod_ == CRAZY && plans.size() > 800 ) + if( (selectionMethod_ == ESTIMATE) || + (selectionMethod_ == MEASURE && plans.size() > 200) || + (selectionMethod_ == PATIENT && plans.size() > 400) || + (selectionMethod_ == CRAZY && plans.size() > 800) ) done = true; } } @@ -2078,7 +2082,7 @@ std::shared_ptr Transpose::selectPlan( const std::vectorinfoLevel_ > 0 ) - printf("We evaluated %d/%d candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); + printf("We evaluated %d/%ld candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); } return plans[bestPlan_id]; }