diff --git a/Makefile b/Makefile index 9283bc6..a11369d 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,14 @@ -CXX_FLAGS=-O3 -std=c++11 -DNDEBUG +CXX_FLAGS=-O3 -std=c++11 INCLUDE_PATH=-I./include/ ifeq ($(CXX),icpc) CXX_FLAGS += -qopenmp -xhost else ifeq ($(CXX),g++) -CXX_FLAGS += -fopenmp -march=native +CXX_FLAGS += -fopenmp -march=native -Wno-vla else ifeq ($(CXX),clang++) -CXX_FLAGS += -fopenmp -march=native +CXX_FLAGS += -fopenmp -march=native -fsanitize=address -Wno-vla-extension endif endif endif diff --git a/benchmark/Makefile b/benchmark/Makefile index c25f9c1..d01f2df 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -28,8 +28,11 @@ LIBS=-lhptt all: ${OBJ} ${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o benchmark.exe +LIB_PATH+= ${OPENMP_LIB_PATH} +LIBS+= -lomp + %.o: %.cpp - ${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ + ${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ clean: rm -rf *.o benchmark.exe diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index c07a56c..4d4c520 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -192,7 +192,7 @@ int main(int argc, char *argv[]) restore(B, B_ref, total_size); trashCache(trash1, trash2, largerThanL3); auto begin_time = omp_get_wtime(); - transpose_ref( size, perm, dim, A, alpha, B_ref, beta, false); + transpose_ref( size, perm, dim, A, alpha, nullptr, nullptr, 1, B_ref, beta, nullptr, nullptr, 1, false); double elapsed_time = omp_get_wtime() - begin_time; minTime = (elapsed_time < minTime) ? 
elapsed_time : minTime; } diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh old mode 100644 new mode 100755 diff --git a/benchmark/maxFromFiles.py b/benchmark/maxFromFiles.py index f3b39e6..ea60c1c 100644 --- a/benchmark/maxFromFiles.py +++ b/benchmark/maxFromFiles.py @@ -36,5 +36,5 @@ else: f2.write(f2Content[i]) except: - print "ERROR:", i + print("ERROR:", i) f2.close() diff --git a/benchmark/reference.cpp b/benchmark/reference.cpp index a74318a..3a18e28 100644 --- a/benchmark/reference.cpp +++ b/benchmark/reference.cpp @@ -16,68 +16,120 @@ template<typename floatType> void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const floatType* __restrict__ A, floatType alpha, - floatType* __restrict__ B, floatType beta, const bool conjA) + const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA, + floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA) { - // compute stride for all dimensions w.r.t. A + std::vector<int> tempOuterSizeA, tempOuterSizeB, tempOffsetA, tempOffsetB, tempPointerB; + + // Stride One is location of 0 in perm.
Perm[0] may not be B stride one unless perm[0] == 0 + // perm provided yields positions in A data from a B order index + // perm calculated below relates positions in B data to an A order index + tempPointerB.resize(dim); + for (int i = 0; i < dim; ++i) + for (int j = 0; j < dim; ++j) + if (i == perm[j]) + tempPointerB[i] = j; + + // Use default values if any of the pointers are nullptr + if (outerSizeA == nullptr) { + tempOuterSizeA.resize(dim); + for (int i = 0; i < dim; i++) tempOuterSizeA[i] = size[i]; + outerSizeA = tempOuterSizeA.data(); + } + + if (outerSizeB == nullptr) { + tempOuterSizeB.resize(dim); + for (int i = 0; i < dim; i++) tempOuterSizeB[i] = size[perm[i]]; + outerSizeB = tempOuterSizeB.data(); + } + + if (offsetA == nullptr) { + tempOffsetA.resize(dim); // Default to zeros + for (int i = 0; i < dim; i++) tempOffsetA[i] = 0; + offsetA = tempOffsetA.data(); + } + + if (offsetB == nullptr) { + tempOffsetB.resize(dim); // Default to zeros + for (int i = 0; i < dim; i++) tempOffsetB[i] = 0; + offsetB = tempOffsetB.data(); + } + + // compute stride for all dimensions w.r.t. A (like lda) uint32_t strideA[dim]; - strideA[0] = 1; + strideA[0] = innerStrideA; + for(int i=1; i < dim; ++i) + strideA[i] = strideA[i-1] * outerSizeA[i-1]; + + // compute stride for all dimensions w.r.t. 
B (like ldb) + uint32_t strideB[dim]; + strideB[0] = innerStrideB; for(int i=1; i < dim; ++i) - strideA[i] = strideA[i-1] * size[i-1]; + strideB[i] = strideB[i-1] * outerSizeB[i-1]; // combine all non-stride-one dimensions of B into a single dimension for // maximum parallelism uint32_t sizeOuter = 1; for(int i=0; i < dim; ++i) if( i != perm[0] ) - sizeOuter *= size[i]; + sizeOuter *= size[i]; uint32_t sizeInner = size[perm[0]]; + uint32_t strideAinner = strideA[perm[0]]; // This implementation traverses the output tensor in a linear fashion #pragma omp parallel for for(uint32_t j=0; j < sizeOuter; ++j) { - uint32_t offsetA = 0; - uint32_t offsetB = 0; - uint32_t j_tmp = j; + uint32_t indexOffsetA = 0; + uint32_t indexOffsetB = 0; + uint32_t j_tmp_A = j; + uint32_t j_tmp_B = j; for(int i=1; i < dim; ++i) { - int current_index = j_tmp % size[perm[i]]; - j_tmp /= size[perm[i]]; - offsetA += current_index * strideA[perm[i]]; + int current_index_A = j_tmp_A % size[perm[i]]; + j_tmp_A /= size[perm[i]]; + j_tmp_B /= size[perm[i]]; + indexOffsetA += (current_index_A + offsetA[perm[i]]) * strideA[perm[i]]; + indexOffsetB += (j_tmp_B + 1) * offsetB[i] * strideB[i]; + indexOffsetB += j_tmp_B * (outerSizeB[i] - offsetB[i] - size[perm[i]]) * strideB[i]; } - const floatType* __restrict__ A_ = A + offsetA; - floatType* __restrict__ B_ = B + j*sizeInner; - - uint32_t strideAinner = strideA[perm[0]]; + const floatType* __restrict__ A_ = A + indexOffsetA; + floatType* __restrict__ B_ = B + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB); if( beta == (floatType) 0 ) - for(int i=0; i < sizeInner; ++i) + for(int i=0; i < sizeInner; ++i) { +#ifdef DEBUG + //printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]); +#endif if( conjA ) - B_[i] = alpha * 
std::conj(A_[i * strideAinner]); + B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real(); else - B_[i] = alpha * A_[i * strideAinner]; + B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner];} else - for(int i=0; i < sizeInner; ++i) + for(int i=0; i < sizeInner; ++i) { +#ifdef DEBUG + //printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]); +#endif if( conjA ) - B_[i] = alpha * std::conj(A_[i * strideAinner]) + beta * B_[i]; + B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real() + beta * B_[i * innerStrideB]; else - B_[i] = alpha * A_[i * strideAinner] + beta * B_[i]; + B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner] + beta * B_[i * innerStrideB];} } } template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const float* __restrict__ A, float alpha, - float* __restrict__ B, float beta, const bool conjA); + const float* __restrict__ A, float alpha, int *outerSizeA, int *offsetA, int innerStrideA, + float* __restrict__ B, float beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const FloatComplex* __restrict__ A, FloatComplex alpha, - FloatComplex* __restrict__ B, FloatComplex beta, const bool conjA); + const FloatComplex* __restrict__ A, FloatComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA, + FloatComplex* __restrict__ B, FloatComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const double* __restrict__ A, double alpha, - double* __restrict__ B, double beta, const bool conjA); + const double* __restrict__ A, double alpha, 
int *outerSizeA, int *offsetA, int innerStrideA, + double* __restrict__ B, double beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const DoubleComplex* __restrict__ A, DoubleComplex alpha, - DoubleComplex* __restrict__ B, DoubleComplex beta, const bool conjA); + const DoubleComplex* __restrict__ A, DoubleComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA, + DoubleComplex* __restrict__ B, DoubleComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); + diff --git a/benchmark/reference.h b/benchmark/reference.h index 4f4a4a8..6209f84 100644 --- a/benchmark/reference.h +++ b/benchmark/reference.h @@ -2,5 +2,5 @@ template<typename floatType> void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const floatType* __restrict__ A, floatType alpha, - floatType* __restrict__ B, floatType beta, const bool conjA); + const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA, + floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); diff --git a/include/compute_node.h b/include/compute_node.h index b777857..8a31d95 100644 --- a/include/compute_node.h +++ b/include/compute_node.h @@ -5,10 +5,10 @@ namespace hptt { /** * \brief A ComputNode encodes a loop. */ -class ComputeNode +class alignas(16) ComputeNode { public: - ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), next(nullptr) {} + ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(false), indexB(false), offDiffAB(std::numeric_limits<int>::min()), next(nullptr) {} ~ComputeNode() { if ( next != nullptr ) @@ -20,6 +20,9 @@ class ComputeNode size_t inc; //!< increment for at the current loop size_t lda; //!< stride of A w.r.t. the loop index size_t ldb; //!< stride of B w.r.t.
the loop index + bool indexA; //!< true if index of A is innermost (0) + bool indexB; //!< true if index of B is innermost (0) + int offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called) }; diff --git a/include/hptt.h b/include/hptt.h index dda7f0a..31e56d8 100644 --- a/include/hptt.h +++ b/include/hptt.h @@ -168,12 +168,26 @@ namespace hptt { * * This parameter may be NULL, indicating that the outer-size is equal to sizeA. * * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A. + * * This parameter may be NULL, indicating that the innerStrideA is equal to 1. + * * If innerStrideA is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideA. * \param[in] beta scaling factor for B * \param[inout] B Pointer to the raw-data of the output tensor B * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B. * * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). * * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B + * * This parameter may be NULL, indicating that the offset is zero. 
+ * * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B. + * * This parameter may be NULL, indicating that the innerStrideB is equal to 1. + * * If innerStrideB is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideB. * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details. * ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE) * then the output data will be used during the @@ -186,6 +200,8 @@ namespace hptt { * HPTT from within a parallel region (i.e., via execute_expert()). * \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major). */ +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets + * or innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). */ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, @@ -210,7 +226,8 @@ std::shared_ptr > create_plan( const int *perm, c const SelectionMethod selectionMethod, const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets + * or innerStride, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ std::shared_ptr > create_plan( const std::vector &perm, const int dim, const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const float beta, float *B, const std::vector &outerSizeB, @@ -235,8 +252,8 @@ std::shared_ptr > create_plan( const std::vector< const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); - - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, @@ -260,6 +277,162 @@ std::shared_ptr > create_plan( const int *perm, c const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int maxAutotuningCandidates, const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, no innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, no innerStrides, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, --int-- innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, --int-- innerStrides, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and 
DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); } extern "C" @@ -282,12 +455,26 @@ extern "C" * * This parameter may be NULL, indicating that the outer-size is equal to 
sizeA. * * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A. + * * This parameter may be NULL, indicating that the innerStrideA is equal to 1. + * * If innerStrideA is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideA. * \param[in] beta scaling factor for B * \param[inout] B Pointer to the raw-data of the output tensor B * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B. * * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). * * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B. + * * This parameter may be NULL, indicating that the innerStrideB is equal to 1. + * * If innerStrideB is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideB. * \param[in] numThreads number of threads that participate in this tensor transposition. 
* \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major). */ @@ -310,4 +497,46 @@ void zTensorTranspose( const int *perm, const int dim, const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const double _Complex beta, double _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor = 0); + +/* With Offsets */ +void sOffsetTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +void dOffsetTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +void cOffsetTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +void zOffsetTensorTranspose( const int *perm, const int dim, + const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +/* With Offsets and InnerStrides */ +void sInnerStrideTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int 
*offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); + +void dInnerStrideTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); + +void cInnerStrideTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); + +void zInnerStrideTensorTranspose( const int *perm, const int dim, + const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); } diff --git a/include/transpose.h b/include/transpose.h index 82f0239..8b42147 100644 --- a/include/transpose.h +++ b/include/transpose.h @@ -53,12 +53,22 @@ class Transpose * * This parameter may be NULL, indicating that the outer-size is equal to sizeA. * * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. 
+ * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A. * \param[in] beta scaling factor for B * \param[inout] B Pointer to the raw-data of the output tensor B * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B. * * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). * * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B. * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details. 
* ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE) * then the output data will be used during the @@ -77,6 +87,10 @@ class Transpose const int *perm, const int *outerSizeA, const int *outerSizeB, + const int *offsetA, + const int *offsetB, + const size_t innerStrideA, + const size_t innerStrideB, const int dim, const floatType *A, const floatType alpha, @@ -211,7 +225,7 @@ class Transpose void createPlans( std::vector > &plans ) const; std::shared_ptr selectPlan( const std::vector > &plans ); void fuseIndices(); - void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int dim); + void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int *_offsetA, const int *_offsetB, const int dim); void computeLeadingDimensions(); double loopCostHeuristic( const std::vector &loopOrder ) const; double parallelismCostHeuristic( const std::vector &loopOrder ) const; @@ -233,7 +247,8 @@ class Transpose const std::vector &loopsAllowed) const; float getLoadBalance( const std::vector ¶llelismStrategy ) const; float estimateExecutionTime( const std::shared_ptr plan); //execute just a few iterations and extrapolate the result - void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const; + void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, + const int* offsetB, const size_t innerStrideA, const size_t innerStrideB, const int dim) const; void getBestParallelismStrategy ( std::vector &bestParallelismStrategy ) const; void getBestLoopOrder( std::vector &loopOrder ) const; //innermost loop idx is stored at dim_-1 void getLoopOrders(std::vector > &loopOrders) const; @@ -256,6 +271,10 @@ class Transpose std::vector perm_; //!< permutation std::vector outerSizeA_; //!< outer sizes of A std::vector outerSizeB_; //!< outer sizes of B + std::vector 
offsetA_; //!< offsets of A + std::vector offsetB_; //!< offsets of B + size_t innerStrideA_; //!< innerStride of A + size_t innerStrideB_; //!< innerStride of B std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) std::vector threadIds_; //!< OpenMP threadIds of the threads involed in the transposition diff --git a/include/utils.h b/include/utils.h index a85b27c..b2b1147 100644 --- a/include/utils.h +++ b/include/utils.h @@ -76,8 +76,8 @@ int findPos(int value, const int *array, int n); int factorial(int n); -void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *perm, - int *tmpSizeA, int *tmpOuterSizeA, int *tmpouterSizeB, int *tmpPerm, const int dim, const bool useRowMajor); +void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int *perm, + int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpOffsetA, int *tmpOffsetB, int *tmpPerm, const int dim, const bool useRowMajor); } diff --git a/src/hptt.cpp b/src/hptt.cpp index ea761c8..4a31be0 100644 --- a/src/hptt.cpp +++ b/src/hptt.cpp @@ -19,13 +19,15 @@ namespace hptt { +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); return plan; } @@ -35,7 +37,7 @@ std::shared_ptr > create_plan( const int *perm, const in const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); return plan; } @@ -45,7 +47,7 @@ std::shared_ptr > create_plan( const int *perm, co const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); return plan; } @@ -55,19 +57,19 @@ std::shared_ptr > create_plan( const int *perm, c const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + auto 
plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); return plan; } - - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * no Offsets or innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */ std::shared_ptr > create_plan( const std::vector &perm, const int dim, const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const float beta, float *B, const std::vector &outerSizeB, const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); return plan; } @@ -77,7 +79,7 @@ std::shared_ptr > create_plan( const std::vector &p const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); return plan; } @@ -87,7 +89,7 @@ std::shared_ptr > create_plan( const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); return plan; } @@ -97,18 +99,19 @@ std::shared_ptr > create_plan( const std::vector< const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); return plan; } - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); plan->setMaxAutotuningCandidates(maxAutotuningCandidates); plan->createPlan(); return plan; @@ -120,7 +123,7 @@ std::shared_ptr > create_plan( const int *perm, const in const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); plan->setMaxAutotuningCandidates(maxAutotuningCandidates); plan->createPlan(); return plan; @@ -132,7 +135,7 @@ std::shared_ptr > create_plan( const int *perm, co const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); plan->createPlan(); return plan; } @@ -143,7 +146,274 @@ std::shared_ptr > create_plan( const int *perm, c const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, 
perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int 
*threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, no innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, 
offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int 
*sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, --int-- innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + 
return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); plan->setMaxAutotuningCandidates(maxAutotuningCandidates); plan->createPlan(); return plan; @@ -157,7 +427,7 @@ void sTensorTranspose( const int *perm, const int dim, const float beta, float *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->execute(); } @@ -166,45 +436,115 @@ void dTensorTranspose( const int *perm, const int dim, const double beta, double *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->execute(); } void cTensorTranspose( const int *perm, const int dim, const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, - const float _Complex beta, float _Complex *B, const int *outerSizeB, + const float _Complex beta, 
float _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, + (const hptt::FloatComplex*) A, hptt::FloatComplex(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, hptt::FloatComplex(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } void zTensorTranspose( const int *perm, const int dim, const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, - const double _Complex beta, double _Complex *B, const int *outerSizeB, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, + (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } -} - - - - - - - - +/* With Offset */ +void sOffsetTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, 
outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, + dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void dOffsetTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, + dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void cOffsetTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, + (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + plan->execute(); +} +void zOffsetTensorTranspose( const int *perm, const int dim, + const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, + (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + 
plan->execute(); +} +/* With Offset and innerStride */ +void sInnerStrideTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void dInnerStrideTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void cInnerStrideTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + plan->execute(); +} +void zInnerStrideTensorTranspose( const int *perm, const int dim, + const 
double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + plan->execute(); +} +} diff --git a/src/transpose.cpp b/src/transpose.cpp index f77cd5b..a59de89 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -34,24 +34,30 @@ namespace hptt { template struct micro_kernel { - static void execute(const floatType* __restrict__ A, const size_t lda, floatType* __restrict__ B, const size_t ldb, const floatType alpha, const floatType beta) + static void execute(const floatType* __restrict__ A, const size_t lda, const size_t innerStrideA, floatType* __restrict__ B, const size_t ldb, const size_t innerStrideB, const floatType alpha, const floatType beta) { constexpr int n = (REGISTER_BITS/8) / sizeof(floatType); if( betaIsZero ) for(int j=0; j < n; ++j) - for(int i=0; i < n; ++i) + for(int i=0; i < n; ++i){ +#ifdef DEBUG + //printf("B[+%zu] = %e -> A[+%zu] = %e\n", (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)], (j * innerStrideA) + (lda * i), A[(j * innerStrideA) + (lda * i)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]); + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(j * innerStrideA) + (lda * i)]); else - B[i + j * ldb] = alpha * A[i * lda + j]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(j * innerStrideA) + (lda * i)];} else for(int j=0; j < n; ++j) - for(int i=0; i < n; ++i) + for(int i=0; i < n; ++i) 
{ +#ifdef DEBUG + //printf("B[+%zu] = %e -> A[+%zu] = %e\n", (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)], (j * innerStrideA) + (lda * i), A[(j * innerStrideA) + (lda * i)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(j * innerStrideA) + (lda * i)]) + beta * B[(i * innerStrideB) + (j * ldb)]; else - B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(j * innerStrideA) + (lda * i)] + beta * B[(i * innerStrideB) + (j * ldb)];} } }; @@ -77,15 +83,28 @@ static INLINE void prefetch(const floatType* A, const int lda) template struct micro_kernel { - static void execute(const double* __restrict__ A, const size_t lda, double* __restrict__ B, const size_t ldb, const double alpha ,const double beta) + static void execute(const double* __restrict__ A, const size_t lda, const size_t innerStrideA, double* __restrict__ B, const size_t ldb, const size_t innerStrideB, const double alpha ,const double beta) { +#ifdef DEBUG + printf("In AVX micro_kernel::execute\n", betaIsZero, conjA); +#endif __m256d reg_alpha = _mm256_set1_pd(alpha); // do not alter the content of B __m256d reg_beta = _mm256_set1_pd(beta); // do not alter the content of B //Load A - __m256d rowA0 = _mm256_loadu_pd((A +0*lda)); - __m256d rowA1 = _mm256_loadu_pd((A +1*lda)); - __m256d rowA2 = _mm256_loadu_pd((A +2*lda)); - __m256d rowA3 = _mm256_loadu_pd((A +3*lda)); + __m256d rowA0, rowA1, rowA2, rowA3; + __m256i indicesA; + if (innerStrideA != 1) { + indicesA = _mm256_set_epi32(7 * innerStrideA, 6 * innerStrideA, 5 * innerStrideA, 4 * innerStrideA, 3 * innerStrideA, 2 * innerStrideA, 1 * innerStrideA, 0 * innerStrideA); + rowA0 = _mm256_i32gather_pd((A +0*lda), indicesA, sizeof(double)); + rowA1 = _mm256_i32gather_pd((A +1*lda), indicesA, sizeof(double)); + rowA2 = _mm256_i32gather_pd((A +2*lda), indicesA, sizeof(double)); + rowA3 
= _mm256_i32gather_pd((A +3*lda), indicesA, sizeof(double)); + } else { + rowA0 = _mm256_loadu_pd((A +0*lda)); + rowA1 = _mm256_loadu_pd((A +1*lda)); + rowA2 = _mm256_loadu_pd((A +2*lda)); + rowA3 = _mm256_loadu_pd((A +3*lda)); + } //4x4 transpose micro kernel __m256d r4, r34, r3, r33; @@ -107,26 +126,51 @@ struct micro_kernel //Load B if( !betaIsZero ) { - __m256d rowB0 = _mm256_loadu_pd((B +0*ldb)); - __m256d rowB1 = _mm256_loadu_pd((B +1*ldb)); - __m256d rowB2 = _mm256_loadu_pd((B +2*ldb)); - __m256d rowB3 = _mm256_loadu_pd((B +3*ldb)); + __m256d rowB0, rowB1, rowB2, rowB3; + __m256i indicesB; + if (innerStrideB != 1) { + indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + rowB0 = _mm256_i32gather_pd((B +0*ldb), indicesB, sizeof(double)); + rowB1 = _mm256_i32gather_pd((B +1*ldb), indicesB, sizeof(double)); + rowB2 = _mm256_i32gather_pd((B +2*ldb), indicesB, sizeof(double)); + rowB3 = _mm256_i32gather_pd((B +3*ldb), indicesB, sizeof(double)); + } else { + rowB0 = _mm256_loadu_pd((B +0*ldb)); + rowB1 = _mm256_loadu_pd((B +1*ldb)); + rowB2 = _mm256_loadu_pd((B +2*ldb)); + rowB3 = _mm256_loadu_pd((B +3*ldb)); + } rowB0 = _mm256_add_pd( _mm256_mul_pd(rowB0, reg_beta), rowA0); rowB1 = _mm256_add_pd( _mm256_mul_pd(rowB1, reg_beta), rowA1); rowB2 = _mm256_add_pd( _mm256_mul_pd(rowB2, reg_beta), rowA2); rowB3 = _mm256_add_pd( _mm256_mul_pd(rowB3, reg_beta), rowA3); //Store B - _mm256_storeu_pd((B + 0 * ldb), rowB0); - _mm256_storeu_pd((B + 1 * ldb), rowB1); - _mm256_storeu_pd((B + 2 * ldb), rowB2); - _mm256_storeu_pd((B + 3 * ldb), rowB3); + if (innerStrideB != 1) { + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowB0, sizeof(double)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowB1, sizeof(double)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowB2, sizeof(double)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowB3, sizeof(double)); + 
} else { + _mm256_storeu_pd((B + 0 * ldb), rowB0); + _mm256_storeu_pd((B + 1 * ldb), rowB1); + _mm256_storeu_pd((B + 2 * ldb), rowB2); + _mm256_storeu_pd((B + 3 * ldb), rowB3); + } } else { //Store B - _mm256_storeu_pd((B + 0 * ldb), rowA0); - _mm256_storeu_pd((B + 1 * ldb), rowA1); - _mm256_storeu_pd((B + 2 * ldb), rowA2); - _mm256_storeu_pd((B + 3 * ldb), rowA3); + if (innerStrideB != 1) { + __m256i indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowA0, sizeof(double)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowA1, sizeof(double)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowA2, sizeof(double)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowA3, sizeof(double)); + } else { + _mm256_storeu_pd((B + 0 * ldb), rowA0); + _mm256_storeu_pd((B + 1 * ldb), rowA1); + _mm256_storeu_pd((B + 2 * ldb), rowA2); + _mm256_storeu_pd((B + 3 * ldb), rowA3); + } } } }; @@ -134,19 +178,36 @@ struct micro_kernel template struct micro_kernel { - static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta) + static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideB, const float alpha ,const float beta) { +#ifdef DEBUG + printf("In AVX micro_kernel::execute\n", betaIsZero, conjA); +#endif __m256 reg_alpha = _mm256_set1_ps(alpha); // do not alter the content of B __m256 reg_beta = _mm256_set1_ps(beta); // do not alter the content of B //Load A - __m256 rowA0 = _mm256_loadu_ps((A +0*lda)); - __m256 rowA1 = _mm256_loadu_ps((A +1*lda)); - __m256 rowA2 = _mm256_loadu_ps((A +2*lda)); - __m256 rowA3 = _mm256_loadu_ps((A +3*lda)); - __m256 rowA4 = _mm256_loadu_ps((A +4*lda)); - __m256 rowA5 = _mm256_loadu_ps((A +5*lda)); - 
__m256 rowA6 = _mm256_loadu_ps((A +6*lda)); - __m256 rowA7 = _mm256_loadu_ps((A +7*lda)); + __m256 rowA0, rowA1, rowA2, rowA3, rowA4, rowA5, rowA6, rowA7; + __m256i indicesA; + if (innerStrideA == 1) { + rowA0 = _mm256_loadu_ps((A +0*lda)); + rowA1 = _mm256_loadu_ps((A +1*lda)); + rowA2 = _mm256_loadu_ps((A +2*lda)); + rowA3 = _mm256_loadu_ps((A +3*lda)); + rowA4 = _mm256_loadu_ps((A +4*lda)); + rowA5 = _mm256_loadu_ps((A +5*lda)); + rowA6 = _mm256_loadu_ps((A +6*lda)); + rowA7 = _mm256_loadu_ps((A +7*lda)); + } else { + indicesA = _mm256_set_epi32(7 * innerStrideA, 6 * innerStrideA, 5 * innerStrideA, 4 * innerStrideA, 3 * innerStrideA, 2 * innerStrideA, 1 * innerStrideA, 0 * innerStrideA); + rowA0 = _mm256_i32gather_ps((A +0*lda), indicesA, sizeof(float)); + rowA1 = _mm256_i32gather_ps((A +1*lda), indicesA, sizeof(float)); + rowA2 = _mm256_i32gather_ps((A +2*lda), indicesA, sizeof(float)); + rowA3 = _mm256_i32gather_ps((A +3*lda), indicesA, sizeof(float)); + rowA4 = _mm256_i32gather_ps((A +4*lda), indicesA, sizeof(float)); + rowA5 = _mm256_i32gather_ps((A +5*lda), indicesA, sizeof(float)); + rowA6 = _mm256_i32gather_ps((A +6*lda), indicesA, sizeof(float)); + rowA7 = _mm256_i32gather_ps((A +7*lda), indicesA, sizeof(float)); + } //8x8 transpose micro kernel __m256 r121, r139, r120, r138, r71, r89, r70, r88, r11, r1, r55, r29, r10, r0, r54, r28; @@ -188,14 +249,28 @@ struct micro_kernel //Load B if( !betaIsZero ) { - __m256 rowB0 = _mm256_loadu_ps((B +0*ldb)); - __m256 rowB1 = _mm256_loadu_ps((B +1*ldb)); - __m256 rowB2 = _mm256_loadu_ps((B +2*ldb)); - __m256 rowB3 = _mm256_loadu_ps((B +3*ldb)); - __m256 rowB4 = _mm256_loadu_ps((B +4*ldb)); - __m256 rowB5 = _mm256_loadu_ps((B +5*ldb)); - __m256 rowB6 = _mm256_loadu_ps((B +6*ldb)); - __m256 rowB7 = _mm256_loadu_ps((B +7*ldb)); + __m256 rowB0, rowB1, rowB2, rowB3, rowB4, rowB5, rowB6, rowB7; + __m256i indicesB; + if (innerStrideB != 1) { + indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * 
innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + rowB0 = _mm256_i32gather_ps((B +0*ldb), indicesB, sizeof(float)); + rowB1 = _mm256_i32gather_ps((B +1*ldb), indicesB, sizeof(float)); + rowB2 = _mm256_i32gather_ps((B +2*ldb), indicesB, sizeof(float)); + rowB3 = _mm256_i32gather_ps((B +3*ldb), indicesB, sizeof(float)); + rowB4 = _mm256_i32gather_ps((B +4*ldb), indicesB, sizeof(float)); + rowB5 = _mm256_i32gather_ps((B +5*ldb), indicesB, sizeof(float)); + rowB6 = _mm256_i32gather_ps((B +6*ldb), indicesB, sizeof(float)); + rowB7 = _mm256_i32gather_ps((B +7*ldb), indicesB, sizeof(float)); + } else { + rowB0 = _mm256_loadu_ps((B +0*ldb)); + rowB1 = _mm256_loadu_ps((B +1*ldb)); + rowB2 = _mm256_loadu_ps((B +2*ldb)); + rowB3 = _mm256_loadu_ps((B +3*ldb)); + rowB4 = _mm256_loadu_ps((B +4*ldb)); + rowB5 = _mm256_loadu_ps((B +5*ldb)); + rowB6 = _mm256_loadu_ps((B +6*ldb)); + rowB7 = _mm256_loadu_ps((B +7*ldb)); + } rowB0 = _mm256_add_ps( _mm256_mul_ps(rowB0, reg_beta), rowA0); rowB1 = _mm256_add_ps( _mm256_mul_ps(rowB1, reg_beta), rowA1); @@ -206,23 +281,46 @@ struct micro_kernel rowB6 = _mm256_add_ps( _mm256_mul_ps(rowB6, reg_beta), rowA6); rowB7 = _mm256_add_ps( _mm256_mul_ps(rowB7, reg_beta), rowA7); //Store B - _mm256_storeu_ps((B + 0 * ldb), rowB0); - _mm256_storeu_ps((B + 1 * ldb), rowB1); - _mm256_storeu_ps((B + 2 * ldb), rowB2); - _mm256_storeu_ps((B + 3 * ldb), rowB3); - _mm256_storeu_ps((B + 4 * ldb), rowB4); - _mm256_storeu_ps((B + 5 * ldb), rowB5); - _mm256_storeu_ps((B + 6 * ldb), rowB6); - _mm256_storeu_ps((B + 7 * ldb), rowB7); + if (innerStrideB != 1) { + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowB0, sizeof(float)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowB1, sizeof(float)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowB2, sizeof(float)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowB3, sizeof(float)); + _mm256_i32scatter_epi32((B +4*ldb), indicesB, rowB4, sizeof(float)); + 
_mm256_i32scatter_epi32((B +5*ldb), indicesB, rowB5, sizeof(float)); + _mm256_i32scatter_epi32((B +6*ldb), indicesB, rowB6, sizeof(float)); + _mm256_i32scatter_epi32((B +7*ldb), indicesB, rowB7, sizeof(float)); + } else { + _mm256_storeu_ps((B + 0 * ldb), rowB0); + _mm256_storeu_ps((B + 1 * ldb), rowB1); + _mm256_storeu_ps((B + 2 * ldb), rowB2); + _mm256_storeu_ps((B + 3 * ldb), rowB3); + _mm256_storeu_ps((B + 4 * ldb), rowB4); + _mm256_storeu_ps((B + 5 * ldb), rowB5); + _mm256_storeu_ps((B + 6 * ldb), rowB6); + _mm256_storeu_ps((B + 7 * ldb), rowB7); + } } else { - _mm256_storeu_ps((B + 0 * ldb), rowA0); - _mm256_storeu_ps((B + 1 * ldb), rowA1); - _mm256_storeu_ps((B + 2 * ldb), rowA2); - _mm256_storeu_ps((B + 3 * ldb), rowA3); - _mm256_storeu_ps((B + 4 * ldb), rowA4); - _mm256_storeu_ps((B + 5 * ldb), rowA5); - _mm256_storeu_ps((B + 6 * ldb), rowA6); - _mm256_storeu_ps((B + 7 * ldb), rowA7); + if (innerStrideB != 1) { + __m256i indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowA0, sizeof(float)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowA1, sizeof(float)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowA2, sizeof(float)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowA3, sizeof(float)); + _mm256_i32scatter_epi32((B +4*ldb), indicesB, rowA4, sizeof(float)); + _mm256_i32scatter_epi32((B +5*ldb), indicesB, rowA5, sizeof(float)); + _mm256_i32scatter_epi32((B +6*ldb), indicesB, rowA6, sizeof(float)); + _mm256_i32scatter_epi32((B +7*ldb), indicesB, rowA7, sizeof(float)); + } else { + _mm256_storeu_ps((B + 0 * ldb), rowA0); + _mm256_storeu_ps((B + 1 * ldb), rowA1); + _mm256_storeu_ps((B + 2 * ldb), rowA2); + _mm256_storeu_ps((B + 3 * ldb), rowA3); + _mm256_storeu_ps((B + 4 * ldb), rowA4); + _mm256_storeu_ps((B + 5 * ldb), rowA5); + _mm256_storeu_ps((B + 6 * ldb), rowA6); + 
_mm256_storeu_ps((B + 7 * ldb), rowA7); + } } } }; @@ -246,16 +344,62 @@ static INLINE void prefetch(const floatType* A, const int lda) { } template struct micro_kernel { - static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta) + static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideA, const float alpha ,const float beta) { +#ifdef DEBUG + printf("In ARM micro_kernel::execute\n", betaIsZero, conjA); +#endif float32x4_t reg_alpha = vdupq_n_f32(alpha); float32x4_t reg_beta = vdupq_n_f32(beta); //Load A - float32x4_t rowA0 = vld1q_f32((A +0*lda)); - float32x4_t rowA1 = vld1q_f32((A +1*lda)); - float32x4_t rowA2 = vld1q_f32((A +2*lda)); - float32x4_t rowA3 = vld1q_f32((A +3*lda)); + float32x4_t rowA0, rowA1, rowA2, rowA3; + if (innerStrideA == 1) { + float32x4_t rowA0 = vld1q_f32((A +0*lda)); + float32x4_t rowA1 = vld1q_f32((A +1*lda)); + float32x4_t rowA2 = vld1q_f32((A +2*lda)); + float32x4_t rowA3 = vld1q_f32((A +3*lda)); + } else if (innerStrideA == 2) { + float32x4_t rowA0 = vld2q_f32((A +0*lda)).val[0]; + float32x4_t rowA1 = vld2q_f32((A +1*lda)).val[0]; + float32x4_t rowA2 = vld2q_f32((A +2*lda)).val[0]; + float32x4_t rowA3 = vld2q_f32((A +3*lda)).val[0]; + } else if (innerStrideA == 3) { + float32x4_t rowA0 = vld3q_f32((A +0*lda)).val[0]; + float32x4_t rowA1 = vld3q_f32((A +1*lda)).val[0]; + float32x4_t rowA2 = vld3q_f32((A +2*lda)).val[0]; + float32x4_t rowA3 = vld3q_f32((A +3*lda)).val[0]; + } else if (innerStrideA == 4) { + float32x4_t rowA0 = vld4q_f32((A +0*lda)).val[0]; + float32x4_t rowA1 = vld4q_f32((A +1*lda)).val[0]; + float32x4_t rowA2 = vld4q_f32((A +2*lda)).val[0]; + float32x4_t rowA3 = vld4q_f32((A +3*lda)).val[0]; + } else { + float32x4_t rowA0 = vdupq_n_f32(0); + float32x4_t rowA1 = vdupq_n_f32(0); + float32x4_t rowA2 = vdupq_n_f32(0); + float32x4_t 
rowA3 = vdupq_n_f32(0); + + rowA0 = vld1q_lane_f32(A + 0 * lda + 0 * strideA, rowA0, 0); + rowA0 = vld1q_lane_f32(A + 0 * lda + 1 * strideA, rowA0, 1); + rowA0 = vld1q_lane_f32(A + 0 * lda + 2 * strideA, rowA0, 2); + rowA0 = vld1q_lane_f32(A + 0 * lda + 3 * strideA, rowA0, 3); + + rowA1 = vld1q_lane_f32(A + 1 * lda + 0 * strideA, rowA1, 0); + rowA1 = vld1q_lane_f32(A + 1 * lda + 1 * strideA, rowA1, 1); + rowA1 = vld1q_lane_f32(A + 1 * lda + 2 * strideA, rowA1, 2); + rowA1 = vld1q_lane_f32(A + 1 * lda + 3 * strideA, rowA1, 3); + + rowA2 = vld1q_lane_f32(A + 2 * lda + 0 * strideA, rowA2, 0); + rowA2 = vld1q_lane_f32(A + 2 * lda + 1 * strideA, rowA2, 1); + rowA2 = vld1q_lane_f32(A + 2 * lda + 2 * strideA, rowA2, 2); + rowA2 = vld1q_lane_f32(A + 2 * lda + 3 * strideA, rowA2, 3); + + rowA3 = vld1q_lane_f32(A + 3 * lda + 0 * strideA, rowA3, 0); + rowA3 = vld1q_lane_f32(A + 3 * lda + 1 * strideA, rowA3, 1); + rowA3 = vld1q_lane_f32(A + 3 * lda + 2 * strideA, rowA3, 2); + rowA3 = vld1q_lane_f32(A + 3 * lda + 3 * strideA, rowA3, 3); + } //4x4 transpose micro kernel float32x4x2_t t0,t1,t2,t3; @@ -272,27 +416,110 @@ struct micro_kernel //Load B if( !betaIsZero ) - { - float32x4_t rowB0 = vld1q_f32((B +0*ldb)); - float32x4_t rowB1 = vld1q_f32((B +1*ldb)); - float32x4_t rowB2 = vld1q_f32((B +2*ldb)); - float32x4_t rowB3 = vld1q_f32((B +3*ldb)); + { + float32x4_t rowB0, rowB1, rowB2, rowB3; + if (innerStrideB == 1) { + float32x4_t rowB0 = vld1q_f32((B +0*ldb)); + float32x4_t rowB1 = vld1q_f32((B +1*ldb)); + float32x4_t rowB2 = vld1q_f32((B +2*ldb)); + float32x4_t rowB3 = vld1q_f32((B +3*ldb)); + } else if (innerStrideB == 2) { + float32x4_t rowB0 = vld2q_f32((B +0*ldb)).val[0]; + float32x4_t rowB1 = vld2q_f32((B +1*ldb)).val[0]; + float32x4_t rowB2 = vld2q_f32((B +2*ldb)).val[0]; + float32x4_t rowB3 = vld2q_f32((B +3*ldb)).val[0]; + } else if (innerStrideB == 3) { + float32x4_t rowB0 = vld3q_f32((B +0*ldb)).val[0]; + float32x4_t rowB1 = vld3q_f32((B +1*ldb)).val[0]; + 
float32x4_t rowB2 = vld3q_f32((B +2*ldb)).val[0]; + float32x4_t rowB3 = vld3q_f32((B +3*ldb)).val[0]; + } else if (innerStrideB == 4) { + float32x4_t rowB0 = vld4q_f32((B +0*ldb)).val[0]; + float32x4_t rowB1 = vld4q_f32((B +1*ldb)).val[0]; + float32x4_t rowB2 = vld4q_f32((B +2*ldb)).val[0]; + float32x4_t rowB3 = vld4q_f32((B +3*ldb)).val[0]; + } else { + float32x4_t rowB0 = vdupq_n_f32(0); + float32x4_t rowB1 = vdupq_n_f32(0); + float32x4_t rowB2 = vdupq_n_f32(0); + float32x4_t rowB3 = vdupq_n_f32(0); + + rowB0 = vld1q_lane_f32(B + 0 * innerStrideB, rowB0, 0); + rowB0 = vld1q_lane_f32(B + 1 * innerStrideB, rowB0, 1); + rowB0 = vld1q_lane_f32(B + 2 * innerStrideB, rowB0, 2); + rowB0 = vld1q_lane_f32(B + 3 * innerStrideB, rowB0, 3); + + rowB1 = vld1q_lane_f32(B + 1 * ldb + 0 * innerStrideB, rowB1, 0); + rowB1 = vld1q_lane_f32(B + 1 * ldb + 1 * innerStrideB, rowB1, 1); + rowB1 = vld1q_lane_f32(B + 1 * ldb + 2 * innerStrideB, rowB1, 2); + rowB1 = vld1q_lane_f32(B + 1 * ldb + 3 * innerStrideB, rowB1, 3); + + rowB2 = vld1q_lane_f32(B + 2 * ldb + 0 * innerStrideB, rowB2, 0); + rowB2 = vld1q_lane_f32(B + 2 * ldb + 1 * innerStrideB, rowB2, 1); + rowB2 = vld1q_lane_f32(B + 2 * ldb + 2 * innerStrideB, rowB2, 2); + rowB2 = vld1q_lane_f32(B + 2 * ldb + 3 * innerStrideB, rowB2, 3); + + rowB3 = vld1q_lane_f32(B + 3 * ldb + 0 * innerStrideB, rowB3, 0); + rowB3 = vld1q_lane_f32(B + 3 * ldb + 1 * innerStrideB, rowB3, 1); + rowB3 = vld1q_lane_f32(B + 3 * ldb + 2 * innerStrideB, rowB3, 2); + rowB3 = vld1q_lane_f32(B + 3 * ldb + 3 * innerStrideB, rowB3, 3); + } rowB0 = vaddq_f32( vmulq_f32(rowB0, reg_beta), rowA0); rowB1 = vaddq_f32( vmulq_f32(rowB1, reg_beta), rowA1); rowB2 = vaddq_f32( vmulq_f32(rowB2, reg_beta), rowA2); rowB3 = vaddq_f32( vmulq_f32(rowB3, reg_beta), rowA3); //Store B - vst1q_f32((B + 0 * ldb), rowB0); - vst1q_f32((B + 1 * ldb), rowB1); - vst1q_f32((B + 2 * ldb), rowB2); - vst1q_f32((B + 3 * ldb), rowB3); + if (innerStrideB == 1) { + vst1q_f32((B + 0 * ldb), rowB0); 
+ vst1q_f32((B + 1 * ldb), rowB1); + vst1q_f32((B + 2 * ldb), rowB2); + vst1q_f32((B + 3 * ldb), rowB3); + } else { + float tmp[4]; + vst1q_f32(tmp, rowB0); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB] = tmp[i]; + } + vst1q_f32(tmp, rowB1); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 1 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB2); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 2 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB3); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 3 * ldb] = tmp[i]; + } + } } else { //Store B - vst1q_f32((B + 0 * ldb), rowA0); - vst1q_f32((B + 1 * ldb), rowA1); - vst1q_f32((B + 2 * ldb), rowA2); - vst1q_f32((B + 3 * ldb), rowA3); + if (innerStrideB == 1) { + vst1q_f32((B + 0 * ldb), rowA0); + vst1q_f32((B + 1 * ldb), rowA1); + vst1q_f32((B + 2 * ldb), rowA2); + vst1q_f32((B + 3 * ldb), rowA3); + } else { + float tmp[4]; + vst1q_f32(tmp, rowB0); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB] = tmp[i]; + } + vst1q_f32(tmp, rowB1); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 1 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB2); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 2 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB3); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 3 * ldb] = tmp[i]; + } + } } } }; @@ -304,7 +531,7 @@ struct micro_kernel //template //struct micro_kernel //{ -// static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta) +// static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideA, const float alpha ,const float beta) // { // vector float reg_alpha = vec_splats(alpha); // @@ -368,314 +595,537 @@ struct micro_kernel template -static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const size_t lda, int blockingA, - floatType* __restrict__ B, const size_t ldb, int 
blockingB, +static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const size_t lda, int blockingA, size_t innerStrideA, + floatType* __restrict__ B, const size_t ldb, int blockingB, size_t innerStrideB, const floatType alpha ,const floatType beta) { #ifdef DEBUG - assert( blockingA > 0 && blockingB > 0); + printf("Macro kernel: blockingA = %d with %zu, blockingB = %d with %zu\n", blockingA, lda, blockingB, ldb); + assert( blockingA >= 0 && blockingB >= 0); #endif if( betaIsZero ) for(int j=0; j < blockingA; ++j) - for(int i=0; i < blockingB; ++i) + for(int i=0; i < blockingB; ++i){ +#ifdef DEBUG + printf(" A[+%zu] = %e -> B[+%zu] = %e\n", (i * lda) + (j * innerStrideA), A[(i * lda) + (j * innerStrideA)], (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]); + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(i * lda) + (j * innerStrideA)]); else - B[i + j * ldb] = alpha * A[i * lda + j]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(i * lda) + (j * innerStrideA)];} else for(int j=0; j < blockingA; ++j) - for(int i=0; i < blockingB; ++i) + for(int i=0; i < blockingB; ++i){ +#ifdef DEBUG + printf(" A[+%zu] = %e -> B[+%zu] = %e\n", (i * lda) + (j * innerStrideA), A[(i * lda) + (j * innerStrideA)], i + (j * ldb), B[(i * innerStrideB) + (j * ldb)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(i * lda) + (j * innerStrideA)]) + beta * B[(i * innerStrideB) + (j * ldb)]; else - B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(i * lda) + (j * innerStrideA)] + beta * B[(i * innerStrideB) + (j * ldb)];} } template -static INLINE void macro_kernel(const floatType* __restrict__ A, const floatType* __restrict__ Anext, const size_t lda, - floatType* __restrict__ B, const floatType* __restrict__ Bnext, const 
size_t ldb, +static INLINE void macro_kernel(const floatType* __restrict__ A, const floatType* __restrict__ Anext, const size_t lda, size_t innerStrideA, + floatType* __restrict__ B, const floatType* __restrict__ Bnext, const size_t ldb, size_t innerStrideB, const floatType alpha ,const floatType beta) { constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); constexpr int blocking_ = blocking_micro_ * 4; +#ifdef DEBUG + printf("Macro kernel (%d): blockingA = %d with %zu, blockingB = %d with %zu\n", blocking_, blockingA, lda, blockingB, ldb); +#endif const bool useStreamingStores = useStreamingStores_ && betaIsZero && (blockingB*sizeof(floatType))%64 == 0 && ((uint64_t)B)%32 == 0 && (ldb*sizeof(floatType))%32 == 0; +#ifdef DEBUG + printf("Macro kernel using streaming stores? %d\n", useStreamingStores); +#endif + floatType *Btmp = B; size_t ldb_tmp = ldb; floatType buffer[blockingA * blockingB];// __attribute__((aligned(64))); - if( (useStreamingStores_ && useStreamingStores) ){ + if( (useStreamingStores_ && useStreamingStores && innerStrideB == 1) ){ Btmp = buffer; ldb_tmp = blockingB; } if( blockingA == blocking_ && blockingB == blocking_ ) { - if( !(useStreamingStores_ && useStreamingStores) ) + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 0), lda); - micro_kernel::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB *0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + 
(innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB *0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp); - prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB * 2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB * 2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB *3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp 
+ (0 * ldb_tmp + (innerStrideB *3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA *blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB *0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA *blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB *0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - micro_kernel::execute(A + (2*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (3*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2* 
blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB *2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB *2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (0 * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + 
printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (2*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - prefetch(Anext + (2*blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - prefetch(Anext + (3*blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (2*blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (3*blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + 
(3*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3* blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (3*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); +#ifdef DEBUG + 
printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2* blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); }else if( blockingA == 2*blocking_micro_ && blockingB == blocking_ ) { - if( !(useStreamingStores_ && useStreamingStores) ) + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 0), lda); - micro_kernel::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA *0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA *0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro 
kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp); - prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp); - 
micro_kernel::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - micro_kernel::execute(A + (2*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (3*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + 
(innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); }else if( blockingA == blocking_ && blockingB == 2*blocking_micro_) { - if( !(useStreamingStores_ && useStreamingStores) ) + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 0), lda); - micro_kernel::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ 
&& useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (0 * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, 
B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, 
innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); } else { //invoke micro-transpose - if(blockingA > 0 && blockingB > 0 ) - micro_kernel::execute(A, lda, Btmp, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > 0 ) { +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A, lda, innerStrideA, Btmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 0 && blockingB > blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ * lda, lda, Btmp + blocking_micro_, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > blocking_micro_ ) { +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 0 && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 0 && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro 
kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > 0 ) - micro_kernel::execute(A + blocking_micro_, lda, Btmp + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > 0 ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_), lda, innerStrideA, Btmp + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: 
A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > 0 ) - micro_kernel::execute(A + 2*blocking_micro_, lda, Btmp + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 2*blocking_micro_ && blockingB > 0 ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_), lda, innerStrideA, Btmp + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , 
beta); + if(blockingA > 2*blocking_micro_ && blockingB > blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 2*blocking_micro_ && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 2*blocking_micro_ && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + 
2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > 0 ) - micro_kernel::execute(A + 3*blocking_micro_, lda, Btmp + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > 0 ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*3*blocking_micro_), lda, innerStrideA, Btmp + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*3*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + 
micro_kernel::execute(A + (innerStrideA*3*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*3*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} } // write buffer to main-memory via non-temporal stores - if( (useStreamingStores_ && useStreamingStores) ) + if( (useStreamingStores_ && useStreamingStores && innerStrideB == 1) ) for( int i = 0; i < blockingA; i++){ for( int j = 0; j < blockingB; j+=blocking_micro_) - streamingStore(B + i * ldb + j, buffer + i * ldb_tmp + j); + streamingStore(B + (i * ldb) + (j * innerStrideB), buffer + i * ldb_tmp + j); } } template -void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A, - floatType* __restrict__ B, int sizeStride1B, const floatType alpha, const floatType beta, const ComputeNode* plan) +void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A, size_t innerStrideA, + floatType* __restrict__ B, int sizeStride1B, size_t innerStrideB, const floatType alpha, const floatType beta, const ComputeNode* plan) { const int32_t end = plan->end; const size_t lda = plan->lda; const size_t ldb = plan->ldb; + const int offDiffAB = plan->offDiffAB; if( plan->next->next 
!= nullptr ){ // recurse int i = plan->start; - if( lda == 1) - transpose_int_scalar( &A[i*lda], end - plan->start, &B[i*ldb], sizeStride1B, alpha, beta, plan->next); - else if( ldb == 1) - transpose_int_scalar( &A[i*lda], sizeStride1A, &B[i*ldb], end - plan->start, alpha, beta, plan->next); +#ifdef DEBUG + printf("----[SCALAR]Pointing to deeper level, Shifting A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start); +#endif + if( plan->indexA ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], end - plan->start, innerStrideA, &B[i*ldb], sizeStride1B, innerStrideB, alpha, beta, plan->next); + else if( plan->indexB ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], sizeStride1A, innerStrideA, &B[i*ldb], end - plan->start, innerStrideB, alpha, beta, plan->next); else - for(; i < end; i++) - transpose_int_scalar( &A[i*lda], sizeStride1A, &B[i*ldb], sizeStride1B, alpha, beta, plan->next); + for(; i < end; i++){ +#ifdef DEBUG + printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start); +#endif + transpose_int_scalar( &A[(i+offDiffAB)*lda], sizeStride1A, innerStrideA, &B[i*ldb], sizeStride1B, innerStrideB, alpha, beta, plan->next);} }else{ // macro-kernel const size_t lda_macro = plan->next->lda; const size_t ldb_macro = plan->next->ldb; int i = plan->start; const size_t scalarRemainder = plan->end - plan->start; +#ifdef DEBUG + printf("----[SCALAR]Sending to macro-kernel (Remainder: %zu), Shifting A[+%zu] lda - %zu, B[+%zu] ldb - %zu\n", scalarRemainder, (i+offDiffAB)*lda, lda, i*ldb, ldb); +#endif if( scalarRemainder > 0 ){ - if( lda == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta); - else if( ldb == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta); + if( plan->indexA ) + 
macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, innerStrideA, &B[i*ldb], ldb_macro, sizeStride1B, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, innerStrideA, &B[i*ldb], ldb_macro, scalarRemainder, innerStrideB, alpha, beta); + else + for(; i < end; i++){ +#ifdef DEBUG + printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start); +#endif + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, innerStrideA, &B[i*ldb], ldb_macro, sizeStride1B, innerStrideB, alpha, beta);} } } } template -void transpose_int( const floatType* __restrict__ A, const floatType* __restrict__ Anext, - floatType* __restrict__ B, const floatType* __restrict__ Bnext, const floatType alpha, const floatType beta, +void transpose_int( const floatType* __restrict__ A, const floatType* __restrict__ Anext, size_t innerStrideA, + floatType* __restrict__ B, const floatType* __restrict__ Bnext, size_t innerStrideB, const floatType alpha, const floatType beta, const ComputeNode* plan) { const int32_t end = plan->end - (plan->inc - 1); const int32_t inc = plan->inc; const size_t lda = plan->lda; const size_t ldb = plan->ldb; + const int offDiffAB = plan->offDiffAB; constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); constexpr int blocking_ = blocking_micro_ * 4; if( plan->next->next != nullptr ){ +#ifdef DEBUG + printf("Pointing to deeper level.\n"); +#endif // recurse int32_t i; for(i = plan->start; i < end; i+= inc) { - if( i + inc < end ) - transpose_int( &A[i*lda], &A[(i+1)*lda], &B[i*ldb], &B[(i+1)*ldb], alpha, beta, plan->next); - else - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start); 
+#endif + if( i + inc < end ) { + transpose_int( &A[(i+offDiffAB)*lda], &A[(i+1+offDiffAB)*lda], innerStrideA, &B[i*ldb], &B[(i+1)*ldb], innerStrideB, alpha, beta, plan->next); + } else if( i == plan->start || i + inc >= end ) { + transpose_int( &A[(i+offDiffAB)*lda], &A[(i+offDiffAB)*lda], innerStrideA, &B[i*ldb], &B[i*ldb], innerStrideB, alpha, beta, plan->next); + } else { + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); + } } // remainder if( blocking_/2 >= blocking_micro_ && (i + blocking_/2) <= plan->end ){ - if( lda == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); - else if( ldb == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); +#ifdef DEBUG + printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); + else if( plan->indexB ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); i+=blocking_/2; } if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){ - if( lda == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); - else if( ldb == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); +#ifdef DEBUG + printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); + else if( plan->indexB ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, 
beta, plan->next); i+=blocking_/4; } const size_t scalarRemainder = plan->end - i; if( scalarRemainder > 0 ){ - if( lda == 1) - transpose_int_scalar( &A[i*lda], scalarRemainder, &B[i*ldb], -1, alpha, beta, plan->next); +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); + printf("---- Passing to scalar. Blocking_ to %zu (blA = %d, blB = %d) where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, blockingA, blockingB, lda, ldb); +#endif + if( plan->indexA ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], scalarRemainder, innerStrideA, &B[i*ldb], blockingB, innerStrideB, alpha, beta, plan->next); + else if ( plan->indexB ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], blockingA, innerStrideA, &B[i*ldb], scalarRemainder, innerStrideB, alpha, beta, plan->next); else - transpose_int_scalar( &A[i*lda], -1, &B[i*ldb], scalarRemainder, alpha, beta, plan->next); + transpose_int_scalar( &A[(i+offDiffAB)*lda], blockingA, innerStrideA, &B[i*ldb], blockingB, innerStrideB, alpha, beta, plan->next); } } else { const size_t lda_macro = plan->next->lda; const size_t ldb_macro = plan->next->ldb; // invoke macro-kernel - +#ifdef DEBUG + printf("Sending to macro-kernel.\n"); +#endif int32_t i; for(i = plan->start; i < end; i+= inc) - if( i + inc < end ) - macro_kernel(&A[i*lda], &A[(i+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta); - else - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); + { +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start); + printf("--------- Case A: %d < %d, Case B: %d == %zu or %d > %d, else Case C\n", i + inc, end, i, plan->start, i + inc, end); +#endif + if( i + inc < end ) { + macro_kernel(&A[(i+offDiffAB)*lda], &A[(i+offDiffAB+1)*lda], lda_macro, innerStrideA, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, innerStrideB, alpha, beta); + } else if( i == plan->start || i + inc > 
end ) { + break; // Things land here when the last two levels contain indexA and indexB separately. + } else { + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); + } + } // remainder if( blocking_/2 >= blocking_micro_ && (i + blocking_/2) <= plan->end ){ - if( lda == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); - else if( ldb == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); +#ifdef DEBUG + printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); i+=blocking_/2; } if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){ - if( lda == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); - else if( ldb == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); +#ifdef DEBUG + printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); i+=blocking_/4; } const size_t scalarRemainder = plan->end - i; if( scalarRemainder > 0 ){ - if( lda == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, scalarRemainder, 
&B[i*ldb], ldb_macro, blockingB, alpha, beta); - else if( ldb == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta); +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); + printf("---- Passing to scalar. Blocking_ to %zu where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, lda, ldb); +#endif + if( plan->indexA ) + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, innerStrideA, &B[i*ldb], ldb_macro, blockingB, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, blockingA, innerStrideA, &B[i*ldb], ldb_macro, scalarRemainder, innerStrideB, alpha, beta); + else + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, blockingA, innerStrideA, &B[i*ldb], ldb_macro, blockingB, innerStrideB, alpha, beta); } } } @@ -688,36 +1138,48 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r constexpr int32_t inc = 1; // TODO const size_t lda = plan->lda; const size_t ldb = plan->ldb; + const int offDiffAB = plan->offDiffAB; if( plan->next != nullptr ) - for(int i = plan->start; i < end; i+= inc) + for(int i = plan->start; i < end; i+= inc) { // recurse - transpose_int_constStride1( &A[i*lda], &B[i*ldb], alpha, beta, plan->next); - else +#ifdef DEBUG + printf("[CONST](Loop %zu of %zu)Shifting A[+%zu], B[+%zu]\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb); +#endif + transpose_int_constStride1( &A[(i+offDiffAB)*lda], &B[i*ldb], alpha, beta, plan->next); + } + else if( !betaIsZero ) { - for(int32_t i = plan->start; i < end; i+= inc) +#ifdef DEBUG + printf("[CONST]Setting A[+%zu], B[+%zu] (from %zu to %d)\n", (plan->start+offDiffAB), plan->start, plan->start, end); +#endif + for(int32_t i = plan->start; i < end; i+= inc){ if( conjA ) - B[i] = alpha * conj(A[i]) + beta * B[i]; + B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]) + beta * B[i*ldb]; else - B[i] 
= alpha * A[i] + beta * B[i]; + B[i*ldb] = alpha * A[(i+offDiffAB)*lda] + beta * B[i*ldb]; + } } else { +#ifdef DEBUG + printf("[CONST]Setting A[+%zu], B[+%zu] (from %zu to %d)\n", (plan->start+offDiffAB), plan->start, plan->start, end); +#endif if( useStreamingStores) if( conjA ) #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * conj(A[i]); + B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]); else #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * A[i]; + B[i*ldb] = alpha * A[(i+offDiffAB)*lda]; else if( conjA ) for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * conj(A[i]); + B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]); else for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * A[i]; + B[i*ldb] = alpha * A[(i+offDiffAB)*lda]; } } @@ -727,6 +1189,10 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r const int *perm, const int *outerSizeA, const int *outerSizeB, + const int *offsetA, + const int *offsetB, + const size_t innerStrideA, + const size_t innerStrideB, const int dim, const floatType *A, const floatType alpha, @@ -741,6 +1207,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r alpha_(alpha), beta_(beta), dim_(-1), + innerStrideA_(-1), + innerStrideB_(-1), numThreads_(numThreads), masterPlan_(nullptr), selectionMethod_(selectionMethod), @@ -756,13 +1224,17 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r int tmpSizeA[dim]; int tmpOuterSizeA[dim]; int tmpOuterSizeB[dim]; - accountForRowMajor(sizeA, outerSizeA, outerSizeB, perm, - tmpSizeA, tmpOuterSizeA, tmpOuterSizeB, tmpPerm, dim, useRowMajor); + int tmpOffsetA[dim]; + int tmpOffsetB[dim]; + accountForRowMajor(sizeA, outerSizeA, outerSizeB, offsetA, offsetB, perm, + tmpSizeA, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, tmpPerm, dim, useRowMajor); sizeA_.resize(dim); perm_.resize(dim); 
outerSizeA_.resize(dim); outerSizeB_.resize(dim); + offsetA_.resize(dim); + offsetB_.resize(dim); lda_.resize(dim); ldb_.resize(dim); if(threadIds){ @@ -775,10 +1247,13 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r threadIds_.push_back(i); } - verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, dim); + verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, innerStrideA, innerStrideB, dim); + + innerStrideA_ = innerStrideA; + innerStrideB_ = innerStrideB; // initializes dim_, outerSizeA, outerSizeB, sizeA and perm - skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, dim); + skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, dim); fuseIndices(); // initializes lda_ and ldb_ @@ -803,6 +1278,10 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r perm_(other.perm_), outerSizeA_(other.outerSizeA_), outerSizeB_(other.outerSizeB_), + offsetA_(other.offsetA_), + offsetB_(other.offsetB_), + innerStrideA_(other.innerStrideA_), + innerStrideB_(other.innerStrideB_), lda_(other.lda_), ldb_(other.ldb_), threadIds_(other.threadIds_), @@ -839,14 +1318,14 @@ void Transpose::executeEstimate(const Plan *plan) noexcept auto rootNode = plan->getRootNode_const( taskId ); if( std::abs(beta_) < getZeroThreshold() ) { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); } else { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); } } else { auto rootNode = 
plan->getRootNode_const( taskId ); @@ -867,16 +1346,16 @@ void Transpose::executeEstimate(const Plan *plan) noexcept template -static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const floatType alpha, const floatType beta, int numThreads) +static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const int &offDiffAB_, const size_t lda, const size_t ldb, const floatType alpha, const floatType beta, int numThreads) { if( !betaIsZero ) { HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) - B[i] = alpha * conj(A[i]) + beta * B[i]; + B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]) + beta * B[i * ldb]; else - B[i] = alpha * A[i] + beta * B[i]; + B[i * ldb] = alpha * A[(i + offDiffAB_) * lda] + beta * B[i * ldb]; ) } else { if( useStreamingStores) @@ -884,55 +1363,55 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) - B[i] = alpha * conj(A[i]); + B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]); else - B[i] = alpha * A[i]; + B[i * ldb] = alpha * A[(i + offDiffAB_) * lda]; ) else HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) - B[i] = alpha * conj(A[i]); + B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]); else - B[i] = alpha * A[i]; + B[i * ldb] = alpha * A[(i + offDiffAB_) * lda]; ) } } template -static void axpy_2D( const floatType* __restrict__ A, const int lda, - floatType* __restrict__ B, const int ldb, - const int n0, const int myStart, const int myEnd, const floatType alpha, const floatType beta, int numThreads) +static void axpy_2D( const floatType* __restrict__ A, const size_t (&lda)[2], + floatType* __restrict__ B, const size_t (&ldb)[2], + const int n0, const int myStart, const int myEnd, const int (&offDiffAB_)[2], const int offsetB_, const 
floatType alpha, const floatType beta, int numThreads) { if( !betaIsZero ) { HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) - for(int32_t i = 0; i < n0; i++) + for(int32_t i = offsetB_; i < n0 + offsetB_; i++) if( conjA ) - B[i + j * ldb] = alpha * conj(A[i + j * lda]) + beta * B[i + j * ldb]; + B[(i * ldb[0]) + j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]) + beta * B[(i * ldb[0]) + j * ldb[1]]; else - B[i + j * ldb] = alpha * A[i + j * lda] + beta * B[i + j * ldb]; + B[(i * ldb[0]) + j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]] + beta * B[(i * ldb[0]) + j * ldb[1]]; ) } else { if( useStreamingStores) HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) _Pragma("vector nontemporal") - for(int32_t i = 0; i < n0; i++) + for(int32_t i = offsetB_; i < n0 + offsetB_; i++) if( conjA ) - B[i + j * ldb] = alpha * conj(A[i + j * lda]); + B[(i * ldb[0])+ j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]); else - B[i + j * ldb] = alpha * A[i + j * lda]; + B[(i * ldb[0])+ j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]; ) else HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) - for(int32_t i = 0; i < n0; i++) + for(int32_t i = offsetB_; i < n0 + offsetB_; i++) if( conjA ) - B[i + j * ldb] = alpha * conj(A[i + j * lda]); + B[(i * ldb[0]) + j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]); else - B[i + j * ldb] = alpha * A[i + j * lda]; + B[(i * ldb[0]) + j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]; ) } } @@ -987,22 +1466,27 @@ void Transpose::execute_expert() noexcept int myStart = 0; int myEnd = 0; +#ifdef DEBUG + printf("Executing expert with dim = %d\n", dim_); +#endif if( dim_ == 1) { getStartEnd(sizeA_[0], myStart, myEnd); + const int offDiffAB_ = 
(int)offsetA_[0] - (int)offsetB_[0]; if( conjA_ ) - axpy_1D( A_, B_, myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_1D( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, lda_[0], ldb_[0], alpha_, beta_, numThreads_ ); else - axpy_1D( A_, B_, myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_1D( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, lda_[0], ldb_[0], alpha_, beta_, numThreads_ ); return; } else if( dim_ == 2 && perm_[0] == 0) { getStartEnd(sizeA_[1], myStart, myEnd); + const int offDiffAB_[2] = {((int)offsetA_[0] - (int)offsetB_[0]), ((int)offsetA_[1] - (int)offsetB_[1])}; if( conjA_ ) - axpy_2D( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_2D( A_, {lda_[0], lda_[1]}, B_, {ldb_[0], ldb_[1]}, sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ ); else - axpy_2D( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_2D( A_, {lda_[0], lda_[1]}, B_, {ldb_[0], ldb_[1]}, sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ ); return; } @@ -1015,9 +1499,9 @@ void Transpose::execute_expert() noexcept if ( perm_[0] != 0 ) { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_,innerStrideA_, B_, B_,innerStrideB_, alpha_, beta_, rootNode); else - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_,innerStrideA_, B_, B_,innerStrideB_, alpha_, beta_, rootNode); } else { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) @@ -1420,7 +1904,7 @@ void Transpose::getParallelismStrategies(std::vector } template -void Transpose::verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const +void Transpose::verifyParameter(const int *size, const int* 
perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const size_t innerStrideA, const size_t innerStrideB, const int dim) const { if ( dim < 1 ) { fprintf(stderr,"[HPTT] ERROR: dimensionality too low.\n"); exit(-1); } @@ -1457,12 +1941,36 @@ void Transpose::verifyParameter(const int *size, const int* perm, con fprintf(stderr,"[HPTT] ERROR: outerSizeB invalid\n"); exit(-1); } + + if ( offsetA != NULL ) + for(int i=0;i < dim ; ++i) + if ( offsetA[i] + size[i] > outerSizeA[i] ) { + fprintf(stderr,"[HPTT] ERROR: offsetA invalid\n"); + exit(-1); + } + + if ( offsetB != NULL ) + for(int i=0;i < dim ; ++i) + if ( offsetB[i] + size[perm[i]] > outerSizeB[i] ) { + fprintf(stderr,"[HPTT] ERROR: offsetB invalid\n"); + exit(-1); + } + + if ( innerStrideA < 1 ) { + fprintf(stderr,"[HPTT] ERROR: innerStrideA invalid\n"); + exit(-1); + } + + if ( innerStrideB < 1 ) { + fprintf(stderr,"[HPTT] ERROR: innerStrideB invalid\n"); + exit(-1); + } } template void Transpose::computeLeadingDimensions() { - lda_[0] = 1; + lda_[0] = innerStrideA_; if( outerSizeA_[0] == -1 ) for(int i=1;i < dim_ ; ++i) lda_[i] = lda_[i-1] * sizeA_[i-1]; @@ -1470,7 +1978,7 @@ void Transpose::computeLeadingDimensions() for(int i=1;i < dim_ ; ++i) lda_[i] = outerSizeA_[i-1] * lda_[i-1]; - ldb_[0] = 1; + ldb_[0] = innerStrideB_; if( outerSizeB_[0] == -1 ) for(int i=1;i < dim_ ; ++i) ldb_[i] = ldb_[i-1] * sizeA_[perm_[i-1]]; @@ -1480,7 +1988,7 @@ void Transpose::computeLeadingDimensions() } template -void Transpose::skipIndices(const int *sizeA, const int* perm, const int *outerSizeA, const int *outerSizeB, const int dim) +void Transpose::skipIndices(const int *sizeA, const int* perm, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int dim) { for(int i=0;i < dim ; ++i) { @@ -1494,6 +2002,14 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const outerSizeB_[i] = outerSizeB[i]; else outerSizeB_[i] = sizeA[perm[i]]; + if(offsetA) + 
offsetA_[i] = offsetA[i]; + else + offsetA_[i] = 0; + if(offsetB) + offsetB_[i] = offsetB[i]; + else + offsetB_[i] = 0; } int skipped = 0; @@ -1508,6 +2024,8 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const sizeA_[i] = -1; outerSizeA_[i] = -1; outerSizeB_[idxB] = -1; + offsetA_[i] = -1; + offsetB_[idxB] = -1; perm_[idxB] = -1; skipped++; } @@ -1530,8 +2048,10 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const for(;j < dim ; ++j) if( outerSizeA_[j] != -1 ) break; - if( j < dim ) + if( j < dim ) { std::swap(outerSizeA_[i], outerSizeA_[j]); + std::swap(offsetA_[i], offsetA_[j]); + } } for(int i=0;i < dim ; ++i) if( outerSizeB_[i] == -1 ) @@ -1540,8 +2060,10 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const for(;j < dim ; ++j) if( outerSizeB_[j] != -1 ) break; - if( j < dim ) + if( j < dim ) { std::swap(outerSizeB_[i], outerSizeB_[j]); + std::swap(offsetB_[i], offsetB_[j]); + } } for(int i=0;i < dim ; ++i) if( perm_[i] == -1 ) @@ -1565,11 +2087,15 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const sizeA_[0] = 1; outerSizeA_[0] = 1; outerSizeB_[0] = 1; + offsetA_[0] = 0; + offsetB_[0] = 0; } else { perm_.resize(dim_); sizeA_.resize(dim_); outerSizeA_.resize(dim_); outerSizeB_.resize(dim_); + offsetA_.resize(dim_); + offsetB_.resize(dim_); // remove gaps in the perm, if requried (e.g., perm=3,1,0 -> 2,1,0) int currentValue = 0; @@ -1593,6 +2119,8 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const printVector(sizeA_,"sizeA"); printVector(outerSizeA_,"outerSizeA"); printVector(outerSizeB_,"outerSizeB"); + printf("innerStrideA: %lu\n",innerStrideA_); + printf("innerStrideB: %lu\n",innerStrideB_); #endif } @@ -1630,11 +2158,13 @@ void Transpose::fuseIndices() sizeA_[std::get<0>(tup)] *= sizeA_[std::get<1>(tup)]; outerSizeA_[std::get<0>(tup)] *= outerSizeA_[std::get<1>(tup)]; outerSizeA_[std::get<1>(tup)] = -1; + offsetA_[std::get<1>(tup)] = -1; auto pos1 = 
std::find(perm_.begin(), perm_.end(), std::get<0>(tup)) - perm_.begin(); auto pos2 = std::find(perm_.begin(), perm_.end(), std::get<1>(tup)) - perm_.begin(); outerSizeB_[pos1] *= outerSizeB_[pos2]; outerSizeB_[pos2] = -1; + offsetB_[pos2] = -1; } if ( fusedIndices.size() > 0 ) { @@ -1668,8 +2198,10 @@ void Transpose::fuseIndices() for(;j < dim_ ; ++j) if( outerSizeA_[j] != -1 ) break; - if( j < dim_ ) + if( j < dim_ ) { std::swap(outerSizeA_[i], outerSizeA_[j]); + std::swap(offsetA_[i], offsetA_[j]); + } } for(int i=0;i < dim_ ; ++i) if( outerSizeB_[i] == -1 ) @@ -1678,12 +2210,16 @@ void Transpose::fuseIndices() for(;j < dim_ ; ++j) if( outerSizeB_[j] != -1 ) break; - if( j < dim_ ) + if( j < dim_ ) { std::swap(outerSizeB_[i], outerSizeB_[j]); + std::swap(offsetB_[i], offsetB_[j]); + } } dim_ -= fusedIndices.size(); outerSizeA_.resize(dim_); outerSizeB_.resize(dim_); + offsetA_.resize(dim_); + offsetB_.resize(dim_); sizeA_.resize(dim_); perm_.resize(dim_); @@ -1693,13 +2229,19 @@ void Transpose::fuseIndices() printf("%d ",perm_[i]); printf("\nsizes_new: "); for(int i=0;i < dim_ ; ++i) - printf("%d ",sizeA_[i]); + printf("%lu ",sizeA_[i]); printf("\nouterSizeA_new: "); for(int i=0;i < dim_ ; ++i) - printf("%d ",outerSizeA_[i]); + printf("%lu ",outerSizeA_[i]); printf("\nouterSizeB_new: "); for(int i=0;i < dim_ ; ++i) - printf("%d ",outerSizeB_[i]); + printf("%lu ",outerSizeB_[i]); + printf("\noffsetA_new: "); + for(int i=0;i < dim_ ; ++i) + printf("%lu ",offsetA_[i]); + printf("\noffsetB_new: "); + for(int i=0;i < dim_ ; ++i) + printf("%lu ",offsetB_[i]); printf("\n"); #endif } @@ -1951,11 +2493,17 @@ void Transpose::createPlans( std::vector > &pla const int commId = (taskIdComm/numThreadsPerComm); // = 0,0,1,1,2,2 taskIdComm = taskIdComm % numThreadsPerComm; // local taskId in next communicator // 0,1,0,1,0,1 - currentNode->start = std::min( sizeA_[index], commId * workPerThread * currentNode->inc ); - currentNode->end = std::min( sizeA_[index], (commId+1) * 
workPerThread * currentNode->inc ); + if (index == 0) currentNode->indexA = true; + if (findPos(index, perm_) == 0) currentNode->indexB = true; + currentNode->start = std::min( sizeA_[index] + offsetB_[findPos(index, perm_)], (commId * workPerThread * currentNode->inc) + offsetB_[findPos(index, perm_)] ); + currentNode->end = std::min( sizeA_[index] + offsetB_[findPos(index, perm_)], ((commId+1) * workPerThread * currentNode->inc) + offsetB_[findPos(index, perm_)] ); currentNode->lda = lda_[index]; currentNode->ldb = ldb_[findPos(index, perm_)]; + currentNode->offDiffAB = (int)offsetA_[index] - (int)offsetB_[findPos(index, perm_)]; +#ifdef DEBUG + printf("(Task %d, Node %p) Level %d is IndexA %d, IndexB %d. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",taskId,currentNode,l,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB); +#endif if( perm_[0] != 0 || l != dim_-1 ){ currentNode->next = new ComputeNode; @@ -1966,12 +2514,17 @@ void Transpose::createPlans( std::vector > &pla //macro-kernel if( perm_[0] != 0 ) { + if (posStride1A_inB == 0) currentNode->indexB = true; currentNode->start = -1; currentNode->end = -1; currentNode->inc = -1; currentNode->lda = lda_[ posStride1B_inA ]; currentNode->ldb = ldb_[ posStride1A_inB ]; + currentNode->offDiffAB = (int)offsetA_[ posStride1B_inA ] - (int)offsetB_[ posStride1A_inB ]; currentNode->next = nullptr; +#ifdef DEBUG + printf(" Adjust Node (%p) IndexA: %d, IndexB: %d, Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",currentNode,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB); +#endif } } plans.push_back(plan); @@ -2078,7 +2631,7 @@ std::shared_ptr Transpose::selectPlan( const std::vectorinfoLevel_ > 0 ) - printf("We evaluated %d/%d candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); + printf("We 
evaluated %d/%zu candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); } return plans[bestPlan_id]; } diff --git a/src/utils.cpp b/src/utils.cpp index 7742bce..abb29f8 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -46,8 +46,8 @@ int factorial(int n){ return n * factorial(n-1); } -void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *perm, - int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpPerm, const int dim, const bool useRowMajor) +void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int *perm, + int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpOffsetA, int *tmpOffsetB, int *tmpPerm, const int dim, const bool useRowMajor) { for(int i=0; i < dim; ++i){ int idx = i; @@ -66,6 +66,14 @@ void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *oute tmpOuterSizeB[i] = sizeA[perm[idx]]; else tmpOuterSizeB[i] = outerSizeB[idx]; + if( offsetA == nullptr ) + tmpOffsetA[i] = 0; + else + tmpOffsetA[i] = offsetA[idx]; + if( offsetB == nullptr ) + tmpOffsetB[i] = 0; + else + tmpOffsetB[i] = offsetB[idx]; } } diff --git a/testframework/Makefile b/testframework/Makefile index 3e87780..38eca08 100644 --- a/testframework/Makefile +++ b/testframework/Makefile @@ -28,8 +28,11 @@ LIBS=-lhptt all: ${OBJ} ${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o testframework.exe +LIB_PATH+= $(OPENMP_LIB_PATH) +LIBS+= -lomp + %.o: %.cpp - ${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ + ${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ clean: rm -rf *.o benchmark.exe diff --git a/testframework/testframework.cpp b/testframework/testframework.cpp index bfc11d2..2b116ed 100644 --- a/testframework/testframework.cpp +++ b/testframework/testframework.cpp @@ -56,6 +56,9 @@ int equal_(const floatType *A, const floatType*B, int total_size){ } } } +#ifdef DEBUG + printf("\nNumber 
of Errors: %d of %d\n", error, total_size); +#endif return (error == 0) ? 1 : 0; } @@ -67,22 +70,41 @@ void restore(const floatType* A, floatType* B, size_t n) } template -static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, floatType &beta, +static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, + uint32_t *outerSizeA, uint32_t *outerSizeB, + uint32_t *offsetA, uint32_t *offsetB, + int &innerStrideA, int &innerStrideB, + floatType &beta, int &numThreads, - std::string &perm_str, - std::string &size_str, - const int total_size) + std::string &perm_str, std::string &size_str, + std::string &outerSizeA_str, std::string &offsetA_str, + std::string &outerSizeB_str, std::string &offsetB_str, + const int total_size, bool subTensors) { dim = (rand() % MAX_DIM) + 1; uint32_t maxSizeDim = std::max(1.0, std::pow(total_size, 1.0/dim)); std::vector perm_(dim); for(int i=0;i < dim ; ++i){ - size[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, 1.); + outerSizeA[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, 1.); + size[i] = outerSizeA[i]; + if (subTensors) + size[i] = std::max((((double)rand())/RAND_MAX) * outerSizeA[i], 1.); perm_[i] = i; } std::random_shuffle(perm_.begin(), perm_.end()); - for(int i=0;i < dim ; ++i) + for(int i=0;i < dim ; ++i) + { perm[i] = perm_[i]; + outerSizeB[i] = outerSizeA[perm[i]]; + offsetA[i] = 0; + offsetB[i] = 0; + if (subTensors) + { + outerSizeB[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, (double)size[perm[i]]); + offsetA[i] = std::max((((double)rand())/RAND_MAX) * (outerSizeA[i] - size[i]), 0.); + offsetB[i] = std::max((((double)rand())/RAND_MAX) * (outerSizeB[i] - size[perm[i]]), 0.); + } + } numThreads = std::max(std::round((((double)rand())/RAND_MAX) * 24), 1.); if( rand() > RAND_MAX/2 ) @@ -90,29 +112,52 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, floatType &b else beta = 4.0; + // Provide a larger inner stride if the tensor is less than a integer factor of 
the total size + int ordinalSizeA = std::accumulate(outerSizeA, outerSizeA+dim, 1, std::multiplies()); + int ordinalSizeB = std::accumulate(outerSizeB, outerSizeB+dim, 1, std::multiplies()); + innerStrideA = (ordinalSizeA < (total_size / (dim*4))) ? std::max((((double)rand())/RAND_MAX) * dim, 1.) : 1; + innerStrideB = (ordinalSizeB < (total_size / (dim*4))) ? std::max((((double)rand())/RAND_MAX) * dim, 1.) : 1; + for(int i=0;i < dim ; ++i){ perm_str += std::to_string(perm[i]) + " "; size_str += std::to_string(size[i]) + " "; + outerSizeA_str += std::to_string(outerSizeA[i]) + " "; + offsetA_str += std::to_string(offsetA[i]) + " "; + outerSizeB_str += std::to_string(outerSizeB[i]) + " "; + offsetB_str += std::to_string(offsetB[i]) + " "; } printf("dim: %d\n", dim); printf("beta: %f\n", std::real(beta)); printf("perm: %s\n", perm_str.c_str()); printf("size: %s\n", size_str.c_str()); + printf("outerSizeA: %s\n", outerSizeA_str.c_str()); + printf("outerSizeB: %s\n", outerSizeB_str.c_str()); + printf("offsetA: %s\n", offsetA_str.c_str()); + printf("offsetB: %s\n", offsetB_str.c_str()); + printf("ordinalSizeA: %d\n", ordinalSizeA); + printf("ordinalSizeB: %d\n", ordinalSizeB); + printf("innerStrideA: %d\n", innerStrideA); + printf("innerStrideB: %d\n", innerStrideB); printf("numThreads: %d\n",numThreads); } template -void runTests() +void runTests(bool subTensors = false) { int numThreads = 1; floatType alpha = 2.; floatType beta = 4.; - //beta = 0; srand(time(NULL)); int dim; uint32_t perm[MAX_DIM]; uint32_t size[MAX_DIM]; + uint32_t outerSizeA[MAX_DIM]; + uint32_t outerSizeB[MAX_DIM]; + uint32_t offsetA[MAX_DIM]; + uint32_t offsetB[MAX_DIM]; + int innerStrideA = 2; + int innerStrideB = 2; size_t total_size = 128*1024*1024; // Allocating memory for tensors @@ -136,37 +181,68 @@ void runTests() B_ref[i] = B[i]; B_hptt[i] = B[i]; } + printf("Total size: %lu\n", total_size); + printf("Last element of A: %f\n", A[total_size-1]); + printf("Last element of B: %f\n", 
B[total_size-1]); for(int j=0; j < NUM_TESTS; ++j) { std::string perm_str = ""; std::string size_str = ""; - getRandomTest(dim, perm, size, beta, numThreads, perm_str, size_str, total_size); + std::string outerSizeA_str = ""; + std::string outerSizeB_str = ""; + std::string offsetA_str = ""; + std::string offsetB_str = ""; + std::cout<<"Test "<( size, perm, dim, A, alpha, B_ref, beta, false); - + transpose_ref(size, perm, dim, A, alpha, outerSizeA_, offsetA_, innerStrideA, B_ref, beta, outerSizeB_, offsetB_, innerStrideB, false); restore(B, B_hptt, total_size); plan->execute(); if( !equal_(B_ref, B_hptt, total_size) ) { fprintf(stderr, "Error in HPTT.\n"); - fprintf(stderr,"%d OMP_NUM_THREADS=%d ./benchmark.exe %d %s %s\n",sizeof(floatType), numThreads, dim, perm_str.c_str(), size_str.c_str()); + fprintf(stderr,"%lu OMP_NUM_THREADS=%d ./benchmark.exe %d %s %s\n",sizeof(floatType), numThreads, dim, perm_str.c_str(), size_str.c_str()); exit(-1); } + std::cout << "Test " << j << " passed." << std::endl; } + std::cout << "All tests passed." << std::endl; free(A); free(B); free(B_ref); @@ -177,15 +253,19 @@ int main(int argc, char *argv[]) { printf("float tests: \n"); runTests(); + runTests(true); printf("double tests: \n"); runTests(); + runTests(true); printf("float complex tests: \n"); runTests(); + runTests(true); printf("double complex tests: \n"); runTests(); + runTests(true); printf("[SUCCESS] All tests have passed.\n"); return 0;