diff --git a/Makefile b/Makefile index 9283bc6..a11369d 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,14 @@ -CXX_FLAGS=-O3 -std=c++11 -DNDEBUG +CXX_FLAGS=-O3 -std=c++11 INCLUDE_PATH=-I./include/ ifeq ($(CXX),icpc) CXX_FLAGS += -qopenmp -xhost else ifeq ($(CXX),g++) -CXX_FLAGS += -fopenmp -march=native +CXX_FLAGS += -fopenmp -march=native -Wno-vla else ifeq ($(CXX),clang++) -CXX_FLAGS += -fopenmp -march=native +CXX_FLAGS += -fopenmp -march=native -fsanitize=address -Wno-vla-extension endif endif endif diff --git a/benchmark/Makefile b/benchmark/Makefile index c25f9c1..d01f2df 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -28,8 +28,11 @@ LIBS=-lhptt all: ${OBJ} ${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o benchmark.exe +LIB_PATH+= ${OPENMP_LIB_PATH} +LIBS+= -lomp + %.o: %.cpp - ${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ + ${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ clean: rm -rf *.o benchmark.exe diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp index c07a56c..4d4c520 100644 --- a/benchmark/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -192,7 +192,7 @@ int main(int argc, char *argv[]) restore(B, B_ref, total_size); trashCache(trash1, trash2, largerThanL3); auto begin_time = omp_get_wtime(); - transpose_ref( size, perm, dim, A, alpha, B_ref, beta, false); + transpose_ref( size, perm, dim, A, alpha, nullptr, nullptr, 1, B_ref, beta, nullptr, nullptr, 1, false); double elapsed_time = omp_get_wtime() - begin_time; minTime = (elapsed_time < minTime) ? 
elapsed_time : minTime; } diff --git a/benchmark/benchmark.sh b/benchmark/benchmark.sh old mode 100644 new mode 100755 diff --git a/benchmark/maxFromFiles.py b/benchmark/maxFromFiles.py index f3b39e6..ea60c1c 100644 --- a/benchmark/maxFromFiles.py +++ b/benchmark/maxFromFiles.py @@ -36,5 +36,5 @@ else: f2.write(f2Content[i]) except: - print "ERROR:", i + print("ERROR:", i) f2.close() diff --git a/benchmark/reference.cpp b/benchmark/reference.cpp index a74318a..3a18e28 100644 --- a/benchmark/reference.cpp +++ b/benchmark/reference.cpp @@ -16,68 +16,120 @@ template<typename floatType> void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const floatType* __restrict__ A, floatType alpha, - floatType* __restrict__ B, floatType beta, const bool conjA) + const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA, + floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA) { - // compute stride for all dimensions w.r.t. A + std::vector<int> tempOuterSizeA, tempOuterSizeB, tempOffsetA, tempOffsetB, tempPointerB; + + // Stride One is location of 0 in perm.
Perm[0] may not be B stride one unless perm[0] == 0 + // perm provided yields positions in A data from a B order index + // perm calculated below relates positions in B data to an A order index + tempPointerB.resize(dim); + for (int i = 0; i < dim; ++i) + for (int j = 0; j < dim; ++j) + if (i == perm[j]) + tempPointerB[i] = j; + + // Use default values if any of the pointers are nullptr + if (outerSizeA == nullptr) { + tempOuterSizeA.resize(dim); + for (int i = 0; i < dim; i++) tempOuterSizeA[i] = size[i]; + outerSizeA = tempOuterSizeA.data(); + } + + if (outerSizeB == nullptr) { + tempOuterSizeB.resize(dim); + for (int i = 0; i < dim; i++) tempOuterSizeB[i] = size[perm[i]]; + outerSizeB = tempOuterSizeB.data(); + } + + if (offsetA == nullptr) { + tempOffsetA.resize(dim); // Default to zeros + for (int i = 0; i < dim; i++) tempOffsetA[i] = 0; + offsetA = tempOffsetA.data(); + } + + if (offsetB == nullptr) { + tempOffsetB.resize(dim); // Default to zeros + for (int i = 0; i < dim; i++) tempOffsetB[i] = 0; + offsetB = tempOffsetB.data(); + } + + // compute stride for all dimensions w.r.t. A (like lda) uint32_t strideA[dim]; - strideA[0] = 1; + strideA[0] = innerStrideA; + for(int i=1; i < dim; ++i) + strideA[i] = strideA[i-1] * outerSizeA[i-1]; + + // compute stride for all dimensions w.r.t. 
B (like ldb) + uint32_t strideB[dim]; + strideB[0] = innerStrideB; for(int i=1; i < dim; ++i) - strideA[i] = strideA[i-1] * size[i-1]; + strideB[i] = strideB[i-1] * outerSizeB[i-1]; // combine all non-stride-one dimensions of B into a single dimension for // maximum parallelism uint32_t sizeOuter = 1; for(int i=0; i < dim; ++i) if( i != perm[0] ) - sizeOuter *= size[i]; + sizeOuter *= size[i]; uint32_t sizeInner = size[perm[0]]; + uint32_t strideAinner = strideA[perm[0]]; // This implementation traverses the output tensor in a linear fashion #pragma omp parallel for for(uint32_t j=0; j < sizeOuter; ++j) { - uint32_t offsetA = 0; - uint32_t offsetB = 0; - uint32_t j_tmp = j; + uint32_t indexOffsetA = 0; + uint32_t indexOffsetB = 0; + uint32_t j_tmp_A = j; + uint32_t j_tmp_B = j; for(int i=1; i < dim; ++i) { - int current_index = j_tmp % size[perm[i]]; - j_tmp /= size[perm[i]]; - offsetA += current_index * strideA[perm[i]]; + int current_index_A = j_tmp_A % size[perm[i]]; + j_tmp_A /= size[perm[i]]; + j_tmp_B /= size[perm[i]]; + indexOffsetA += (current_index_A + offsetA[perm[i]]) * strideA[perm[i]]; + indexOffsetB += (j_tmp_B + 1) * offsetB[i] * strideB[i]; + indexOffsetB += j_tmp_B * (outerSizeB[i] - offsetB[i] - size[perm[i]]) * strideB[i]; } - const floatType* __restrict__ A_ = A + offsetA; - floatType* __restrict__ B_ = B + j*sizeInner; - - uint32_t strideAinner = strideA[perm[0]]; + const floatType* __restrict__ A_ = A + indexOffsetA; + floatType* __restrict__ B_ = B + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB); if( beta == (floatType) 0 ) - for(int i=0; i < sizeInner; ++i) + for(int i=0; i < sizeInner; ++i) { +#ifdef DEBUG + //printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]); +#endif if( conjA ) - B_[i] = alpha * 
std::conj(A_[i * strideAinner]); + B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real(); else - B_[i] = alpha * A_[i * strideAinner]; + B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner];} else - for(int i=0; i < sizeInner; ++i) + for(int i=0; i < sizeInner; ++i) { +#ifdef DEBUG + //printf("A[%d] = %e -> B[%d] = %e\n", ((i + offsetA[perm[0]]) * strideAinner) + indexOffsetA, A_[(i + offsetA[perm[0]]) * strideAinner], (i * innerStrideB) + indexOffsetB + (offsetB[0] * innerStrideB) + (j * outerSizeB[0] * innerStrideB), B_[i * innerStrideB]); +#endif if( conjA ) - B_[i] = alpha * std::conj(A_[i * strideAinner]) + beta * B_[i]; + B_[i * innerStrideB] = alpha * std::conj(A_[(i + offsetA[perm[0]]) * strideAinner]).real() + beta * B_[i * innerStrideB]; else - B_[i] = alpha * A_[i * strideAinner] + beta * B_[i]; + B_[i * innerStrideB] = alpha * A_[(i + offsetA[perm[0]]) * strideAinner] + beta * B_[i * innerStrideB];} } } template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const float* __restrict__ A, float alpha, - float* __restrict__ B, float beta, const bool conjA); + const float* __restrict__ A, float alpha, int *outerSizeA, int *offsetA, int innerStrideA, + float* __restrict__ B, float beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const FloatComplex* __restrict__ A, FloatComplex alpha, - FloatComplex* __restrict__ B, FloatComplex beta, const bool conjA); + const FloatComplex* __restrict__ A, FloatComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA, + FloatComplex* __restrict__ B, FloatComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const double* __restrict__ A, double alpha, - double* __restrict__ B, double beta, const bool conjA); + const double* __restrict__ A, double alpha, 
int *outerSizeA, int *offsetA, int innerStrideA, + double* __restrict__ B, double beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); template void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const DoubleComplex* __restrict__ A, DoubleComplex alpha, - DoubleComplex* __restrict__ B, DoubleComplex beta, const bool conjA); + const DoubleComplex* __restrict__ A, DoubleComplex alpha, int *outerSizeA, int *offsetA, int innerStrideA, + DoubleComplex* __restrict__ B, DoubleComplex beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); + diff --git a/benchmark/reference.h b/benchmark/reference.h index 4f4a4a8..6209f84 100644 --- a/benchmark/reference.h +++ b/benchmark/reference.h @@ -2,5 +2,5 @@ template<typename floatType> void transpose_ref( uint32_t *size, uint32_t *perm, int dim, - const floatType* __restrict__ A, floatType alpha, - floatType* __restrict__ B, floatType beta, const bool conjA); + const floatType* __restrict__ A, floatType alpha, int *outerSizeA, int *offsetA, int innerStrideA, + floatType* __restrict__ B, floatType beta, int *outerSizeB, int *offsetB, int innerStrideB, const bool conjA); diff --git a/include/compute_node.h b/include/compute_node.h index b777857..8a31d95 100644 --- a/include/compute_node.h +++ b/include/compute_node.h @@ -5,10 +5,10 @@ namespace hptt { /** * \brief A ComputNode encodes a loop. */ -class ComputeNode +class alignas(16) ComputeNode { public: - ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), next(nullptr) {} + ComputeNode() : start(-1), end(-1), inc(-1), lda(-1), ldb(-1), indexA(false), indexB(false), offDiffAB(std::numeric_limits<int>::min()), next(nullptr) {} ~ComputeNode() { if ( next != nullptr ) @@ -20,6 +20,9 @@ class ComputeNode size_t inc; //!< increment for at the current loop size_t lda; //!< stride of A w.r.t. the loop index size_t ldb; //!< stride of B w.r.t.
the loop index + bool indexA; //!< true if index of A is innermost (0) + bool indexB; //!< true if index of B is innermost (0) + int offDiffAB; //!< difference in offset A and B (i.e., A - B) at the current loop ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called) }; diff --git a/include/hptt.h b/include/hptt.h index dda7f0a..31e56d8 100644 --- a/include/hptt.h +++ b/include/hptt.h @@ -168,12 +168,26 @@ namespace hptt { * * This parameter may be NULL, indicating that the outer-size is equal to sizeA. * * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A. + * * This parameter may be NULL, indicating that the innerStrideA is equal to 1. + * * If innerStrideA is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideA. * \param[in] beta scaling factor for B * \param[inout] B Pointer to the raw-data of the output tensor B * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B. * * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). * * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B + * * This parameter may be NULL, indicating that the offset is zero. 
+ * * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B. + * * This parameter may be NULL, indicating that the innerStrideB is equal to 1. + * * If innerStrideB is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideB. * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details. * ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE) * then the output data will be used during the @@ -186,6 +200,8 @@ namespace hptt { * HPTT from within a parallel region (i.e., via execute_expert()). * \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major). */ +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets + * or innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). */ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, @@ -210,7 +226,8 @@ std::shared_ptr > create_plan( const int *perm, c const SelectionMethod selectionMethod, const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, no Offsets + * or innerStride, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ std::shared_ptr > create_plan( const std::vector &perm, const int dim, const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const float beta, float *B, const std::vector &outerSizeB, @@ -235,8 +252,8 @@ std::shared_ptr > create_plan( const std::vector< const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); - - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, @@ -260,6 +277,162 @@ std::shared_ptr > create_plan( const int *perm, c const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int maxAutotuningCandidates, const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, no innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, no innerStrides, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, --int-- innerStrides, and --ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, --int-- innerStrides, and --vector ints-- (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds = {}, const bool useRowMajor = false); + +/* Methods with (floats, doubles, FloatComplexes, and 
DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds = nullptr, const bool useRowMajor = false); } extern "C" @@ -282,12 +455,26 @@ extern "C" * * This parameter may be NULL, indicating that the outer-size is equal to 
sizeA. * * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A. + * * This parameter may be NULL, indicating that the innerStrideA is equal to 1. + * * If innerStrideA is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideA. * \param[in] beta scaling factor for B * \param[inout] B Pointer to the raw-data of the output tensor B * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B. * * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). * * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B. + * * This parameter may be NULL, indicating that the innerStrideB is equal to 1. + * * If innerStrideB is not NULL, the raw-data size must exceed \product_{i} (dim_{i}) * innerStrideB. * \param[in] numThreads number of threads that participate in this tensor transposition. 
* \param[in] useRowMajor This flag indicates whether a row-major memory layout should be used (default: off = column-major). */ @@ -310,4 +497,46 @@ void zTensorTranspose( const int *perm, const int dim, const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const double _Complex beta, double _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor = 0); + +/* With Offsets */ +void sOffsetTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +void dOffsetTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +void cOffsetTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +void zOffsetTensorTranspose( const int *perm, const int dim, + const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor = 0); + +/* With Offsets and InnerStrides */ +void sInnerStrideTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int 
*offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); + +void dInnerStrideTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); + +void cInnerStrideTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); + +void zInnerStrideTensorTranspose( const int *perm, const int dim, + const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor = 0); } diff --git a/include/transpose.h b/include/transpose.h index 82f0239..8b42147 100644 --- a/include/transpose.h +++ b/include/transpose.h @@ -53,12 +53,22 @@ class Transpose * * This parameter may be NULL, indicating that the outer-size is equal to sizeA. * * If outerSizeA is not NULL, outerSizeA[i] >= sizeA[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetA dim-dimensional array that stores the offsets in each dimension of A + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetA is not NULL, outerSizeA[i] >= offsetA[i] + sizeA[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. 
+ * \param[in] innerStrideA integer storing a non-unitary stride for the innermost dimension of A. * \param[in] beta scaling factor for B * \param[inout] B Pointer to the raw-data of the output tensor B * \param[in] outerSizeB dim-dimensional array that stores the outer-sizes of each dimension of B. * * This parameter may be NULL, indicating that the outer-size is equal to the perm(sizeA). * * If outerSizeA is not NULL, outerSizeB[i] >= perm(sizeA)[i] for all 0 <= i < dim must hold. * * This option enables HPTT to operate on sub-tensors. + * \param[in] offsetB dim-dimensional array that stores the offsets in each dimension of B + * * This parameter may be NULL, indicating that the offset is zero. + * * If offsetB is not NULL, outerSizeB[i] >= offsetB[i] + sizeB[i] >= 0 for all 0 <= i < dim must hold. + * * This option enables HPTT to operate on intermediate sub-tensors. + * \param[in] innerStrideB integer storing a non-unitary stride for the innermost dimension of B. * \param[in] selectionMethod Determines if auto-tuning should be used. See hptt::SelectionMethod for details. 
* ATTENTION: If you enable auto-tuning (e.g., hptt::MEASURE) * then the output data will be used during the @@ -77,6 +87,10 @@ class Transpose const int *perm, const int *outerSizeA, const int *outerSizeB, + const int *offsetA, + const int *offsetB, + const size_t innerStrideA, + const size_t innerStrideB, const int dim, const floatType *A, const floatType alpha, @@ -211,7 +225,7 @@ class Transpose void createPlans( std::vector > &plans ) const; std::shared_ptr selectPlan( const std::vector > &plans ); void fuseIndices(); - void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int dim); + void skipIndices(const int *_sizeA, const int* _perm, const int *_outerSizeA, const int *_outerSizeB, const int *_offsetA, const int *_offsetB, const int dim); void computeLeadingDimensions(); double loopCostHeuristic( const std::vector &loopOrder ) const; double parallelismCostHeuristic( const std::vector &loopOrder ) const; @@ -233,7 +247,8 @@ class Transpose const std::vector &loopsAllowed) const; float getLoadBalance( const std::vector ¶llelismStrategy ) const; float estimateExecutionTime( const std::shared_ptr plan); //execute just a few iterations and extrapolate the result - void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const; + void verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, + const int* offsetB, const size_t innerStrideA, const size_t innerStrideB, const int dim) const; void getBestParallelismStrategy ( std::vector &bestParallelismStrategy ) const; void getBestLoopOrder( std::vector &loopOrder ) const; //innermost loop idx is stored at dim_-1 void getLoopOrders(std::vector > &loopOrders) const; @@ -256,6 +271,10 @@ class Transpose std::vector perm_; //!< permutation std::vector outerSizeA_; //!< outer sizes of A std::vector outerSizeB_; //!< outer sizes of B + std::vector 
offsetA_; //!< offsets of A + std::vector offsetB_; //!< offsets of B + size_t innerStrideA_; //!< innerStride of A + size_t innerStrideB_; //!< innerStride of B std::vector lda_; //!< strides for all dimensions of A (first dimension has a stride of 1) std::vector ldb_; //!< strides for all dimensions of B (first dimension has a stride of 1) std::vector threadIds_; //!< OpenMP threadIds of the threads involed in the transposition diff --git a/include/utils.h b/include/utils.h index a85b27c..b2b1147 100644 --- a/include/utils.h +++ b/include/utils.h @@ -76,8 +76,8 @@ int findPos(int value, const int *array, int n); int factorial(int n); -void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *perm, - int *tmpSizeA, int *tmpOuterSizeA, int *tmpouterSizeB, int *tmpPerm, const int dim, const bool useRowMajor); +void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int *perm, + int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpOffsetA, int *tmpOffsetB, int *tmpPerm, const int dim, const bool useRowMajor); } diff --git a/src/hptt.cpp b/src/hptt.cpp index ea761c8..4a31be0 100644 --- a/src/hptt.cpp +++ b/src/hptt.cpp @@ -19,13 +19,15 @@ namespace hptt { +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); return plan; } @@ -35,7 +37,7 @@ std::shared_ptr > create_plan( const int *perm, const in const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); return plan; } @@ -45,7 +47,7 @@ std::shared_ptr > create_plan( const int *perm, co const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); return plan; } @@ -55,19 +57,19 @@ std::shared_ptr > create_plan( const int *perm, c const SelectionMethod selectionMethod, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + auto 
plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); return plan; } - - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * no Offsets or innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */ std::shared_ptr > create_plan( const std::vector &perm, const int dim, const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const float beta, float *B, const std::vector &outerSizeB, const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); return plan; } @@ -77,7 +79,7 @@ std::shared_ptr > create_plan( const std::vector &p const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); return plan; } @@ -87,7 +89,7 @@ std::shared_ptr > create_plan( const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); return plan; } @@ -97,18 +99,19 @@ std::shared_ptr > create_plan( const std::vector< const SelectionMethod selectionMethod, const int numThreads, const std::vector &threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); return plan; } - +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * no Offsets or innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ std::shared_ptr > create_plan( const int *perm, const int dim, const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const float beta, float *B, const int *outerSizeB, const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); plan->setMaxAutotuningCandidates(maxAutotuningCandidates); plan->createPlan(); return plan; @@ -120,7 +123,7 @@ std::shared_ptr > create_plan( const int *perm, const in const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); plan->setMaxAutotuningCandidates(maxAutotuningCandidates); plan->createPlan(); return plan; @@ -132,7 +135,7 @@ std::shared_ptr > create_plan( const int *perm, co const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); plan->createPlan(); return plan; } @@ -143,7 +146,274 @@ std::shared_ptr > create_plan( const int *perm, c const int maxAutotuningCandidates, const int numThreads, const int *threadIds, const bool useRowMajor) { - auto plan(std::make_shared >(sizeA, 
perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int 
*threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, no innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], 1, 1, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, no innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, 
offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor)); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int 
*sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, threadIds, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), SelectionMethod Class, + * --vector int-- Offsets, --int-- innerStrides, and vector ints (sizeA, outerSizeA, and outerSizeB). */ +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const float alpha, const float *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const float beta, float *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const double alpha, const double *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const double beta, double *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? 
&threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +std::shared_ptr > create_plan( const std::vector &perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const std::vector &sizeA, const std::vector &outerSizeA, const std::vector &offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const std::vector &outerSizeB, const std::vector &offsetB, const int innerStrideB, + const SelectionMethod selectionMethod, + const int numThreads, const std::vector &threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(&sizeA[0], &perm[0], &outerSizeA[0], &outerSizeB[0], &offsetA[0], &offsetB[0], innerStrideA, innerStrideB, dim, A, alpha, B, beta, selectionMethod, numThreads, (threadIds.size() > 0 ) ? &threadIds[0] : nullptr, useRowMajor )); + return plan; +} + +/* Methods with (floats, doubles, FloatComplexes, and DoubleComplexes) (alpha, A, beta, and B), --int-- maxAutotuningCandidates, + * --int-- Offsets, --int-- innerStrides, and ints (sizeA, outerSizeA, and outerSizeB). 
*/ +std::shared_ptr > create_plan( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const FloatComplex alpha, const FloatComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const FloatComplex beta, FloatComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor)); + plan->setMaxAutotuningCandidates(maxAutotuningCandidates); + plan->createPlan(); + 
return plan; +} + +std::shared_ptr > create_plan( const int *perm, const int dim, + const DoubleComplex alpha, const DoubleComplex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const DoubleComplex beta, DoubleComplex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int maxAutotuningCandidates, + const int numThreads, const int *threadIds, const bool useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, A, alpha, B, beta, MEASURE, numThreads, threadIds, useRowMajor )); plan->setMaxAutotuningCandidates(maxAutotuningCandidates); plan->createPlan(); return plan; @@ -157,7 +427,7 @@ void sTensorTranspose( const int *perm, const int dim, const float beta, float *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->execute(); } @@ -166,45 +436,115 @@ void dTensorTranspose( const int *perm, const int dim, const double beta, double *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->execute(); } void cTensorTranspose( const int *perm, const int dim, const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, - const float _Complex beta, float _Complex *B, const int *outerSizeB, + const float _Complex beta, 
float _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, + (const hptt::FloatComplex*) A, hptt::FloatComplex(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, hptt::FloatComplex(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } void zTensorTranspose( const int *perm, const int dim, const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, - const double _Complex beta, double _Complex *B, const int *outerSizeB, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int numThreads, const int useRowMajor) { - auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, dim, - (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, nullptr, nullptr, 1, 1, dim, + (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); plan->setConjA(conjA); plan->execute(); } -} - - - - - - - - +/* With Offset */ +void sOffsetTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, 
outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, + dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void dOffsetTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, + dim, A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void cOffsetTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, + (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + plan->execute(); +} +void zOffsetTensorTranspose( const int *perm, const int dim, + const double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, 1, 1, dim, + (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + 
plan->execute(); +} +/* With Offset and innerStride */ +void sInnerStrideTensorTranspose( const int *perm, const int dim, + const float alpha, const float *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float beta, float *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void dInnerStrideTensorTranspose( const int *perm, const int dim, + const double alpha, const double *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double beta, double *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + A, alpha, B, beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->execute(); +} +void cInnerStrideTensorTranspose( const int *perm, const int dim, + const float _Complex alpha, bool conjA, const float _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const float _Complex beta, float _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + (const hptt::FloatComplex*) A, (hptt::FloatComplex)(__real__ alpha, __imag__ alpha), (hptt::FloatComplex*) B, (hptt::FloatComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + plan->execute(); +} +void zInnerStrideTensorTranspose( const int *perm, const int dim, + const 
double _Complex alpha, bool conjA, const double _Complex *A, const int *sizeA, const int *outerSizeA, const int *offsetA, const int innerStrideA, + const double _Complex beta, double _Complex *B, const int *outerSizeB, const int *offsetB, const int innerStrideB, + const int numThreads, const int useRowMajor) +{ + auto plan(std::make_shared >(sizeA, perm, outerSizeA, outerSizeB, offsetA, offsetB, innerStrideA, innerStrideB, dim, + (const hptt::DoubleComplex*) A, (hptt::DoubleComplex)(__real__ alpha, __imag__ alpha), (hptt::DoubleComplex*) B, (hptt::DoubleComplex)(__real__ beta, __imag__ beta), hptt::ESTIMATE, numThreads, nullptr, useRowMajor)); + plan->setConjA(conjA); + plan->execute(); +} +} diff --git a/src/transpose.cpp b/src/transpose.cpp index f77cd5b..a59de89 100644 --- a/src/transpose.cpp +++ b/src/transpose.cpp @@ -34,24 +34,30 @@ namespace hptt { template struct micro_kernel { - static void execute(const floatType* __restrict__ A, const size_t lda, floatType* __restrict__ B, const size_t ldb, const floatType alpha, const floatType beta) + static void execute(const floatType* __restrict__ A, const size_t lda, const size_t innerStrideA, floatType* __restrict__ B, const size_t ldb, const size_t innerStrideB, const floatType alpha, const floatType beta) { constexpr int n = (REGISTER_BITS/8) / sizeof(floatType); if( betaIsZero ) for(int j=0; j < n; ++j) - for(int i=0; i < n; ++i) + for(int i=0; i < n; ++i){ +#ifdef DEBUG + //printf("B[+%zu] = %e -> A[+%zu] = %e\n", (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)], (j * innerStrideA) + (lda * i), A[(j * innerStrideA) + (lda * i)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]); + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(j * innerStrideA) + (lda * i)]); else - B[i + j * ldb] = alpha * A[i * lda + j]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(j * innerStrideA) + (lda * i)];} else for(int j=0; j < n; ++j) - for(int i=0; i < n; ++i) + for(int i=0; i < n; ++i) 
{ +#ifdef DEBUG + //printf("B[+%zu] = %e -> A[+%zu] = %e\n", (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)], (j * innerStrideA) + (lda * i), A[(j * innerStrideA) + (lda * i)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(j * innerStrideA) + (lda * i)]) + beta * B[(i * innerStrideB) + (j * ldb)]; else - B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(j * innerStrideA) + (lda * i)] + beta * B[(i * innerStrideB) + (j * ldb)];} } }; @@ -77,15 +83,28 @@ static INLINE void prefetch(const floatType* A, const int lda) template struct micro_kernel { - static void execute(const double* __restrict__ A, const size_t lda, double* __restrict__ B, const size_t ldb, const double alpha ,const double beta) + static void execute(const double* __restrict__ A, const size_t lda, const size_t innerStrideA, double* __restrict__ B, const size_t ldb, const size_t innerStrideB, const double alpha ,const double beta) { +#ifdef DEBUG + printf("In AVX micro_kernel::execute\n", betaIsZero, conjA); +#endif __m256d reg_alpha = _mm256_set1_pd(alpha); // do not alter the content of B __m256d reg_beta = _mm256_set1_pd(beta); // do not alter the content of B //Load A - __m256d rowA0 = _mm256_loadu_pd((A +0*lda)); - __m256d rowA1 = _mm256_loadu_pd((A +1*lda)); - __m256d rowA2 = _mm256_loadu_pd((A +2*lda)); - __m256d rowA3 = _mm256_loadu_pd((A +3*lda)); + __m256d rowA0, rowA1, rowA2, rowA3; + __m256i indicesA; + if (innerStrideA != 1) { + indicesA = _mm256_set_epi32(7 * innerStrideA, 6 * innerStrideA, 5 * innerStrideA, 4 * innerStrideA, 3 * innerStrideA, 2 * innerStrideA, 1 * innerStrideA, 0 * innerStrideA); + rowA0 = _mm256_i32gather_pd((A +0*lda), indicesA, sizeof(double)); + rowA1 = _mm256_i32gather_pd((A +1*lda), indicesA, sizeof(double)); + rowA2 = _mm256_i32gather_pd((A +2*lda), indicesA, sizeof(double)); + rowA3 
= _mm256_i32gather_pd((A +3*lda), indicesA, sizeof(double)); + } else { + rowA0 = _mm256_loadu_pd((A +0*lda)); + rowA1 = _mm256_loadu_pd((A +1*lda)); + rowA2 = _mm256_loadu_pd((A +2*lda)); + rowA3 = _mm256_loadu_pd((A +3*lda)); + } //4x4 transpose micro kernel __m256d r4, r34, r3, r33; @@ -107,26 +126,51 @@ struct micro_kernel //Load B if( !betaIsZero ) { - __m256d rowB0 = _mm256_loadu_pd((B +0*ldb)); - __m256d rowB1 = _mm256_loadu_pd((B +1*ldb)); - __m256d rowB2 = _mm256_loadu_pd((B +2*ldb)); - __m256d rowB3 = _mm256_loadu_pd((B +3*ldb)); + __m256d rowB0, rowB1, rowB2, rowB3; + __m256i indicesB; + if (innerStrideB != 1) { + indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + rowB0 = _mm256_i32gather_pd((B +0*ldb), indicesB, sizeof(double)); + rowB1 = _mm256_i32gather_pd((B +1*ldb), indicesB, sizeof(double)); + rowB2 = _mm256_i32gather_pd((B +2*ldb), indicesB, sizeof(double)); + rowB3 = _mm256_i32gather_pd((B +3*ldb), indicesB, sizeof(double)); + } else { + rowB0 = _mm256_loadu_pd((B +0*ldb)); + rowB1 = _mm256_loadu_pd((B +1*ldb)); + rowB2 = _mm256_loadu_pd((B +2*ldb)); + rowB3 = _mm256_loadu_pd((B +3*ldb)); + } rowB0 = _mm256_add_pd( _mm256_mul_pd(rowB0, reg_beta), rowA0); rowB1 = _mm256_add_pd( _mm256_mul_pd(rowB1, reg_beta), rowA1); rowB2 = _mm256_add_pd( _mm256_mul_pd(rowB2, reg_beta), rowA2); rowB3 = _mm256_add_pd( _mm256_mul_pd(rowB3, reg_beta), rowA3); //Store B - _mm256_storeu_pd((B + 0 * ldb), rowB0); - _mm256_storeu_pd((B + 1 * ldb), rowB1); - _mm256_storeu_pd((B + 2 * ldb), rowB2); - _mm256_storeu_pd((B + 3 * ldb), rowB3); + if (innerStrideB != 1) { + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowB0, sizeof(double)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowB1, sizeof(double)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowB2, sizeof(double)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowB3, sizeof(double)); + 
} else { + _mm256_storeu_pd((B + 0 * ldb), rowB0); + _mm256_storeu_pd((B + 1 * ldb), rowB1); + _mm256_storeu_pd((B + 2 * ldb), rowB2); + _mm256_storeu_pd((B + 3 * ldb), rowB3); + } } else { //Store B - _mm256_storeu_pd((B + 0 * ldb), rowA0); - _mm256_storeu_pd((B + 1 * ldb), rowA1); - _mm256_storeu_pd((B + 2 * ldb), rowA2); - _mm256_storeu_pd((B + 3 * ldb), rowA3); + if (innerStrideB != 1) { + __m256i indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowA0, sizeof(double)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowA1, sizeof(double)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowA2, sizeof(double)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowA3, sizeof(double)); + } else { + _mm256_storeu_pd((B + 0 * ldb), rowA0); + _mm256_storeu_pd((B + 1 * ldb), rowA1); + _mm256_storeu_pd((B + 2 * ldb), rowA2); + _mm256_storeu_pd((B + 3 * ldb), rowA3); + } } } }; @@ -134,19 +178,36 @@ struct micro_kernel template struct micro_kernel { - static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta) + static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideB, const float alpha ,const float beta) { +#ifdef DEBUG + printf("In AVX micro_kernel::execute\n", betaIsZero, conjA); +#endif __m256 reg_alpha = _mm256_set1_ps(alpha); // do not alter the content of B __m256 reg_beta = _mm256_set1_ps(beta); // do not alter the content of B //Load A - __m256 rowA0 = _mm256_loadu_ps((A +0*lda)); - __m256 rowA1 = _mm256_loadu_ps((A +1*lda)); - __m256 rowA2 = _mm256_loadu_ps((A +2*lda)); - __m256 rowA3 = _mm256_loadu_ps((A +3*lda)); - __m256 rowA4 = _mm256_loadu_ps((A +4*lda)); - __m256 rowA5 = _mm256_loadu_ps((A +5*lda)); - 
__m256 rowA6 = _mm256_loadu_ps((A +6*lda)); - __m256 rowA7 = _mm256_loadu_ps((A +7*lda)); + __m256 rowA0, rowA1, rowA2, rowA3, rowA4, rowA5, rowA6, rowA7; + __m256i indicesA; + if (innerStrideA == 1) { + rowA0 = _mm256_loadu_ps((A +0*lda)); + rowA1 = _mm256_loadu_ps((A +1*lda)); + rowA2 = _mm256_loadu_ps((A +2*lda)); + rowA3 = _mm256_loadu_ps((A +3*lda)); + rowA4 = _mm256_loadu_ps((A +4*lda)); + rowA5 = _mm256_loadu_ps((A +5*lda)); + rowA6 = _mm256_loadu_ps((A +6*lda)); + rowA7 = _mm256_loadu_ps((A +7*lda)); + } else { + indicesA = _mm256_set_epi32(7 * innerStrideA, 6 * innerStrideA, 5 * innerStrideA, 4 * innerStrideA, 3 * innerStrideA, 2 * innerStrideA, 1 * innerStrideA, 0 * innerStrideA); + rowA0 = _mm256_i32gather_ps((A +0*lda), indicesA, sizeof(float)); + rowA1 = _mm256_i32gather_ps((A +1*lda), indicesA, sizeof(float)); + rowA2 = _mm256_i32gather_ps((A +2*lda), indicesA, sizeof(float)); + rowA3 = _mm256_i32gather_ps((A +3*lda), indicesA, sizeof(float)); + rowA4 = _mm256_i32gather_ps((A +4*lda), indicesA, sizeof(float)); + rowA5 = _mm256_i32gather_ps((A +5*lda), indicesA, sizeof(float)); + rowA6 = _mm256_i32gather_ps((A +6*lda), indicesA, sizeof(float)); + rowA7 = _mm256_i32gather_ps((A +7*lda), indicesA, sizeof(float)); + } //8x8 transpose micro kernel __m256 r121, r139, r120, r138, r71, r89, r70, r88, r11, r1, r55, r29, r10, r0, r54, r28; @@ -188,14 +249,28 @@ struct micro_kernel //Load B if( !betaIsZero ) { - __m256 rowB0 = _mm256_loadu_ps((B +0*ldb)); - __m256 rowB1 = _mm256_loadu_ps((B +1*ldb)); - __m256 rowB2 = _mm256_loadu_ps((B +2*ldb)); - __m256 rowB3 = _mm256_loadu_ps((B +3*ldb)); - __m256 rowB4 = _mm256_loadu_ps((B +4*ldb)); - __m256 rowB5 = _mm256_loadu_ps((B +5*ldb)); - __m256 rowB6 = _mm256_loadu_ps((B +6*ldb)); - __m256 rowB7 = _mm256_loadu_ps((B +7*ldb)); + __m256 rowB0, rowB1, rowB2, rowB3, rowB4, rowB5, rowB6, rowB7; + __m256i indicesB; + if (innerStrideB != 1) { + indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * 
innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + rowB0 = _mm256_i32gather_ps((B +0*ldb), indicesB, sizeof(float)); + rowB1 = _mm256_i32gather_ps((B +1*ldb), indicesB, sizeof(float)); + rowB2 = _mm256_i32gather_ps((B +2*ldb), indicesB, sizeof(float)); + rowB3 = _mm256_i32gather_ps((B +3*ldb), indicesB, sizeof(float)); + rowB4 = _mm256_i32gather_ps((B +4*ldb), indicesB, sizeof(float)); + rowB5 = _mm256_i32gather_ps((B +5*ldb), indicesB, sizeof(float)); + rowB6 = _mm256_i32gather_ps((B +6*ldb), indicesB, sizeof(float)); + rowB7 = _mm256_i32gather_ps((B +7*ldb), indicesB, sizeof(float)); + } else { + rowB0 = _mm256_loadu_ps((B +0*ldb)); + rowB1 = _mm256_loadu_ps((B +1*ldb)); + rowB2 = _mm256_loadu_ps((B +2*ldb)); + rowB3 = _mm256_loadu_ps((B +3*ldb)); + rowB4 = _mm256_loadu_ps((B +4*ldb)); + rowB5 = _mm256_loadu_ps((B +5*ldb)); + rowB6 = _mm256_loadu_ps((B +6*ldb)); + rowB7 = _mm256_loadu_ps((B +7*ldb)); + } rowB0 = _mm256_add_ps( _mm256_mul_ps(rowB0, reg_beta), rowA0); rowB1 = _mm256_add_ps( _mm256_mul_ps(rowB1, reg_beta), rowA1); @@ -206,23 +281,46 @@ struct micro_kernel rowB6 = _mm256_add_ps( _mm256_mul_ps(rowB6, reg_beta), rowA6); rowB7 = _mm256_add_ps( _mm256_mul_ps(rowB7, reg_beta), rowA7); //Store B - _mm256_storeu_ps((B + 0 * ldb), rowB0); - _mm256_storeu_ps((B + 1 * ldb), rowB1); - _mm256_storeu_ps((B + 2 * ldb), rowB2); - _mm256_storeu_ps((B + 3 * ldb), rowB3); - _mm256_storeu_ps((B + 4 * ldb), rowB4); - _mm256_storeu_ps((B + 5 * ldb), rowB5); - _mm256_storeu_ps((B + 6 * ldb), rowB6); - _mm256_storeu_ps((B + 7 * ldb), rowB7); + if (innerStrideB != 1) { + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowB0, sizeof(float)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowB1, sizeof(float)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowB2, sizeof(float)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowB3, sizeof(float)); + _mm256_i32scatter_epi32((B +4*ldb), indicesB, rowB4, sizeof(float)); + 
_mm256_i32scatter_epi32((B +5*ldb), indicesB, rowB5, sizeof(float)); + _mm256_i32scatter_epi32((B +6*ldb), indicesB, rowB6, sizeof(float)); + _mm256_i32scatter_epi32((B +7*ldb), indicesB, rowB7, sizeof(float)); + } else { + _mm256_storeu_ps((B + 0 * ldb), rowB0); + _mm256_storeu_ps((B + 1 * ldb), rowB1); + _mm256_storeu_ps((B + 2 * ldb), rowB2); + _mm256_storeu_ps((B + 3 * ldb), rowB3); + _mm256_storeu_ps((B + 4 * ldb), rowB4); + _mm256_storeu_ps((B + 5 * ldb), rowB5); + _mm256_storeu_ps((B + 6 * ldb), rowB6); + _mm256_storeu_ps((B + 7 * ldb), rowB7); + } } else { - _mm256_storeu_ps((B + 0 * ldb), rowA0); - _mm256_storeu_ps((B + 1 * ldb), rowA1); - _mm256_storeu_ps((B + 2 * ldb), rowA2); - _mm256_storeu_ps((B + 3 * ldb), rowA3); - _mm256_storeu_ps((B + 4 * ldb), rowA4); - _mm256_storeu_ps((B + 5 * ldb), rowA5); - _mm256_storeu_ps((B + 6 * ldb), rowA6); - _mm256_storeu_ps((B + 7 * ldb), rowA7); + if (innerStrideB != 1) { + __m256i indicesB = _mm256_set_epi32(7 * innerStrideB, 6 * innerStrideB, 5 * innerStrideB, 4 * innerStrideB, 3 * innerStrideB, 2 * innerStrideB, 1 * innerStrideB, 0 * innerStrideB); + _mm256_i32scatter_epi32((B +0*ldb), indicesB, rowA0, sizeof(float)); + _mm256_i32scatter_epi32((B +1*ldb), indicesB, rowA1, sizeof(float)); + _mm256_i32scatter_epi32((B +2*ldb), indicesB, rowA2, sizeof(float)); + _mm256_i32scatter_epi32((B +3*ldb), indicesB, rowA3, sizeof(float)); + _mm256_i32scatter_epi32((B +4*ldb), indicesB, rowA4, sizeof(float)); + _mm256_i32scatter_epi32((B +5*ldb), indicesB, rowA5, sizeof(float)); + _mm256_i32scatter_epi32((B +6*ldb), indicesB, rowA6, sizeof(float)); + _mm256_i32scatter_epi32((B +7*ldb), indicesB, rowA7, sizeof(float)); + } else { + _mm256_storeu_ps((B + 0 * ldb), rowA0); + _mm256_storeu_ps((B + 1 * ldb), rowA1); + _mm256_storeu_ps((B + 2 * ldb), rowA2); + _mm256_storeu_ps((B + 3 * ldb), rowA3); + _mm256_storeu_ps((B + 4 * ldb), rowA4); + _mm256_storeu_ps((B + 5 * ldb), rowA5); + _mm256_storeu_ps((B + 6 * ldb), rowA6); + 
_mm256_storeu_ps((B + 7 * ldb), rowA7); + } } } }; @@ -246,16 +344,62 @@ static INLINE void prefetch(const floatType* A, const int lda) { } template struct micro_kernel { - static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta) + static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideA, const float alpha ,const float beta) { +#ifdef DEBUG + printf("In ARM micro_kernel::execute\n", betaIsZero, conjA); +#endif float32x4_t reg_alpha = vdupq_n_f32(alpha); float32x4_t reg_beta = vdupq_n_f32(beta); //Load A - float32x4_t rowA0 = vld1q_f32((A +0*lda)); - float32x4_t rowA1 = vld1q_f32((A +1*lda)); - float32x4_t rowA2 = vld1q_f32((A +2*lda)); - float32x4_t rowA3 = vld1q_f32((A +3*lda)); + float32x4_t rowA0, rowA1, rowA2, rowA3; + if (innerStrideA == 1) { + float32x4_t rowA0 = vld1q_f32((A +0*lda)); + float32x4_t rowA1 = vld1q_f32((A +1*lda)); + float32x4_t rowA2 = vld1q_f32((A +2*lda)); + float32x4_t rowA3 = vld1q_f32((A +3*lda)); + } else if (innerStrideA == 2) { + float32x4_t rowA0 = vld2q_f32((A +0*lda)).val[0]; + float32x4_t rowA1 = vld2q_f32((A +1*lda)).val[0]; + float32x4_t rowA2 = vld2q_f32((A +2*lda)).val[0]; + float32x4_t rowA3 = vld2q_f32((A +3*lda)).val[0]; + } else if (innerStrideA == 3) { + float32x4_t rowA0 = vld3q_f32((A +0*lda)).val[0]; + float32x4_t rowA1 = vld3q_f32((A +1*lda)).val[0]; + float32x4_t rowA2 = vld3q_f32((A +2*lda)).val[0]; + float32x4_t rowA3 = vld3q_f32((A +3*lda)).val[0]; + } else if (innerStrideA == 4) { + float32x4_t rowA0 = vld4q_f32((A +0*lda)).val[0]; + float32x4_t rowA1 = vld4q_f32((A +1*lda)).val[0]; + float32x4_t rowA2 = vld4q_f32((A +2*lda)).val[0]; + float32x4_t rowA3 = vld4q_f32((A +3*lda)).val[0]; + } else { + float32x4_t rowA0 = vdupq_n_f32(0); + float32x4_t rowA1 = vdupq_n_f32(0); + float32x4_t rowA2 = vdupq_n_f32(0); + float32x4_t 
rowA3 = vdupq_n_f32(0); + + rowA0 = vld1q_lane_f32(A + 0 * lda + 0 * strideA, rowA0, 0); + rowA0 = vld1q_lane_f32(A + 0 * lda + 1 * strideA, rowA0, 1); + rowA0 = vld1q_lane_f32(A + 0 * lda + 2 * strideA, rowA0, 2); + rowA0 = vld1q_lane_f32(A + 0 * lda + 3 * strideA, rowA0, 3); + + rowA1 = vld1q_lane_f32(A + 1 * lda + 0 * strideA, rowA1, 0); + rowA1 = vld1q_lane_f32(A + 1 * lda + 1 * strideA, rowA1, 1); + rowA1 = vld1q_lane_f32(A + 1 * lda + 2 * strideA, rowA1, 2); + rowA1 = vld1q_lane_f32(A + 1 * lda + 3 * strideA, rowA1, 3); + + rowA2 = vld1q_lane_f32(A + 2 * lda + 0 * strideA, rowA2, 0); + rowA2 = vld1q_lane_f32(A + 2 * lda + 1 * strideA, rowA2, 1); + rowA2 = vld1q_lane_f32(A + 2 * lda + 2 * strideA, rowA2, 2); + rowA2 = vld1q_lane_f32(A + 2 * lda + 3 * strideA, rowA2, 3); + + rowA3 = vld1q_lane_f32(A + 3 * lda + 0 * strideA, rowA3, 0); + rowA3 = vld1q_lane_f32(A + 3 * lda + 1 * strideA, rowA3, 1); + rowA3 = vld1q_lane_f32(A + 3 * lda + 2 * strideA, rowA3, 2); + rowA3 = vld1q_lane_f32(A + 3 * lda + 3 * strideA, rowA3, 3); + } //4x4 transpose micro kernel float32x4x2_t t0,t1,t2,t3; @@ -272,27 +416,110 @@ struct micro_kernel //Load B if( !betaIsZero ) - { - float32x4_t rowB0 = vld1q_f32((B +0*ldb)); - float32x4_t rowB1 = vld1q_f32((B +1*ldb)); - float32x4_t rowB2 = vld1q_f32((B +2*ldb)); - float32x4_t rowB3 = vld1q_f32((B +3*ldb)); + { + float32x4_t rowB0, rowB1, rowB2, rowB3; + if (innerStrideB == 1) { + float32x4_t rowB0 = vld1q_f32((B +0*ldb)); + float32x4_t rowB1 = vld1q_f32((B +1*ldb)); + float32x4_t rowB2 = vld1q_f32((B +2*ldb)); + float32x4_t rowB3 = vld1q_f32((B +3*ldb)); + } else if (innerStrideB == 2) { + float32x4_t rowB0 = vld2q_f32((B +0*ldb)).val[0]; + float32x4_t rowB1 = vld2q_f32((B +1*ldb)).val[0]; + float32x4_t rowB2 = vld2q_f32((B +2*ldb)).val[0]; + float32x4_t rowB3 = vld2q_f32((B +3*ldb)).val[0]; + } else if (innerStrideB == 3) { + float32x4_t rowB0 = vld3q_f32((B +0*ldb)).val[0]; + float32x4_t rowB1 = vld3q_f32((B +1*ldb)).val[0]; + 
float32x4_t rowB2 = vld3q_f32((B +2*ldb)).val[0]; + float32x4_t rowB3 = vld3q_f32((B +3*ldb)).val[0]; + } else if (innerStrideB == 4) { + float32x4_t rowB0 = vld4q_f32((B +0*ldb)).val[0]; + float32x4_t rowB1 = vld4q_f32((B +1*ldb)).val[0]; + float32x4_t rowB2 = vld4q_f32((B +2*ldb)).val[0]; + float32x4_t rowB3 = vld4q_f32((B +3*ldb)).val[0]; + } else { + float32x4_t rowB0 = vdupq_n_f32(0); + float32x4_t rowB1 = vdupq_n_f32(0); + float32x4_t rowB2 = vdupq_n_f32(0); + float32x4_t rowB3 = vdupq_n_f32(0); + + rowB0 = vld1q_lane_f32(B + 0 * innerStrideB, rowB0, 0); + rowB0 = vld1q_lane_f32(B + 1 * innerStrideB, rowB0, 1); + rowB0 = vld1q_lane_f32(B + 2 * innerStrideB, rowB0, 2); + rowB0 = vld1q_lane_f32(B + 3 * innerStrideB, rowB0, 3); + + rowB1 = vld1q_lane_f32(B + 1 * ldb + 0 * innerStrideB, rowB1, 0); + rowB1 = vld1q_lane_f32(B + 1 * ldb + 1 * innerStrideB, rowB1, 1); + rowB1 = vld1q_lane_f32(B + 1 * ldb + 2 * innerStrideB, rowB1, 2); + rowB1 = vld1q_lane_f32(B + 1 * ldb + 3 * innerStrideB, rowB1, 3); + + rowB2 = vld1q_lane_f32(B + 2 * ldb + 0 * innerStrideB, rowB2, 0); + rowB2 = vld1q_lane_f32(B + 2 * ldb + 1 * innerStrideB, rowB2, 1); + rowB2 = vld1q_lane_f32(B + 2 * ldb + 2 * innerStrideB, rowB2, 2); + rowB2 = vld1q_lane_f32(B + 2 * ldb + 3 * innerStrideB, rowB2, 3); + + rowB3 = vld1q_lane_f32(B + 3 * ldb + 0 * innerStrideB, rowB3, 0); + rowB3 = vld1q_lane_f32(B + 3 * ldb + 1 * innerStrideB, rowB3, 1); + rowB3 = vld1q_lane_f32(B + 3 * ldb + 2 * innerStrideB, rowB3, 2); + rowB3 = vld1q_lane_f32(B + 3 * ldb + 3 * innerStrideB, rowB3, 3); + } rowB0 = vaddq_f32( vmulq_f32(rowB0, reg_beta), rowA0); rowB1 = vaddq_f32( vmulq_f32(rowB1, reg_beta), rowA1); rowB2 = vaddq_f32( vmulq_f32(rowB2, reg_beta), rowA2); rowB3 = vaddq_f32( vmulq_f32(rowB3, reg_beta), rowA3); //Store B - vst1q_f32((B + 0 * ldb), rowB0); - vst1q_f32((B + 1 * ldb), rowB1); - vst1q_f32((B + 2 * ldb), rowB2); - vst1q_f32((B + 3 * ldb), rowB3); + if (innerStrideB == 1) { + vst1q_f32((B + 0 * ldb), rowB0); 
+ vst1q_f32((B + 1 * ldb), rowB1); + vst1q_f32((B + 2 * ldb), rowB2); + vst1q_f32((B + 3 * ldb), rowB3); + } else { + float tmp[4]; + vst1q_f32(tmp, rowB0); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB] = tmp[i]; + } + vst1q_f32(tmp, rowB1); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 1 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB2); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 2 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB3); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 3 * ldb] = tmp[i]; + } + } } else { //Store B - vst1q_f32((B + 0 * ldb), rowA0); - vst1q_f32((B + 1 * ldb), rowA1); - vst1q_f32((B + 2 * ldb), rowA2); - vst1q_f32((B + 3 * ldb), rowA3); + if (innerStrideB == 1) { + vst1q_f32((B + 0 * ldb), rowA0); + vst1q_f32((B + 1 * ldb), rowA1); + vst1q_f32((B + 2 * ldb), rowA2); + vst1q_f32((B + 3 * ldb), rowA3); + } else { + float tmp[4]; + vst1q_f32(tmp, rowB0); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB] = tmp[i]; + } + vst1q_f32(tmp, rowB1); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 1 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB2); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 2 * ldb] = tmp[i]; + } + vst1q_f32(tmp, rowB3); + for (int i = 0; i < 4; ++i) { + B[i * innerStrideB + 3 * ldb] = tmp[i]; + } + } } } }; @@ -304,7 +531,7 @@ struct micro_kernel //template //struct micro_kernel //{ -// static void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta) +// static void execute(const float* __restrict__ A, const size_t lda, const size_t innerStrideA, float* __restrict__ B, const size_t ldb, const size_t innerStrideA, const float alpha ,const float beta) // { // vector float reg_alpha = vec_splats(alpha); // @@ -368,314 +595,537 @@ struct micro_kernel template -static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const size_t lda, int blockingA, - floatType* __restrict__ B, const size_t ldb, int 
blockingB, +static INLINE void macro_kernel_scalar(const floatType* __restrict__ A, const size_t lda, int blockingA, size_t innerStrideA, + floatType* __restrict__ B, const size_t ldb, int blockingB, size_t innerStrideB, const floatType alpha ,const floatType beta) { #ifdef DEBUG - assert( blockingA > 0 && blockingB > 0); + printf("Macro kernel: blockingA = %d with %zu, blockingB = %d with %zu\n", blockingA, lda, blockingB, ldb); + assert( blockingA >= 0 && blockingB >= 0); #endif if( betaIsZero ) for(int j=0; j < blockingA; ++j) - for(int i=0; i < blockingB; ++i) + for(int i=0; i < blockingB; ++i){ +#ifdef DEBUG + printf(" A[+%zu] = %e -> B[+%zu] = %e\n", (i * lda) + (j * innerStrideA), A[(i * lda) + (j * innerStrideA)], (i * innerStrideB) + (j * ldb), B[(i * innerStrideB) + (j * ldb)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]); + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(i * lda) + (j * innerStrideA)]); else - B[i + j * ldb] = alpha * A[i * lda + j]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(i * lda) + (j * innerStrideA)];} else for(int j=0; j < blockingA; ++j) - for(int i=0; i < blockingB; ++i) + for(int i=0; i < blockingB; ++i){ +#ifdef DEBUG + printf(" A[+%zu] = %e -> B[+%zu] = %e\n", (i * lda) + (j * innerStrideA), A[(i * lda) + (j * innerStrideA)], i + (j * ldb), B[(i * innerStrideB) + (j * ldb)]); +#endif if( conjA ) - B[i + j * ldb] = alpha * conj(A[i * lda + j]) + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * conj(A[(i * lda) + (j * innerStrideA)]) + beta * B[(i * innerStrideB) + (j * ldb)]; else - B[i + j * ldb] = alpha * A[i * lda + j] + beta * B[i + j * ldb]; + B[(i * innerStrideB) + (j * ldb)] = alpha * A[(i * lda) + (j * innerStrideA)] + beta * B[(i * innerStrideB) + (j * ldb)];} } template -static INLINE void macro_kernel(const floatType* __restrict__ A, const floatType* __restrict__ Anext, const size_t lda, - floatType* __restrict__ B, const floatType* __restrict__ Bnext, const 
size_t ldb, +static INLINE void macro_kernel(const floatType* __restrict__ A, const floatType* __restrict__ Anext, const size_t lda, size_t innerStrideA, + floatType* __restrict__ B, const floatType* __restrict__ Bnext, const size_t ldb, size_t innerStrideB, const floatType alpha ,const floatType beta) { constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); constexpr int blocking_ = blocking_micro_ * 4; +#ifdef DEBUG + printf("Macro kernel (%d): blockingA = %d with %zu, blockingB = %d with %zu\n", blocking_, blockingA, lda, blockingB, ldb); +#endif const bool useStreamingStores = useStreamingStores_ && betaIsZero && (blockingB*sizeof(floatType))%64 == 0 && ((uint64_t)B)%32 == 0 && (ldb*sizeof(floatType))%32 == 0; +#ifdef DEBUG + printf("Macro kernel using streaming stores? %d\n", useStreamingStores); +#endif + floatType *Btmp = B; size_t ldb_tmp = ldb; floatType buffer[blockingA * blockingB];// __attribute__((aligned(64))); - if( (useStreamingStores_ && useStreamingStores) ){ + if( (useStreamingStores_ && useStreamingStores && innerStrideB == 1) ){ Btmp = buffer; ldb_tmp = blockingB; } if( blockingA == blocking_ && blockingB == blocking_ ) { - if( !(useStreamingStores_ && useStreamingStores) ) + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 0), lda); - micro_kernel::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB *0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + 
(innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB *0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp); - prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB * 2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB * 2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA * 0)), lda, (0 * ldb_tmp + (innerStrideB *3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA * 0)), lda, innerStrideA, Btmp 
+ (0 * ldb_tmp + (innerStrideB *3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA *blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB *0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA *blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB *0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB * blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - micro_kernel::execute(A + (2*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (3*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2* 
blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB *2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB *2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA * blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (0 * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + 
printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (2*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - prefetch(Anext + (2*blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - prefetch(Anext + (3*blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (2*blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (3*blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + 
(3*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3* blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (3*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); +#ifdef DEBUG + 
printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2* blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3* blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); }else if( blockingA == 2*blocking_micro_ && blockingB == blocking_ ) { - if( !(useStreamingStores_ && useStreamingStores) ) + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 0), lda); - micro_kernel::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA *0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA *0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro 
kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp); - prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (2*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (3*blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (2*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (3*blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp); - 
micro_kernel::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp); - micro_kernel::execute(A + (2*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 2*blocking_micro_), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (3*blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 3*blocking_micro_), ldb_tmp , alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + 
(innerStrideB*2*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); }else if( blockingA == blocking_ && blockingB == 2*blocking_micro_) { - if( !(useStreamingStores_ && useStreamingStores) ) + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (0 * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 0), lda); - micro_kernel::execute(A + (0 * lda + 0), lda, Btmp + (0 * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 0), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 0), lda, Btmp + (0 * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 0), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*0)), lda, innerStrideA, Btmp + (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ 
&& useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + blocking_micro_), lda, Btmp + (blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, innerStrideA, Btmp + (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (0 * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); - micro_kernel::execute(A + (blocking_micro_ * lda + 2*blocking_micro_), lda, Btmp + (2*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); - if( !(useStreamingStores_ && useStreamingStores) ) + if (innerStrideA == 1) prefetch(Anext + (0 * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, 
B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); + if (innerStrideA == 1) prefetch(Anext + (blocking_micro_ * lda + 2*blocking_micro_), lda); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, innerStrideA, Btmp + (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); + if( !(useStreamingStores_ && useStreamingStores) && innerStrideB == 1 ) prefetch(Bnext + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp); - micro_kernel::execute(A + (0 * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + 0), ldb_tmp , alpha , beta); - micro_kernel::execute(A + (blocking_micro_ * lda + 3*blocking_micro_), lda, Btmp + (3*blocking_micro_ * ldb_tmp + blocking_micro_), ldb_tmp , alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (0 * lda + (innerStrideA*3*blocking_micro_)), lda, innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp, innerStrideB, alpha , beta); +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, 
innerStrideA, Btmp + (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp, innerStrideB, alpha , beta); } else { //invoke micro-transpose - if(blockingA > 0 && blockingB > 0 ) - micro_kernel::execute(A, lda, Btmp, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > 0 ) { +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A, lda, innerStrideA, Btmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 0 && blockingB > blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ * lda, lda, Btmp + blocking_micro_, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > blocking_micro_ ) { +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 0 && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 0 && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_, ldb_tmp , alpha , beta); + if(blockingA > 0 && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro 
kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*0)), lda, (0 * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_), ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > 0 ) - micro_kernel::execute(A + blocking_micro_, lda, Btmp + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > 0 ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_), lda, innerStrideA, Btmp + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: 
A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > blocking_micro_ && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > blocking_micro_ && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*blocking_micro_)), lda, (blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > 0 ) - micro_kernel::execute(A + 2*blocking_micro_, lda, Btmp + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 2*blocking_micro_ && blockingB > 0 ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_), lda, innerStrideA, Btmp + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , 
beta); + if(blockingA > 2*blocking_micro_ && blockingB > blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 2*blocking_micro_ && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + 2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 2*blocking_micro_ && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + 2*blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + 2*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 2*blocking_micro_ && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*2*blocking_micro_)), lda, (2*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*2*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + 
2*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > 0 ) - micro_kernel::execute(A + 3*blocking_micro_, lda, Btmp + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > 0 ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (0 * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*0)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*3*blocking_micro_), lda, innerStrideA, Btmp + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ + blocking_micro_ * lda, lda, Btmp + blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*3*blocking_micro_) + blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > 2*blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ + 2*blocking_micro_ * lda, lda, Btmp + 2*blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > 2*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (2*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*2*blocking_micro_)), ldb_tmp); +#endif + 
micro_kernel::execute(A + (innerStrideA*3*blocking_micro_) + 2*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*2*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} //invoke micro-transpose - if(blockingA > 3*blocking_micro_ && blockingB > 3*blocking_micro_ ) - micro_kernel::execute(A + 3*blocking_micro_ + 3*blocking_micro_ * lda, lda, Btmp + 3*blocking_micro_ + 3*blocking_micro_ * ldb_tmp, ldb_tmp , alpha , beta); + if(blockingA > 3*blocking_micro_ && blockingB > 3*blocking_micro_ ){ +#ifdef DEBUG + printf("Micro kernel: A[+%zu] lda = %zu, B[+%zu] ldb = %zu\n", (3*blocking_micro_ * lda + (innerStrideA*3*blocking_micro_)), lda, (3*blocking_micro_ * ldb_tmp + (innerStrideB*3*blocking_micro_)), ldb_tmp); +#endif + micro_kernel::execute(A + (innerStrideA*3*blocking_micro_) + 3*blocking_micro_ * lda, lda, innerStrideA, Btmp + (innerStrideB*3*blocking_micro_) + 3*blocking_micro_ * ldb_tmp, ldb_tmp, innerStrideB, alpha , beta);} } // write buffer to main-memory via non-temporal stores - if( (useStreamingStores_ && useStreamingStores) ) + if( (useStreamingStores_ && useStreamingStores && innerStrideB == 1) ) for( int i = 0; i < blockingA; i++){ for( int j = 0; j < blockingB; j+=blocking_micro_) - streamingStore(B + i * ldb + j, buffer + i * ldb_tmp + j); + streamingStore(B + (i * ldb) + (j * innerStrideB), buffer + i * ldb_tmp + j); } } template -void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A, - floatType* __restrict__ B, int sizeStride1B, const floatType alpha, const floatType beta, const ComputeNode* plan) +void transpose_int_scalar( const floatType* __restrict__ A, int sizeStride1A, size_t innerStrideA, + floatType* __restrict__ B, int sizeStride1B, size_t innerStrideB, const floatType alpha, const floatType beta, const ComputeNode* plan) { const int32_t end = plan->end; const size_t lda = plan->lda; const size_t ldb = plan->ldb; + const int offDiffAB = plan->offDiffAB; if( plan->next->next 
!= nullptr ){ // recurse int i = plan->start; - if( lda == 1) - transpose_int_scalar( &A[i*lda], end - plan->start, &B[i*ldb], sizeStride1B, alpha, beta, plan->next); - else if( ldb == 1) - transpose_int_scalar( &A[i*lda], sizeStride1A, &B[i*ldb], end - plan->start, alpha, beta, plan->next); +#ifdef DEBUG + printf("----[SCALAR]Pointing to deeper level, Shifting A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start); +#endif + if( plan->indexA ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], end - plan->start, innerStrideA, &B[i*ldb], sizeStride1B, innerStrideB, alpha, beta, plan->next); + else if( plan->indexB ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], sizeStride1A, innerStrideA, &B[i*ldb], end - plan->start, innerStrideB, alpha, beta, plan->next); else - for(; i < end; i++) - transpose_int_scalar( &A[i*lda], sizeStride1A, &B[i*ldb], sizeStride1B, alpha, beta, plan->next); + for(; i < end; i++){ +#ifdef DEBUG + printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start); +#endif + transpose_int_scalar( &A[(i+offDiffAB)*lda], sizeStride1A, innerStrideA, &B[i*ldb], sizeStride1B, innerStrideB, alpha, beta, plan->next);} }else{ // macro-kernel const size_t lda_macro = plan->next->lda; const size_t ldb_macro = plan->next->ldb; int i = plan->start; const size_t scalarRemainder = plan->end - plan->start; +#ifdef DEBUG + printf("----[SCALAR]Sending to macro-kernel (Remainder: %zu), Shifting A[+%zu] lda - %zu, B[+%zu] ldb - %zu\n", scalarRemainder, (i+offDiffAB)*lda, lda, i*ldb, ldb); +#endif if( scalarRemainder > 0 ){ - if( lda == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, scalarRemainder, &B[i*ldb], ldb_macro, sizeStride1B, alpha, beta); - else if( ldb == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, sizeStride1A, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta); + if( plan->indexA ) + 
macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, innerStrideA, &B[i*ldb], ldb_macro, sizeStride1B, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, innerStrideA, &B[i*ldb], ldb_macro, scalarRemainder, innerStrideB, alpha, beta); + else + for(; i < end; i++){ +#ifdef DEBUG + printf("--------[SCALAR]Repeat %zu of %zu, A[+%zu], B[+%zu], repeats (%zu, %zu) = %zu\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb, lda, ldb, end - plan->start); +#endif + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, sizeStride1A, innerStrideA, &B[i*ldb], ldb_macro, sizeStride1B, innerStrideB, alpha, beta);} } } } template -void transpose_int( const floatType* __restrict__ A, const floatType* __restrict__ Anext, - floatType* __restrict__ B, const floatType* __restrict__ Bnext, const floatType alpha, const floatType beta, +void transpose_int( const floatType* __restrict__ A, const floatType* __restrict__ Anext, size_t innerStrideA, + floatType* __restrict__ B, const floatType* __restrict__ Bnext, size_t innerStrideB, const floatType alpha, const floatType beta, const ComputeNode* plan) { const int32_t end = plan->end - (plan->inc - 1); const int32_t inc = plan->inc; const size_t lda = plan->lda; const size_t ldb = plan->ldb; + const int offDiffAB = plan->offDiffAB; constexpr int blocking_micro_ = REGISTER_BITS/8 / sizeof(floatType); constexpr int blocking_ = blocking_micro_ * 4; if( plan->next->next != nullptr ){ +#ifdef DEBUG + printf("Pointing to deeper level.\n"); +#endif // recurse int32_t i; for(i = plan->start; i < end; i+= inc) { - if( i + inc < end ) - transpose_int( &A[i*lda], &A[(i+1)*lda], &B[i*ldb], &B[(i+1)*ldb], alpha, beta, plan->next); - else - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start); 
+#endif + if( i + inc < end ) { + transpose_int( &A[(i+offDiffAB)*lda], &A[(i+1+offDiffAB)*lda], innerStrideA, &B[i*ldb], &B[(i+1)*ldb], innerStrideB, alpha, beta, plan->next); + } else if( i == plan->start || i + inc >= end ) { + transpose_int( &A[(i+offDiffAB)*lda], &A[(i+offDiffAB)*lda], innerStrideA, &B[i*ldb], &B[i*ldb], innerStrideB, alpha, beta, plan->next); + } else { + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); + } } // remainder if( blocking_/2 >= blocking_micro_ && (i + blocking_/2) <= plan->end ){ - if( lda == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); - else if( ldb == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); +#ifdef DEBUG + printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); + else if( plan->indexB ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); i+=blocking_/2; } if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){ - if( lda == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); - else if( ldb == 1) - transpose_int( &A[i*lda], Anext, &B[i*ldb], Bnext, alpha, beta, plan->next); +#ifdef DEBUG + printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, beta, plan->next); + else if( plan->indexB ) + transpose_int( &A[(i+offDiffAB)*lda], Anext, innerStrideA, &B[i*ldb], Bnext, innerStrideB, alpha, 
beta, plan->next); i+=blocking_/4; } const size_t scalarRemainder = plan->end - i; if( scalarRemainder > 0 ){ - if( lda == 1) - transpose_int_scalar( &A[i*lda], scalarRemainder, &B[i*ldb], -1, alpha, beta, plan->next); +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); + printf("---- Passing to scalar. Blocking_ to %zu (blA = %d, blB = %d) where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, blockingA, blockingB, lda, ldb); +#endif + if( plan->indexA ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], scalarRemainder, innerStrideA, &B[i*ldb], blockingB, innerStrideB, alpha, beta, plan->next); + else if ( plan->indexB ) + transpose_int_scalar( &A[(i+offDiffAB)*lda], blockingA, innerStrideA, &B[i*ldb], scalarRemainder, innerStrideB, alpha, beta, plan->next); else - transpose_int_scalar( &A[i*lda], -1, &B[i*ldb], scalarRemainder, alpha, beta, plan->next); + transpose_int_scalar( &A[(i+offDiffAB)*lda], blockingA, innerStrideA, &B[i*ldb], blockingB, innerStrideB, alpha, beta, plan->next); } } else { const size_t lda_macro = plan->next->lda; const size_t ldb_macro = plan->next->ldb; // invoke macro-kernel - +#ifdef DEBUG + printf("Sending to macro-kernel.\n"); +#endif int32_t i; for(i = plan->start; i < end; i+= inc) - if( i + inc < end ) - macro_kernel(&A[i*lda], &A[(i+1)*lda], lda_macro, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, alpha, beta); - else - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); + { +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]; repeat %zu of %zu\n", (i+offDiffAB)*lda, i*ldb, i - plan->start, end - plan->start); + printf("--------- Case A: %d < %d, Case B: %d == %zu or %d > %d, else Case C\n", i + inc, end, i, plan->start, i + inc, end); +#endif + if( i + inc < end ) { + macro_kernel(&A[(i+offDiffAB)*lda], &A[(i+offDiffAB+1)*lda], lda_macro, innerStrideA, &B[i*ldb], &B[(i+1)*ldb], ldb_macro, innerStrideB, alpha, beta); + } else if( i == plan->start || i + inc > 
end ) { + break; // Things land here when the last two levels contain indexA and indexB separately. + } else { + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); + } + } // remainder if( blocking_/2 >= blocking_micro_ && (i + blocking_/2) <= plan->end ){ - if( lda == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); - else if( ldb == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); +#ifdef DEBUG + printf("---- Half Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); i+=blocking_/2; } if( blocking_/4 >= blocking_micro_ && (i + blocking_/4) <= plan->end ){ - if( lda == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); - else if( ldb == 1) - macro_kernel(&A[i*lda], Anext, lda_macro, &B[i*ldb], Bnext, ldb_macro, alpha, beta); +#ifdef DEBUG + printf("---- Quarter Blocking_ where leading dimension is 1 - A -> %zu, B -> %zu\n", lda, ldb); + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); +#endif + if( plan->indexA ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel(&A[(i+offDiffAB)*lda], Anext, lda_macro, innerStrideA, &B[i*ldb], Bnext, ldb_macro, innerStrideB, alpha, beta); i+=blocking_/4; } const size_t scalarRemainder = plan->end - i; if( scalarRemainder > 0 ){ - if( lda == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, scalarRemainder, 
&B[i*ldb], ldb_macro, blockingB, alpha, beta); - else if( ldb == 1) - macro_kernel_scalar(&A[i*lda], lda_macro, blockingA, &B[i*ldb], ldb_macro, scalarRemainder, alpha, beta); +#ifdef DEBUG + printf("|___Shifting A[+%zu], B[+%zu]\n", (i+offDiffAB)*lda, i*ldb); + printf("---- Passing to scalar. Blocking_ to %zu where leading dimension is 1 - A -> %zu, B -> %zu\n", plan->end - i, lda, ldb); +#endif + if( plan->indexA ) + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, scalarRemainder, innerStrideA, &B[i*ldb], ldb_macro, blockingB, innerStrideB, alpha, beta); + else if( plan->indexB ) + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, blockingA, innerStrideA, &B[i*ldb], ldb_macro, scalarRemainder, innerStrideB, alpha, beta); + else + macro_kernel_scalar(&A[(i+offDiffAB)*lda], lda_macro, blockingA, innerStrideA, &B[i*ldb], ldb_macro, blockingB, innerStrideB, alpha, beta); } } } @@ -688,36 +1138,48 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r constexpr int32_t inc = 1; // TODO const size_t lda = plan->lda; const size_t ldb = plan->ldb; + const int offDiffAB = plan->offDiffAB; if( plan->next != nullptr ) - for(int i = plan->start; i < end; i+= inc) + for(int i = plan->start; i < end; i+= inc) { // recurse - transpose_int_constStride1( &A[i*lda], &B[i*ldb], alpha, beta, plan->next); - else +#ifdef DEBUG + printf("[CONST](Loop %zu of %zu)Shifting A[+%zu], B[+%zu]\n", i - plan->start, end - plan->start, (i+offDiffAB)*lda, i*ldb); +#endif + transpose_int_constStride1( &A[(i+offDiffAB)*lda], &B[i*ldb], alpha, beta, plan->next); + } + else if( !betaIsZero ) { - for(int32_t i = plan->start; i < end; i+= inc) +#ifdef DEBUG + printf("[CONST]Setting A[+%zu], B[+%zu] (from %zu to %d)\n", (plan->start+offDiffAB), plan->start, plan->start, end); +#endif + for(int32_t i = plan->start; i < end; i+= inc){ if( conjA ) - B[i] = alpha * conj(A[i]) + beta * B[i]; + B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]) + beta * B[i*ldb]; else - B[i] 
= alpha * A[i] + beta * B[i]; + B[i*ldb] = alpha * A[(i+offDiffAB)*lda] + beta * B[i*ldb]; + } } else { +#ifdef DEBUG + printf("[CONST]Setting A[+%zu], B[+%zu] (from %zu to %d)\n", (plan->start+offDiffAB), plan->start, plan->start, end); +#endif if( useStreamingStores) if( conjA ) #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * conj(A[i]); + B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]); else #pragma vector nontemporal for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * A[i]; + B[i*ldb] = alpha * A[(i+offDiffAB)*lda]; else if( conjA ) for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * conj(A[i]); + B[i*ldb] = alpha * conj(A[(i+offDiffAB)*lda]); else for(int32_t i = plan->start; i < end; i+= inc) - B[i] = alpha * A[i]; + B[i*ldb] = alpha * A[(i+offDiffAB)*lda]; } } @@ -727,6 +1189,10 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r const int *perm, const int *outerSizeA, const int *outerSizeB, + const int *offsetA, + const int *offsetB, + const size_t innerStrideA, + const size_t innerStrideB, const int dim, const floatType *A, const floatType alpha, @@ -741,6 +1207,8 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r alpha_(alpha), beta_(beta), dim_(-1), + innerStrideA_(-1), + innerStrideB_(-1), numThreads_(numThreads), masterPlan_(nullptr), selectionMethod_(selectionMethod), @@ -756,13 +1224,17 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r int tmpSizeA[dim]; int tmpOuterSizeA[dim]; int tmpOuterSizeB[dim]; - accountForRowMajor(sizeA, outerSizeA, outerSizeB, perm, - tmpSizeA, tmpOuterSizeA, tmpOuterSizeB, tmpPerm, dim, useRowMajor); + int tmpOffsetA[dim]; + int tmpOffsetB[dim]; + accountForRowMajor(sizeA, outerSizeA, outerSizeB, offsetA, offsetB, perm, + tmpSizeA, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, tmpPerm, dim, useRowMajor); sizeA_.resize(dim); perm_.resize(dim); 
outerSizeA_.resize(dim); outerSizeB_.resize(dim); + offsetA_.resize(dim); + offsetB_.resize(dim); lda_.resize(dim); ldb_.resize(dim); if(threadIds){ @@ -775,10 +1247,13 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r threadIds_.push_back(i); } - verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, dim); + verifyParameter(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, innerStrideA, innerStrideB, dim); + + innerStrideA_ = innerStrideA; + innerStrideB_ = innerStrideB; // initializes dim_, outerSizeA, outerSizeB, sizeA and perm - skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, dim); + skipIndices(tmpSizeA, tmpPerm, tmpOuterSizeA, tmpOuterSizeB, tmpOffsetA, tmpOffsetB, dim); fuseIndices(); // initializes lda_ and ldb_ @@ -803,6 +1278,10 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r perm_(other.perm_), outerSizeA_(other.outerSizeA_), outerSizeB_(other.outerSizeB_), + offsetA_(other.offsetA_), + offsetB_(other.offsetB_), + innerStrideA_(other.innerStrideA_), + innerStrideB_(other.innerStrideB_), lda_(other.lda_), ldb_(other.ldb_), threadIds_(other.threadIds_), @@ -839,14 +1318,14 @@ void Transpose::executeEstimate(const Plan *plan) noexcept auto rootNode = plan->getRootNode_const( taskId ); if( std::abs(beta_) < getZeroThreshold() ) { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); } else { if( conjA_ ) - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); else - transpose_int( A_,A_, B_, B_, 0.0, 1.0, rootNode); + transpose_int( A_,A_,innerStrideA_, B_, B_,innerStrideB_, 0.0, 1.0, rootNode); } } else { auto rootNode = 
plan->getRootNode_const( taskId ); @@ -867,16 +1346,16 @@ void Transpose::executeEstimate(const Plan *plan) noexcept template -static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const floatType alpha, const floatType beta, int numThreads) +static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, const int myStart, const int myEnd, const int &offDiffAB_, const size_t lda, const size_t ldb, const floatType alpha, const floatType beta, int numThreads) { if( !betaIsZero ) { HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) - B[i] = alpha * conj(A[i]) + beta * B[i]; + B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]) + beta * B[i * ldb]; else - B[i] = alpha * A[i] + beta * B[i]; + B[i * ldb] = alpha * A[(i + offDiffAB_) * lda] + beta * B[i * ldb]; ) } else { if( useStreamingStores) @@ -884,55 +1363,55 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B, HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) - B[i] = alpha * conj(A[i]); + B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]); else - B[i] = alpha * A[i]; + B[i * ldb] = alpha * A[(i + offDiffAB_) * lda]; ) else HPTT_DUPLICATE(spawnThreads, for(int32_t i = myStart; i < myEnd; i++) if( conjA ) - B[i] = alpha * conj(A[i]); + B[i * ldb] = alpha * conj(A[(i + offDiffAB_) * lda]); else - B[i] = alpha * A[i]; + B[i * ldb] = alpha * A[(i + offDiffAB_) * lda]; ) } } template -static void axpy_2D( const floatType* __restrict__ A, const int lda, - floatType* __restrict__ B, const int ldb, - const int n0, const int myStart, const int myEnd, const floatType alpha, const floatType beta, int numThreads) +static void axpy_2D( const floatType* __restrict__ A, const size_t (&lda)[2], + floatType* __restrict__ B, const size_t (&ldb)[2], + const int n0, const int myStart, const int myEnd, const int (&offDiffAB_)[2], const int offsetB_, const 
floatType alpha, const floatType beta, int numThreads) { if( !betaIsZero ) { HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) - for(int32_t i = 0; i < n0; i++) + for(int32_t i = offsetB_; i < n0 + offsetB_; i++) if( conjA ) - B[i + j * ldb] = alpha * conj(A[i + j * lda]) + beta * B[i + j * ldb]; + B[(i * ldb[0]) + j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]) + beta * B[(i * ldb[0]) + j * ldb[1]]; else - B[i + j * ldb] = alpha * A[i + j * lda] + beta * B[i + j * ldb]; + B[(i * ldb[0]) + j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]] + beta * B[(i * ldb[0]) + j * ldb[1]]; ) } else { if( useStreamingStores) HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) _Pragma("vector nontemporal") - for(int32_t i = 0; i < n0; i++) + for(int32_t i = offsetB_; i < n0 + offsetB_; i++) if( conjA ) - B[i + j * ldb] = alpha * conj(A[i + j * lda]); + B[(i * ldb[0])+ j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]); else - B[i + j * ldb] = alpha * A[i + j * lda]; + B[(i * ldb[0])+ j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]; ) else HPTT_DUPLICATE(spawnThreads, for(int32_t j = myStart; j < myEnd; j++) - for(int32_t i = 0; i < n0; i++) + for(int32_t i = offsetB_; i < n0 + offsetB_; i++) if( conjA ) - B[i + j * ldb] = alpha * conj(A[i + j * lda]); + B[(i * ldb[0]) + j * ldb[1]] = alpha * conj(A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]); else - B[i + j * ldb] = alpha * A[i + j * lda]; + B[(i * ldb[0]) + j * ldb[1]] = alpha * A[((i + offDiffAB_[0]) * lda[0]) + (j + offDiffAB_[1]) * lda[1]]; ) } } @@ -987,22 +1466,27 @@ void Transpose::execute_expert() noexcept int myStart = 0; int myEnd = 0; +#ifdef DEBUG + printf("Executing expert with dim = %d\n", dim_); +#endif if( dim_ == 1) { getStartEnd(sizeA_[0], myStart, myEnd); + const int offDiffAB_ = 
(int)offsetA_[0] - (int)offsetB_[0]; if( conjA_ ) - axpy_1D( A_, B_, myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_1D( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, lda_[0], ldb_[0], alpha_, beta_, numThreads_ ); else - axpy_1D( A_, B_, myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_1D( A_, B_, myStart + offsetB_[0], myEnd + offsetB_[0], offDiffAB_, lda_[0], ldb_[0], alpha_, beta_, numThreads_ ); return; } else if( dim_ == 2 && perm_[0] == 0) { getStartEnd(sizeA_[1], myStart, myEnd); + const int offDiffAB_[2] = {((int)offsetA_[0] - (int)offsetB_[0]), ((int)offsetA_[1] - (int)offsetB_[1])}; if( conjA_ ) - axpy_2D( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_2D( A_, {lda_[0], lda_[1]}, B_, {ldb_[0], ldb_[1]}, sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ ); else - axpy_2D( A_, lda_[1], B_, ldb_[1], sizeA_[0], myStart, myEnd, alpha_, beta_, numThreads_ ); + axpy_2D( A_, {lda_[0], lda_[1]}, B_, {ldb_[0], ldb_[1]}, sizeA_[0], myStart + offsetB_[1], myEnd + offsetB_[1], offDiffAB_, offsetB_[0], alpha_, beta_, numThreads_ ); return; } @@ -1015,9 +1499,9 @@ void Transpose::execute_expert() noexcept if ( perm_[0] != 0 ) { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_,innerStrideA_, B_, B_,innerStrideB_, alpha_, beta_, rootNode); else - transpose_int( A_, A_, B_, B_, alpha_, beta_, rootNode); + transpose_int( A_, A_,innerStrideA_, B_, B_,innerStrideB_, alpha_, beta_, rootNode); } else { auto rootNode = masterPlan_->getRootNode_const( taskId ); if( conjA_ ) @@ -1420,7 +1904,7 @@ void Transpose::getParallelismStrategies(std::vector } template -void Transpose::verifyParameter(const int *size, const int* perm, const int* outerSizeA, const int* outerSizeB, const int dim) const +void Transpose::verifyParameter(const int *size, const int* 
perm, const int* outerSizeA, const int* outerSizeB, const int* offsetA, const int* offsetB, const size_t innerStrideA, const size_t innerStrideB, const int dim) const { if ( dim < 1 ) { fprintf(stderr,"[HPTT] ERROR: dimensionality too low.\n"); exit(-1); } @@ -1457,12 +1941,36 @@ void Transpose::verifyParameter(const int *size, const int* perm, con fprintf(stderr,"[HPTT] ERROR: outerSizeB invalid\n"); exit(-1); } + + if ( offsetA != NULL ) + for(int i=0;i < dim ; ++i) + if ( offsetA[i] + size[i] > outerSizeA[i] ) { + fprintf(stderr,"[HPTT] ERROR: offsetA invalid\n"); + exit(-1); + } + + if ( offsetB != NULL ) + for(int i=0;i < dim ; ++i) + if ( offsetB[i] + size[perm[i]] > outerSizeB[i] ) { + fprintf(stderr,"[HPTT] ERROR: offsetB invalid\n"); + exit(-1); + } + + if ( innerStrideA < 1 ) { + fprintf(stderr,"[HPTT] ERROR: innerStrideA invalid\n"); + exit(-1); + } + + if ( innerStrideB < 1 ) { + fprintf(stderr,"[HPTT] ERROR: innerStrideB invalid\n"); + exit(-1); + } } template void Transpose::computeLeadingDimensions() { - lda_[0] = 1; + lda_[0] = innerStrideA_; if( outerSizeA_[0] == -1 ) for(int i=1;i < dim_ ; ++i) lda_[i] = lda_[i-1] * sizeA_[i-1]; @@ -1470,7 +1978,7 @@ void Transpose::computeLeadingDimensions() for(int i=1;i < dim_ ; ++i) lda_[i] = outerSizeA_[i-1] * lda_[i-1]; - ldb_[0] = 1; + ldb_[0] = innerStrideB_; if( outerSizeB_[0] == -1 ) for(int i=1;i < dim_ ; ++i) ldb_[i] = ldb_[i-1] * sizeA_[perm_[i-1]]; @@ -1480,7 +1988,7 @@ void Transpose::computeLeadingDimensions() } template -void Transpose::skipIndices(const int *sizeA, const int* perm, const int *outerSizeA, const int *outerSizeB, const int dim) +void Transpose::skipIndices(const int *sizeA, const int* perm, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int dim) { for(int i=0;i < dim ; ++i) { @@ -1494,6 +2002,14 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const outerSizeB_[i] = outerSizeB[i]; else outerSizeB_[i] = sizeA[perm[i]]; + if(offsetA) + 
offsetA_[i] = offsetA[i]; + else + offsetA_[i] = 0; + if(offsetB) + offsetB_[i] = offsetB[i]; + else + offsetB_[i] = 0; } int skipped = 0; @@ -1508,6 +2024,8 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const sizeA_[i] = -1; outerSizeA_[i] = -1; outerSizeB_[idxB] = -1; + offsetA_[i] = -1; + offsetB_[idxB] = -1; perm_[idxB] = -1; skipped++; } @@ -1530,8 +2048,10 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const for(;j < dim ; ++j) if( outerSizeA_[j] != -1 ) break; - if( j < dim ) + if( j < dim ) { std::swap(outerSizeA_[i], outerSizeA_[j]); + std::swap(offsetA_[i], offsetA_[j]); + } } for(int i=0;i < dim ; ++i) if( outerSizeB_[i] == -1 ) @@ -1540,8 +2060,10 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const for(;j < dim ; ++j) if( outerSizeB_[j] != -1 ) break; - if( j < dim ) + if( j < dim ) { std::swap(outerSizeB_[i], outerSizeB_[j]); + std::swap(offsetB_[i], offsetB_[j]); + } } for(int i=0;i < dim ; ++i) if( perm_[i] == -1 ) @@ -1565,11 +2087,15 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const sizeA_[0] = 1; outerSizeA_[0] = 1; outerSizeB_[0] = 1; + offsetA_[0] = 0; + offsetB_[0] = 0; } else { perm_.resize(dim_); sizeA_.resize(dim_); outerSizeA_.resize(dim_); outerSizeB_.resize(dim_); + offsetA_.resize(dim_); + offsetB_.resize(dim_); // remove gaps in the perm, if requried (e.g., perm=3,1,0 -> 2,1,0) int currentValue = 0; @@ -1593,6 +2119,8 @@ void Transpose::skipIndices(const int *sizeA, const int* perm, const printVector(sizeA_,"sizeA"); printVector(outerSizeA_,"outerSizeA"); printVector(outerSizeB_,"outerSizeB"); + printf("innerStrideA: %lu\n",innerStrideA_); + printf("innerStrideB: %lu\n",innerStrideB_); #endif } @@ -1630,11 +2158,13 @@ void Transpose::fuseIndices() sizeA_[std::get<0>(tup)] *= sizeA_[std::get<1>(tup)]; outerSizeA_[std::get<0>(tup)] *= outerSizeA_[std::get<1>(tup)]; outerSizeA_[std::get<1>(tup)] = -1; + offsetA_[std::get<1>(tup)] = -1; auto pos1 = 
std::find(perm_.begin(), perm_.end(), std::get<0>(tup)) - perm_.begin(); auto pos2 = std::find(perm_.begin(), perm_.end(), std::get<1>(tup)) - perm_.begin(); outerSizeB_[pos1] *= outerSizeB_[pos2]; outerSizeB_[pos2] = -1; + offsetB_[pos2] = -1; } if ( fusedIndices.size() > 0 ) { @@ -1668,8 +2198,10 @@ void Transpose::fuseIndices() for(;j < dim_ ; ++j) if( outerSizeA_[j] != -1 ) break; - if( j < dim_ ) + if( j < dim_ ) { std::swap(outerSizeA_[i], outerSizeA_[j]); + std::swap(offsetA_[i], offsetA_[j]); + } } for(int i=0;i < dim_ ; ++i) if( outerSizeB_[i] == -1 ) @@ -1678,12 +2210,16 @@ void Transpose::fuseIndices() for(;j < dim_ ; ++j) if( outerSizeB_[j] != -1 ) break; - if( j < dim_ ) + if( j < dim_ ) { std::swap(outerSizeB_[i], outerSizeB_[j]); + std::swap(offsetB_[i], offsetB_[j]); + } } dim_ -= fusedIndices.size(); outerSizeA_.resize(dim_); outerSizeB_.resize(dim_); + offsetA_.resize(dim_); + offsetB_.resize(dim_); sizeA_.resize(dim_); perm_.resize(dim_); @@ -1693,13 +2229,19 @@ void Transpose::fuseIndices() printf("%d ",perm_[i]); printf("\nsizes_new: "); for(int i=0;i < dim_ ; ++i) - printf("%d ",sizeA_[i]); + printf("%lu ",sizeA_[i]); printf("\nouterSizeA_new: "); for(int i=0;i < dim_ ; ++i) - printf("%d ",outerSizeA_[i]); + printf("%lu ",outerSizeA_[i]); printf("\nouterSizeB_new: "); for(int i=0;i < dim_ ; ++i) - printf("%d ",outerSizeB_[i]); + printf("%lu ",outerSizeB_[i]); + printf("\noffsetA_new: "); + for(int i=0;i < dim_ ; ++i) + printf("%lu ",offsetA_[i]); + printf("\noffsetB_new: "); + for(int i=0;i < dim_ ; ++i) + printf("%lu ",offsetB_[i]); printf("\n"); #endif } @@ -1951,11 +2493,17 @@ void Transpose::createPlans( std::vector > &pla const int commId = (taskIdComm/numThreadsPerComm); // = 0,0,1,1,2,2 taskIdComm = taskIdComm % numThreadsPerComm; // local taskId in next communicator // 0,1,0,1,0,1 - currentNode->start = std::min( sizeA_[index], commId * workPerThread * currentNode->inc ); - currentNode->end = std::min( sizeA_[index], (commId+1) * 
workPerThread * currentNode->inc ); + if (index == 0) currentNode->indexA = true; + if (findPos(index, perm_) == 0) currentNode->indexB = true; + currentNode->start = std::min( sizeA_[index] + offsetB_[findPos(index, perm_)], (commId * workPerThread * currentNode->inc) + offsetB_[findPos(index, perm_)] ); + currentNode->end = std::min( sizeA_[index] + offsetB_[findPos(index, perm_)], ((commId+1) * workPerThread * currentNode->inc) + offsetB_[findPos(index, perm_)] ); currentNode->lda = lda_[index]; currentNode->ldb = ldb_[findPos(index, perm_)]; + currentNode->offDiffAB = (int)offsetA_[index] - (int)offsetB_[findPos(index, perm_)]; +#ifdef DEBUG + printf("(Task %d, Node %p) Level %d is IndexA %d, IndexB %d. Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",taskId,currentNode,l,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB); +#endif if( perm_[0] != 0 || l != dim_-1 ){ currentNode->next = new ComputeNode; @@ -1966,12 +2514,17 @@ void Transpose::createPlans( std::vector > &pla //macro-kernel if( perm_[0] != 0 ) { + if (posStride1A_inB == 0) currentNode->indexB = true; currentNode->start = -1; currentNode->end = -1; currentNode->inc = -1; currentNode->lda = lda_[ posStride1B_inA ]; currentNode->ldb = ldb_[ posStride1A_inB ]; + currentNode->offDiffAB = (int)offsetA_[ posStride1B_inA ] - (int)offsetB_[ posStride1A_inB ]; currentNode->next = nullptr; +#ifdef DEBUG + printf(" Adjust Node (%p) IndexA: %d, IndexB: %d, Start: %zu, End: %zu, lda: %zu, ldb: %zu, offDiffAB: %d\n",currentNode,currentNode->indexA,currentNode->indexB,currentNode->start,currentNode->end,currentNode->lda,currentNode->ldb,currentNode->offDiffAB); +#endif } } plans.push_back(plan); @@ -2078,7 +2631,7 @@ std::shared_ptr Transpose::selectPlan( const std::vectorinfoLevel_ > 0 ) - printf("We evaluated %d/%d candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); + printf("We 
evaluated %d/%zu candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); } return plans[bestPlan_id]; } diff --git a/src/utils.cpp b/src/utils.cpp index 7742bce..abb29f8 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -46,8 +46,8 @@ int factorial(int n){ return n * factorial(n-1); } -void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *perm, - int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpPerm, const int dim, const bool useRowMajor) +void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *outerSizeB, const int *offsetA, const int *offsetB, const int *perm, + int *tmpSizeA, int *tmpOuterSizeA, int *tmpOuterSizeB, int *tmpOffsetA, int *tmpOffsetB, int *tmpPerm, const int dim, const bool useRowMajor) { for(int i=0; i < dim; ++i){ int idx = i; @@ -66,6 +66,14 @@ void accountForRowMajor(const int *sizeA, const int *outerSizeA, const int *oute tmpOuterSizeB[i] = sizeA[perm[idx]]; else tmpOuterSizeB[i] = outerSizeB[idx]; + if( offsetA == nullptr ) + tmpOffsetA[i] = 0; + else + tmpOffsetA[i] = offsetA[idx]; + if( offsetB == nullptr ) + tmpOffsetB[i] = 0; + else + tmpOffsetB[i] = offsetB[idx]; } } diff --git a/testframework/Makefile b/testframework/Makefile index 3e87780..38eca08 100644 --- a/testframework/Makefile +++ b/testframework/Makefile @@ -28,8 +28,11 @@ LIBS=-lhptt all: ${OBJ} ${CXX} ${OBJ} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} -o testframework.exe +LIB_PATH+= $(OPENMP_LIB_PATH) +LIBS+= -lomp + %.o: %.cpp - ${CXX} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ + ${CXX} ${LIB_PATH} ${LIBS} ${CXX_FLAGS} ${INCLUDE_PATH} -c $< -o $@ clean: rm -rf *.o benchmark.exe diff --git a/testframework/testframework.cpp b/testframework/testframework.cpp index bfc11d2..2b116ed 100644 --- a/testframework/testframework.cpp +++ b/testframework/testframework.cpp @@ -56,6 +56,9 @@ int equal_(const floatType *A, const floatType*B, int total_size){ } } } +#ifdef DEBUG + printf("\nNumber 
of Errors: %d of %d\n", error, total_size); +#endif return (error == 0) ? 1 : 0; } @@ -67,22 +70,41 @@ void restore(const floatType* A, floatType* B, size_t n) } template -static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, floatType &beta, +static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, + uint32_t *outerSizeA, uint32_t *outerSizeB, + uint32_t *offsetA, uint32_t *offsetB, + int &innerStrideA, int &innerStrideB, + floatType &beta, int &numThreads, - std::string &perm_str, - std::string &size_str, - const int total_size) + std::string &perm_str, std::string &size_str, + std::string &outerSizeA_str, std::string &offsetA_str, + std::string &outerSizeB_str, std::string &offsetB_str, + const int total_size, bool subTensors) { dim = (rand() % MAX_DIM) + 1; uint32_t maxSizeDim = std::max(1.0, std::pow(total_size, 1.0/dim)); std::vector perm_(dim); for(int i=0;i < dim ; ++i){ - size[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, 1.); + outerSizeA[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, 1.); + size[i] = outerSizeA[i]; + if (subTensors) + size[i] = std::max((((double)rand())/RAND_MAX) * outerSizeA[i], 1.); perm_[i] = i; } std::random_shuffle(perm_.begin(), perm_.end()); - for(int i=0;i < dim ; ++i) + for(int i=0;i < dim ; ++i) + { perm[i] = perm_[i]; + outerSizeB[i] = outerSizeA[perm[i]]; + offsetA[i] = 0; + offsetB[i] = 0; + if (subTensors) + { + outerSizeB[i] = std::max((((double)rand())/RAND_MAX) * maxSizeDim, (double)size[perm[i]]); + offsetA[i] = std::max((((double)rand())/RAND_MAX) * (outerSizeA[i] - size[i]), 0.); + offsetB[i] = std::max((((double)rand())/RAND_MAX) * (outerSizeB[i] - size[perm[i]]), 0.); + } + } numThreads = std::max(std::round((((double)rand())/RAND_MAX) * 24), 1.); if( rand() > RAND_MAX/2 ) @@ -90,29 +112,52 @@ static void getRandomTest(int &dim, uint32_t *perm, uint32_t *size, floatType &b else beta = 4.0; + // Provide a larger inner stride if the tensor is less than a integer factor of 
the total size + int ordinalSizeA = std::accumulate(outerSizeA, outerSizeA+dim, 1, std::multiplies()); + int ordinalSizeB = std::accumulate(outerSizeB, outerSizeB+dim, 1, std::multiplies()); + innerStrideA = (ordinalSizeA < (total_size / (dim*4))) ? std::max((((double)rand())/RAND_MAX) * dim, 1.) : 1; + innerStrideB = (ordinalSizeB < (total_size / (dim*4))) ? std::max((((double)rand())/RAND_MAX) * dim, 1.) : 1; + for(int i=0;i < dim ; ++i){ perm_str += std::to_string(perm[i]) + " "; size_str += std::to_string(size[i]) + " "; + outerSizeA_str += std::to_string(outerSizeA[i]) + " "; + offsetA_str += std::to_string(offsetA[i]) + " "; + outerSizeB_str += std::to_string(outerSizeB[i]) + " "; + offsetB_str += std::to_string(offsetB[i]) + " "; } printf("dim: %d\n", dim); printf("beta: %f\n", std::real(beta)); printf("perm: %s\n", perm_str.c_str()); printf("size: %s\n", size_str.c_str()); + printf("outerSizeA: %s\n", outerSizeA_str.c_str()); + printf("outerSizeB: %s\n", outerSizeB_str.c_str()); + printf("offsetA: %s\n", offsetA_str.c_str()); + printf("offsetB: %s\n", offsetB_str.c_str()); + printf("ordinalSizeA: %d\n", ordinalSizeA); + printf("ordinalSizeB: %d\n", ordinalSizeB); + printf("innerStrideA: %d\n", innerStrideA); + printf("innerStrideB: %d\n", innerStrideB); printf("numThreads: %d\n",numThreads); } template -void runTests() +void runTests(bool subTensors = false) { int numThreads = 1; floatType alpha = 2.; floatType beta = 4.; - //beta = 0; srand(time(NULL)); int dim; uint32_t perm[MAX_DIM]; uint32_t size[MAX_DIM]; + uint32_t outerSizeA[MAX_DIM]; + uint32_t outerSizeB[MAX_DIM]; + uint32_t offsetA[MAX_DIM]; + uint32_t offsetB[MAX_DIM]; + int innerStrideA = 2; + int innerStrideB = 2; size_t total_size = 128*1024*1024; // Allocating memory for tensors @@ -136,37 +181,68 @@ void runTests() B_ref[i] = B[i]; B_hptt[i] = B[i]; } + printf("Total size: %lu\n", total_size); + printf("Last element of A: %f\n", A[total_size-1]); + printf("Last element of B: %f\n", 
B[total_size-1]); for(int j=0; j < NUM_TESTS; ++j) { std::string perm_str = ""; std::string size_str = ""; - getRandomTest(dim, perm, size, beta, numThreads, perm_str, size_str, total_size); + std::string outerSizeA_str = ""; + std::string outerSizeB_str = ""; + std::string offsetA_str = ""; + std::string offsetB_str = ""; + std::cout<<"Test "<( size, perm, dim, A, alpha, B_ref, beta, false); - + transpose_ref(size, perm, dim, A, alpha, outerSizeA_, offsetA_, innerStrideA, B_ref, beta, outerSizeB_, offsetB_, innerStrideB, false); restore(B, B_hptt, total_size); plan->execute(); if( !equal_(B_ref, B_hptt, total_size) ) { fprintf(stderr, "Error in HPTT.\n"); - fprintf(stderr,"%d OMP_NUM_THREADS=%d ./benchmark.exe %d %s %s\n",sizeof(floatType), numThreads, dim, perm_str.c_str(), size_str.c_str()); + fprintf(stderr,"%lu OMP_NUM_THREADS=%d ./benchmark.exe %d %s %s\n",sizeof(floatType), numThreads, dim, perm_str.c_str(), size_str.c_str()); exit(-1); } + std::cout << "Test " << j << " passed." << std::endl; } + std::cout << "All tests passed." << std::endl; free(A); free(B); free(B_ref); @@ -177,15 +253,19 @@ int main(int argc, char *argv[]) { printf("float tests: \n"); runTests(); + runTests(true); printf("double tests: \n"); runTests(); + runTests(true); printf("float complex tests: \n"); runTests(); + runTests(true); printf("double complex tests: \n"); runTests(); + runTests(true); printf("[SUCCESS] All tests have passed.\n"); return 0;