diff --git a/CMakeLists.txt b/CMakeLists.txt
index 582ada3..eca64e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,30 +1,51 @@
 
-cmake_minimum_required(VERSION 3.7 FATAL_ERROR)
-project (HPTT C CXX)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project (HPTT VERSION 1.0.0 LANGUAGES C CXX)
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(ENABLE_IBM OFF)
+set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY ON)
+
+option(ENABLE_IBM "Build the IBM POWER (AltiVec) code path; forced ON below when the processor is ppc64le" OFF)
+option(ENABLE_OPENMP "Parallelize HPTT with OpenMP (finds and links OpenMP::OpenMP_CXX)" OFF)
+if(NOT CMAKE_BUILD_TYPE)
+    set (CMAKE_BUILD_TYPE Release)
+endif()
+
+include(CheckCXXCompilerFlag)
 
 if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
   set(ENABLE_IBM ON)
 endif()
 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-  set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qopenmp -xhost)
-elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  if(ENABLE_IBM)
-    set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp)
-  else()
-    set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp -march=native -mtune=native)
+
+if(DEFINED MARCH_FLAGS)
+  set(HPTT_ARCH_FLAGS ${MARCH_FLAGS})
+else()
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+    set(HPTT_ARCH_FLAGS -xhost)
+  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if(NOT ENABLE_IBM)
+      set(HPTT_ARCH_FLAGS -march=native) # -mtune=native
+    endif() 
+  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set(HPTT_ARCH_FLAGS -march=native)
+  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
+    set(HPTT_ARCH_FLAGS -silent -w -Mnovect)
+  # elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL")
+  #   set(HPTT_ARCH_FLAGS -qsmp=omp)
+  #elseif(CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    set(HPTT_ARCH_FLAGS -mcpu=native)
   endif()
-elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-  set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -fopenmp -march=native)
-elseif(CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
-  set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -silent -w -Mnovect)
-elseif(CMAKE_CXX_COMPILER_ID STREQUAL "XL")
-  set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -qsmp=omp)
+endif()
+
+check_cxx_compiler_flag("${HPTT_ARCH_FLAGS}" __COMPILER_SUPPORTS_MARCH)
+if(__COMPILER_SUPPORTS_MARCH)
+  set(HPTT_CXX_FLAGS "${HPTT_ARCH_FLAGS}")
 endif()
 
 if(ENABLE_AVX)
@@ -35,26 +56,80 @@ elseif(ENABLE_IBM)
   set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -mtune=native -DHPTT_ARCH_IBM -maltivec -mabi=altivec)
 endif()
 
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "8.2")
+  set(HPTT_CXX_FLAGS ${HPTT_CXX_FLAGS} -O2)
+endif()
+
 set(HPTT_SRCS src/hptt.cpp src/plan.cpp src/transpose.cpp src/utils.cpp)
 
-add_library(hptt STATIC ${HPTT_SRCS})
-target_compile_features(hptt PUBLIC cxx_std_11)
-target_include_directories(hptt PUBLIC ${PROJECT_SOURCE_DIR}/include)
-#target_compile_definitions(hptt PRIVATE ${HPTT_CXX_COMPILE_DEFS})
-target_compile_options(hptt PUBLIC ${HPTT_CXX_FLAGS})
+add_library(hptt ${HPTT_SRCS})
+
+add_library(hptt::hptt ALIAS hptt)
+
+include(GNUInstallDirs) # defines CMAKE_INSTALL_*DIR; must run before the INSTALL_INTERFACE path below is expanded
+
+target_include_directories(hptt
+    PUBLIC
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
+
+target_compile_options(hptt PRIVATE ${HPTT_CXX_FLAGS})
+
+if(ENABLE_OPENMP)
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(hptt PUBLIC OpenMP::OpenMP_CXX)
+endif()
+
+set_target_properties(hptt PROPERTIES EXPORT_NAME hptt)
+
+# Install
+set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/hptt)
 
 install(TARGETS hptt
-        LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
-
-set(HPTT_INCLUDES 
-    include/compute_node.h 
-    include/hptt_types.h 
-    include/hptt.h 
-    include/macros.h 
-    include/plan.h 
-    include/utils.h 
-    include/transpose.h)
-
-install(FILES ${HPTT_INCLUDES}
-        DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+    EXPORT hptt-targets
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+
+
+install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hptt)
+
+#Export the targets to a script
+install(EXPORT hptt-targets
+    FILE
+        hptt-targets.cmake
+    NAMESPACE
+        hptt::
+    DESTINATION
+        ${INSTALL_CONFIGDIR}
+)
+
+#Create a ConfigVersion.cmake file
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/hptt-config-version.cmake
+    VERSION ${PROJECT_VERSION}
+    COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_LIST_DIR}/cmake/hptt-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/hptt-config.cmake
+    INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
+    PATH_VARS CMAKE_INSTALL_INCLUDEDIR
+)
+
+#Install the generated package config and config-version files
+install(FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/hptt-config.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/hptt-config-version.cmake
+    DESTINATION ${INSTALL_CONFIGDIR}
+)
+
+
+export(EXPORT hptt-targets
+    FILE ${CMAKE_CURRENT_BINARY_DIR}/hptt-targets.cmake
+    NAMESPACE hptt::)
+
+#Register the build tree in the user's package registry (a no-op unless CMAKE_EXPORT_PACKAGE_REGISTRY is ON; see policy CMP0090)
+export(PACKAGE hptt)
diff --git a/cmake/hptt-config.cmake.in b/cmake/hptt-config.cmake.in
new file mode 100644
index 0000000..673e714
--- /dev/null
+++ b/cmake/hptt-config.cmake.in
@@ -0,0 +1,18 @@
+
+@PACKAGE_INIT@
+
+set(ENABLE_OPENMP @ENABLE_OPENMP@)
+
+if(ENABLE_OPENMP)
+    include(CMakeFindDependencyMacro)
+    find_dependency(OpenMP) # forwards QUIET/REQUIRED from the find_package(hptt) call
+endif()
+
+if(NOT TARGET hptt::hptt)
+    include("${CMAKE_CURRENT_LIST_DIR}/hptt-targets.cmake")
+endif()
+
+set(HPTT_FOUND TRUE)
+set(HPTT_LIBRARIES hptt::hptt)
+set(HPTT_INCLUDE_DIRS "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@")
+
diff --git a/include/compute_node.h b/include/compute_node.h
index b777857..a778f7c 100644
--- a/include/compute_node.h
+++ b/include/compute_node.h
@@ -15,11 +15,11 @@ class ComputeNode
             delete next;
       }
 
-   size_t start; //!< start index for at the current loop
-   size_t end; //!< end index for at the current loop
-   size_t inc; //!< increment for at the current loop
-   size_t lda; //!< stride of A w.r.t. the loop index
-   size_t ldb; //!< stride of B w.r.t. the loop index
+   int start; //!< start index for at the current loop
+   int end; //!< end index for at the current loop
+   int inc; //!< increment for at the current loop
+   int lda; //!< stride of A w.r.t. the loop index
+   int ldb; //!< stride of B w.r.t. the loop index
    ComputeNode *next; //!< next ComputeNode, this might be another loop or 'nullptr' (i.e., indicating that the macro-kernel should be called)
 };
 
diff --git a/include/hptt_types.h b/include/hptt_types.h
index 170288e..ebc5796 100644
--- a/include/hptt_types.h
+++ b/include/hptt_types.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <complex>
-#include <complex.h>
 
 #define REGISTER_BITS 256 // AVX
 #ifdef HPTT_ARCH_ARM
diff --git a/include/plan.h b/include/plan.h
index 2ff260d..27e6b3c 100644
--- a/include/plan.h
+++ b/include/plan.h
@@ -17,7 +17,7 @@ class ComputeNode;
 class Plan
 {
    public:
-      Plan() : rootNodes_(nullptr), numTasks_(0) { }
+      Plan() : numTasks_(0), rootNodes_(nullptr) { }
 
       Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop);
 
diff --git a/include/transpose.h b/include/transpose.h
index 82f0239..90dabae 100644
--- a/include/transpose.h
+++ b/include/transpose.h
@@ -252,12 +252,12 @@ class Transpose
       floatType alpha_; //!< scaling factor for A
       floatType beta_; //!< scaling factor for B
       int dim_; //!< dimension of the tensor
-      std::vector<size_t> sizeA_; //!< size of A
+      std::vector<int> sizeA_; //!< size of A
       std::vector<int> perm_; //!< permutation 
-      std::vector<size_t> outerSizeA_; //!< outer sizes of A
-      std::vector<size_t> outerSizeB_;  //!< outer sizes of B
-      std::vector<size_t> lda_;  //!< strides for all dimensions of A (first dimension has a stride of 1)
-      std::vector<size_t> ldb_;  //!< strides for all dimensions of B (first dimension has a stride of 1)
+      std::vector<int> outerSizeA_; //!< outer sizes of A
+      std::vector<int> outerSizeB_;  //!< outer sizes of B
+      std::vector<int> lda_;  //!< strides for all dimensions of A (first dimension has a stride of 1)
+      std::vector<int> ldb_;  //!< strides for all dimensions of B (first dimension has a stride of 1)
       std::vector<int> threadIds_; //!< OpenMP threadIds of the threads involed in the transposition
       int numThreads_;
       int selectedParallelStrategyId_;
diff --git a/include/utils.h b/include/utils.h
index a85b27c..3937f7b 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -16,6 +16,7 @@ template<typename floatType>
 static floatType conj(floatType x){
    return std::conj(x);
 }
+
 template<>
 float conj(float x){
    return x;
@@ -66,7 +67,7 @@ void getPrimeFactors( int n, std::list<int> &primeFactors );
 template<typename t>
 int findPos(t value, const std::vector<t> &array)
 {
-   for(int i=0;i < array.size() ; ++i)
+   for(size_t i = 0; i < array.size(); ++i)
       if( array[i] == value )
          return i;
    return -1;
diff --git a/src/hptt.cpp b/src/hptt.cpp
index 82d4e73..c3cafe0 100644
--- a/src/hptt.cpp
+++ b/src/hptt.cpp
@@ -180,8 +180,10 @@ void cTensorTranspose( const int *perm, const int dim,
                  const float _Complex beta,        float _Complex *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
+   const hptt::FloatComplex* calpha = reinterpret_cast<const hptt::FloatComplex*>(&alpha);
+   const hptt::FloatComplex* cbeta  = reinterpret_cast<const hptt::FloatComplex*>(&beta);
    auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, 
-                         (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+                         (const hptt::FloatComplex*) A, *calpha, (hptt::FloatComplex*) B,   *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
@@ -191,8 +193,10 @@ void zTensorTranspose( const int *perm, const int dim,
                  const double _Complex beta,        double _Complex *B,                   const int *outerSizeB, 
                  const int numThreads, const int useRowMajor)
 {
+  const hptt::DoubleComplex* calpha = reinterpret_cast<const hptt::DoubleComplex*>(&alpha);
+  const hptt::DoubleComplex* cbeta  = reinterpret_cast<const hptt::DoubleComplex*>(&beta);
    auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim, 
-                         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+                         (const hptt::DoubleComplex*) A, *calpha, (hptt::DoubleComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
diff --git a/src/plan.cpp b/src/plan.cpp
index 7c5b9bc..d1d5d13 100644
--- a/src/plan.cpp
+++ b/src/plan.cpp
@@ -6,7 +6,7 @@
 
 namespace hptt {
 
-   Plan::Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop) : rootNodes_(nullptr), loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop) {
+   Plan::Plan(std::vector<int>loopOrder, std::vector<int>numThreadsAtLoop) : loopOrder_(loopOrder), numThreadsAtLoop_(numThreadsAtLoop), rootNodes_(nullptr) {
       numTasks_ = 1;
       for(auto nt : numThreadsAtLoop)
          numTasks_ *= nt;
diff --git a/src/transpose.cpp b/src/transpose.cpp
index f77cd5b..03ec97e 100644
--- a/src/transpose.cpp
+++ b/src/transpose.cpp
@@ -704,11 +704,11 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
       } else {
          if( useStreamingStores)
             if( conjA )
-#pragma vector nontemporal
+// #pragma vector nontemporal
                for(int32_t i = plan->start; i < end; i+= inc)
                   B[i] = alpha * conj(A[i]);
             else
-#pragma vector nontemporal
+// #pragma vector nontemporal
                for(int32_t i = plan->start; i < end; i+= inc)
                   B[i] = alpha * A[i];
          else
@@ -742,12 +742,12 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
          beta_(beta),
          dim_(-1),
          numThreads_(numThreads), 
-         masterPlan_(nullptr),
-         selectionMethod_(selectionMethod),
-         maxAutotuningCandidates_(-1),
          selectedParallelStrategyId_(-1),
          selectedLoopOrderId_(-1),
-         conjA_(false)
+         conjA_(false),
+         masterPlan_(nullptr),
+         selectionMethod_(selectionMethod),
+         maxAutotuningCandidates_(-1)
       {
 #ifdef _OPENMP
          omp_init_lock(&writelock);
@@ -793,12 +793,6 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                                           alpha_(other.alpha_),
                                           beta_(other.beta_),
                                           dim_(other.dim_),
-                                          numThreads_(other.numThreads_),
-                                          masterPlan_(other.masterPlan_),
-                                          selectionMethod_(other.selectionMethod_),
-                                          selectedParallelStrategyId_(other.selectedParallelStrategyId_),
-                                          selectedLoopOrderId_(other.selectedLoopOrderId_),
-                                          maxAutotuningCandidates_(other.maxAutotuningCandidates_),
                                           sizeA_(other.sizeA_),
                                           perm_(other.perm_),
                                           outerSizeA_(other.outerSizeA_),
@@ -806,7 +800,14 @@ void transpose_int_constStride1( const floatType* __restrict__ A, floatType* __r
                                           lda_(other.lda_),
                                           ldb_(other.ldb_),
                                           threadIds_(other.threadIds_),
-                                          conjA_(other.conjA_)
+                                          numThreads_(other.numThreads_),
+                                          selectedParallelStrategyId_(other.selectedParallelStrategyId_),
+                                          selectedLoopOrderId_(other.selectedLoopOrderId_),
+                                          conjA_(other.conjA_),
+                                          masterPlan_(other.masterPlan_),
+                                          selectionMethod_(other.selectionMethod_),
+                                          maxAutotuningCandidates_(other.maxAutotuningCandidates_)
+
       { 
 #ifdef _OPENMP
          omp_init_lock(&writelock);
@@ -834,19 +835,21 @@ void Transpose<floatType>::executeEstimate(const Plan *plan) noexcept
 #ifdef _OPENMP
 #pragma omp parallel for num_threads(numThreads_)  if(numThreads_ > 1)
 #endif
+
+   const floatType* __restrict__ Bnext__ = B_;
    for( int taskId = 0; taskId < numTasks; taskId++)
       if ( perm_[0] != 0 ) {
          auto rootNode = plan->getRootNode_const( taskId );
          if( std::abs(beta_) < getZeroThreshold<floatType>() ) {
             if( conjA_ )
-               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, true>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, true>( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode);
             else
-               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, false>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,1,floatType, useStreamingStores, false>( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode);
          } else {
             if( conjA_ )
-               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, true>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, true>( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode);
             else
-               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, false>( A_,A_, B_, B_, 0.0, 1.0, rootNode);
+               transpose_int<blocking_,blocking_,0,floatType, useStreamingStores, false>( A_,A_, B_, Bnext__, 0.0, 1.0, rootNode);
          }
       } else {
          auto rootNode = plan->getRootNode_const( taskId );
@@ -880,7 +883,7 @@ static void axpy_1D( const floatType* __restrict__ A, floatType* __restrict__ B,
       )
    } else {
       if( useStreamingStores)
-#pragma vector nontemporal
+// #pragma vector nontemporal
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t i = myStart; i < myEnd; i++)
                if( conjA )
@@ -918,7 +921,7 @@ static void axpy_2D( const floatType* __restrict__ A, const int lda,
       if( useStreamingStores)
          HPTT_DUPLICATE(spawnThreads,
             for(int32_t j = myStart; j < myEnd; j++)
-_Pragma("vector nontemporal")
+// _Pragma("vector nontemporal")
             for(int32_t i = 0; i < n0; i++)
                if( conjA )
                   B[i + j * ldb] = alpha * conj(A[i + j * lda]);
@@ -1007,17 +1010,18 @@ void Transpose<floatType>::execute_expert() noexcept
    }
 
    const int numTasks = masterPlan_->getNumTasks();
-   const int numThreads = numThreads_;
+   [[maybe_unused]] const int numThreads = numThreads_; // NOTE(review): HPTT_DUPLICATE (macros.h, not part of this patch) appears to name a local `numThreads` in its OpenMP pragma -- confirm before removing
    getStartEnd<spawnThreads>(numTasks, myStart, myEnd);
 
+   const floatType* __restrict__ Bnext__ = B_;
    HPTT_DUPLICATE(spawnThreads,
       for( int taskId = myStart; taskId < myEnd; taskId++)
          if ( perm_[0] != 0 ) {
             auto rootNode = masterPlan_->getRootNode_const( taskId );
             if( conjA_ )
-               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, true>( A_, A_, B_, B_, alpha_, beta_, rootNode);
+               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, true>( A_, A_, B_, Bnext__, alpha_, beta_, rootNode);
             else
-               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, false>( A_, A_, B_, B_, alpha_, beta_, rootNode);
+               transpose_int<blocking_,blocking_,betaIsZero,floatType, useStreamingStores, false>( A_, A_, B_, Bnext__, alpha_, beta_, rootNode);
          } else {
             auto rootNode = masterPlan_->getRootNode_const( taskId );
             if( conjA_ )
@@ -1259,7 +1263,7 @@ void Transpose<floatType>::getBestParallelismStrategy ( std::vector<int> &bestPa
          float lb2 = getLoadBalance(strat2);
 //         printVector(strat2,"strat2");
 //         printf("strat2: %f\n",getLoadBalance(strat2));
-         if( lb1 > 0.8 && lb2 < 0.85 || lb1 >lb2 && lb1 > 0.75 )
+         if( (lb1 > 0.8 && lb2 < 0.85) || (lb1 > lb2 && lb1 > 0.75) )
          {
             std::copy(strat1.begin(), strat1.end(), bestParallelismStrategy.begin());
             return;
@@ -1514,11 +1518,11 @@ void Transpose<floatType>::skipIndices(const int *sizeA, const int* perm, const
    }
    // compact arrays (remove -1)
    for(int i=0;i < dim ; ++i)
-      if( sizeA_[i] == -1 )
+      if( (int)sizeA_[i] == -1 )
       {
          int j=i+1;
          for(;j < dim ; ++j)
-            if( sizeA_[j] != -1 )
+            if( (int)sizeA_[j] != -1 )
                break;
          if( j < dim )
             std::swap(sizeA_[i], sizeA_[j]);
@@ -1614,8 +1618,8 @@ void Transpose<floatType>::fuseIndices()
       int toMerge = i;
       perm.push_back(perm_[i]);
       while(i+1 < dim_ && perm_[i] + 1 == perm_[i+1] 
-            && (sizeA_[perm_[i]] == outerSizeA_[perm_[i]]) 
-            && (sizeA_[perm_[i]] == outerSizeB_[i]) ){ 
+            && ((int)sizeA_[perm_[i]] == outerSizeA_[perm_[i]]) 
+            && ((int)sizeA_[perm_[i]] == outerSizeB_[i]) ){ 
 #ifdef DEBUG
          fprintf(stderr,"[HPTT] MERGING indices %d and %d\n",perm_[i], perm_[i+1]); 
 #endif
@@ -1641,11 +1645,11 @@ void Transpose<floatType>::fuseIndices()
       perm_ = perm;
       // remove gaps in the perm, if requried (e.g., perm=3,1,0 -> 2,1,0)
       int currentValue = 0;
-      for(int i=0;i < perm_.size(); ++i){
+      for(size_t i = 0;i < perm_.size(); ++i){
          //find smallest element in perm_ and rename it to currentValue
          int minValue = 1000000;
          int minPos = -1;
-         for(int pos=0; pos < perm_.size(); ++pos){
+         for(int pos = 0; pos < (int)perm_.size(); ++pos){
             if ( perm_[pos] >= currentValue && perm_[pos] < minValue) {
                minValue = perm_[pos];
                minPos = pos;
@@ -1917,10 +1921,10 @@ void Transpose<floatType>::createPlans( std::vector<std::shared_ptr<Plan> > &pla
    // heuristics, search the space with a growing rectangle (from best to worst,
    // see line marked with ***)
    bool done = false;
-   for( int start= 0; start< std::max( parallelismStrategies.size(), loopOrders.size() ) && !done; start++ )
-      for( int i = 0; i < parallelismStrategies.size() && !done; i++)
+   for( size_t start= 0; start< std::max( parallelismStrategies.size(), loopOrders.size() ) && !done; start++ )
+      for( size_t i = 0; i < parallelismStrategies.size() && !done; i++)
       {
-         for( int j = 0; j < loopOrders.size() && !done; j++)
+         for( size_t j = 0; j < loopOrders.size() && !done; j++)
          {
             if( i > start || j > start || (i != start && j != start) ) continue; //these are already done ***
 
@@ -1975,10 +1979,10 @@ void Transpose<floatType>::createPlans( std::vector<std::shared_ptr<Plan> > &pla
                }
             }
             plans.push_back(plan);
-            if( selectionMethod_ == ESTIMATE || 
-                selectionMethod_ == MEASURE && plans.size() > 200 || 
-                selectionMethod_ == PATIENT && plans.size() > 400 || 
-                selectionMethod_ == CRAZY && plans.size() > 800 )
+            if( (selectionMethod_ == ESTIMATE) || 
+                (selectionMethod_ == MEASURE && plans.size() > 200) || 
+                (selectionMethod_ == PATIENT && plans.size() > 400) || 
+                (selectionMethod_ == CRAZY && plans.size() > 800) )
                done = true;
          }
       }
@@ -2078,7 +2082,7 @@ std::shared_ptr<Plan> Transpose<floatType>::selectPlan( const std::vector<std::s
          }
       }
       if( this->infoLevel_ > 0 )
-         printf("We evaluated %d/%d candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); 
+         printf("We evaluated %d/%zu candidates and selected candidate %d.\n", plansEvaluated, plans.size(), bestPlan_id); // %zu: plans.size() is a size_t
    }
    return plans[bestPlan_id];
 }