diff --git a/tests/ap/matmul/all_tuning_configs.h b/tests/ap/matmul/all_tuning_configs.h
index dca41fa..dfa70d1 100644
--- a/tests/ap/matmul/all_tuning_configs.h
+++ b/tests/ap/matmul/all_tuning_configs.h
@@ -6,483 +6,334 @@
 
 namespace ap {
 
-constexpr int kNumConfigsHalf = 23;
-constexpr int kNumConfigsFloat = 13;
-
-template <int SwizzleFactor, bool Batched> struct SwizzleWrapper {
-  using Type =
-      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<SwizzleFactor>;
-};
-
-// template <int SwizzleFactor>
-// struct SwizzleWrapper<SwizzleFactor, true> {
-//   using Type =
-//       cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;
-// };
-
-#define AP_AUTOTUNE_half(func, stream, ...)                                    \
-  {                                                                            \
-    using FuncType = decltype(func<0>);                                        \
-    static int selected_config_id = -1;                                        \
-    static std::vector<std::function<FuncType>> matmul_functions = {           \
-        func<0>,  func<1>,  func<2>,  func<3>,  func<4>,  func<5>,             \
-        func<6>,  func<7>,  func<8>,  func<9>,  func<10>, func<11>,            \
-        func<12>, func<13>, func<14>, func<15>, func<16>, func<17>,            \
-        func<18>, func<19>, func<20>, func<21>, func<22>};                     \
-    if (selected_config_id == -1) {                                            \
-      selected_config_id =                                                     \
-          ap::ProfileBestConfig(matmul_functions, stream, ##__VA_ARGS__);      \
-    }                                                                          \
-    matmul_functions[selected_config_id](__VA_ARGS__);                         \
-  }
-
-#define AP_AUTOTUNE_nv_bfloat16(func, stream, ...)                             \
-  AP_AUTOTUNE_half(func, stream, __VA_ARGS__)
-
-#define AP_AUTOTUNE_float(func, stream, ...)                                   \
-  {                                                                            \
-    using FuncType = decltype(func<0>);                                        \
-    static int selected_config_id = -1;                                        \
-    static std::vector<std::function<FuncType>> matmul_functions = {           \
-        func<0>, func<1>, func<2>, func<3>,  func<4>,  func<5>, func<6>,       \
-        func<7>, func<8>, func<9>, func<10>, func<11>, func<12>};              \
-    if (selected_config_id == -1) {                                            \
-      selected_config_id =                                                     \
-          ap::ProfileBestConfig(matmul_functions, stream, ##__VA_ARGS__);      \
-    }                                                                          \
-    matmul_functions[selected_config_id](__VA_ARGS__);                         \
-  }
-
-template <typename ElementT, int SwizzleFactor, bool Batched, int Id = 0>
-struct GemmTuningConfigs {
+template <typename T> struct ConfigsInfo {
+  static constexpr int kNumTotals = 23;
+};
+
+template <> struct ConfigsInfo<float> { static constexpr int kNumTotals = 13; };
+
+template <typename ElementT, int Id = 0> struct GemmTuningConfigs {
   using TShape = cutlass::gemm::GemmShape<256, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 2;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = Id;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 1> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 1> {
   using TShape = cutlass::gemm::GemmShape<64, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 1;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 2> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 2> {
   using TShape = cutlass::gemm::GemmShape<64, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 2;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 3> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 3> {
   using TShape = cutlass::gemm::GemmShape<128, 64, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 32, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 3;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 4> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 4> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 4;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 5> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 5> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 5;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 6> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 6> {
   using TShape = cutlass::gemm::GemmShape<256, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 6;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 7> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 7> {
   using TShape = cutlass::gemm::GemmShape<256, 64, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 7;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 8> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 8> {
   using TShape = cutlass::gemm::GemmShape<256, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 8;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 9> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 9> {
   using TShape = cutlass::gemm::GemmShape<256, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 9;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 10> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 10> {
   using TShape = cutlass::gemm::GemmShape<128, 32, 64>;
   using WShape = cutlass::gemm::GemmShape<32, 32, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 10;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 11> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 11> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 11;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 12> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 12> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 12;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 13> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 13> {
   using TShape = cutlass::gemm::GemmShape<256, 64, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 13;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 14> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 14> {
   using TShape = cutlass::gemm::GemmShape<256, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 14;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 15> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 15> {
   using TShape = cutlass::gemm::GemmShape<32, 64, 64>;
   using WShape = cutlass::gemm::GemmShape<16, 32, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 5;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 15;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 16> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 16> {
   using TShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using WShape = cutlass::gemm::GemmShape<32, 32, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 5;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 16;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 17> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 17> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 5;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 17;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 18> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 18> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 64>;
   using WShape = cutlass::gemm::GemmShape<64, 64, 64>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 5;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 18;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 19> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 19> {
   using TShape = cutlass::gemm::GemmShape<64, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 6;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 19;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 20> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 20> {
   using TShape = cutlass::gemm::GemmShape<128, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 32, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 6;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 20;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 21> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 21> {
   using TShape = cutlass::gemm::GemmShape<128, 32, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 32, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 7;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 21;
 };
 
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, 22> {
+template <typename ElementT> struct GemmTuningConfigs<ElementT, 22> {
   using TShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 32, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 16>;
   static constexpr int kNumStages = 10;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 22;
 };
 
 // Specialization for float
-template <int SwizzleFactor, bool Batched, int Id>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, Id> {
+template <int Id> struct GemmTuningConfigs<float, Id> {
   using TShape = cutlass::gemm::GemmShape<64, 64, 16>;
   using WShape = cutlass::gemm::GemmShape<32, 32, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = Id;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 1> {
+template <> struct GemmTuningConfigs<float, 1> {
   using TShape = cutlass::gemm::GemmShape<64, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 32, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 1;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 2> {
+template <> struct GemmTuningConfigs<float, 2> {
   using TShape = cutlass::gemm::GemmShape<64, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 2;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 3> {
+template <> struct GemmTuningConfigs<float, 3> {
   using TShape = cutlass::gemm::GemmShape<64, 256, 16>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 3;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 4> {
+template <> struct GemmTuningConfigs<float, 4> {
   using TShape = cutlass::gemm::GemmShape<64, 256, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 4;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 5> {
+template <> struct GemmTuningConfigs<float, 5> {
   using TShape = cutlass::gemm::GemmShape<128, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 32, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 5;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 6> {
+template <> struct GemmTuningConfigs<float, 6> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 16>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 6;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 7> {
+template <> struct GemmTuningConfigs<float, 7> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 32>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 7;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 8> {
+template <> struct GemmTuningConfigs<float, 8> {
   using TShape = cutlass::gemm::GemmShape<256, 64, 16>;
   using WShape = cutlass::gemm::GemmShape<64, 32, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 8;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 9> {
+template <> struct GemmTuningConfigs<float, 9> {
   using TShape = cutlass::gemm::GemmShape<256, 64, 32>;
   using WShape = cutlass::gemm::GemmShape<64, 32, 32>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 3;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 9;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 10> {
+template <> struct GemmTuningConfigs<float, 10> {
   using TShape = cutlass::gemm::GemmShape<64, 128, 16>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 10;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 11> {
+template <> struct GemmTuningConfigs<float, 11> {
   using TShape = cutlass::gemm::GemmShape<128, 64, 16>;
   using WShape = cutlass::gemm::GemmShape<64, 32, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 11;
 };
 
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, 12> {
+template <> struct GemmTuningConfigs<float, 12> {
   using TShape = cutlass::gemm::GemmShape<128, 128, 16>;
   using WShape = cutlass::gemm::GemmShape<32, 64, 16>;
   using IShape = cutlass::gemm::GemmShape<16, 8, 8>;
   static constexpr int kNumStages = 4;
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = 12;
 };
 
diff --git a/tests/ap/matmul/autotune.h b/tests/ap/matmul/autotune.h
new file mode 100644
index 0000000..376116b
--- /dev/null
+++ b/tests/ap/matmul/autotune.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include "all_tuning_configs.h"
+#include "default_config_id.h"
+#include "profile.h"
+
+namespace ap {
+
+template <typename Runner, SwizzleType ST, int... Is>
+auto GenerateFuncList(std::integer_sequence<int, Is...>) {
+  using FuncPtr = decltype(&Runner::template Apply<0, ST>);
+  return std::vector<FuncPtr>{&Runner::template Apply<Is, ST>...};
+}
+
+template <typename T, typename Runner, bool EnableStreamK, typename... Args>
+int RunWithAutotune(cudaStream_t stream, int config_id, Args &&...args) {
+#if AP_ENABLE_AUTOTUNE
+  int selected_config_id = config_id;
+
+  using FuncPtr = decltype(&Runner::template Apply<0, SwizzleType::kCommon>);
+  constexpr int N = ap::ConfigsInfo<T>::kNumTotals;
+
+  static std::vector<FuncPtr> matmul_functions;
+  static std::vector<FuncPtr> streamk_functions;
+
+  if (matmul_functions.empty()) {
+    matmul_functions = GenerateFuncList<Runner, SwizzleType::kCommon>(
+        std::make_integer_sequence<int, N>{});
+  }
+
+  if constexpr (EnableStreamK) {
+    if (streamk_functions.empty()) {
+      streamk_functions = GenerateFuncList<Runner, SwizzleType::kStreamK>(
+          std::make_integer_sequence<int, N>{});
+    }
+  }
+
+  if (selected_config_id == -1) {
+    selected_config_id = ap::ProfileBestConfig(matmul_functions, stream,
+                                               std::forward<Args>(args)...);
+    if constexpr (EnableStreamK) {
+      std::vector<FuncPtr> mixed_functions = {
+          matmul_functions[selected_config_id],
+          streamk_functions[selected_config_id]};
+      int mixed_config_id = ap::ProfileBestConfig(mixed_functions, stream,
+                                                  std::forward<Args>(args)...);
+      selected_config_id = (mixed_config_id == 0) ? selected_config_id
+                                                  : (selected_config_id + N);
+    }
+  } else {
+    if constexpr (EnableStreamK) {
+      if (selected_config_id < N) {
+        matmul_functions[selected_config_id](std::forward<Args>(args)...);
+      } else {
+        streamk_functions[selected_config_id - N](std::forward<Args>(args)...);
+      }
+    } else {
+      matmul_functions[selected_config_id](std::forward<Args>(args)...);
+    }
+  }
+
+  return selected_config_id;
+#else
+  Runner::template Apply<DefaultConfig::kConfigId, SwizzleType::kCommon>(
+      std::forward<Args>(args)...);
+  return -1;
+#endif
+}
+
+} // namespace ap
diff --git a/tests/ap/matmul/cutlass_matmul.cuh b/tests/ap/matmul/cutlass_matmul.cuh
index 1ba8556..faa57b6 100644
--- a/tests/ap/matmul/cutlass_matmul.cuh
+++ b/tests/ap/matmul/cutlass_matmul.cuh
@@ -79,8 +79,7 @@ template <typename ElementT,
           bool TransposeA = false,
           bool TransposeB = false,
           int ConfigId = DefaultConfig::kConfigId,
-          int SwizzleFactor = DefaultConfig::kSwizzleFactor,
-          bool Batched = DefaultConfig::kBatched>
+          SwizzleType ST = DefaultConfig::kSwizzleType>
 void CutlassMatmul(const GemmEpilogueParams& params) {
   using ElementAccumulator = typename CutlassDataType<ElementComputeT>::Type; // <- data type of accumulator
   using ElementComputeEpilogue = ElementAccumulator;              // <- data type of epilogue operations
@@ -107,12 +106,12 @@ void CutlassMatmul(const GemmEpilogueParams& params) {
       ElementAccumulator,
       cutlass::arch::OpClassTensorOp,
       cutlass::arch::Sm80,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::TShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::WShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::IShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::TShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::WShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::IShape,
       EpilogueOutputOp,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::SwizzleThreadBlock,
-      GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::kNumStages,
+      typename ThreadBlockSwizzle<ST>::Type,
+      GemmTuningConfigs<ElementT, ConfigId>::kNumStages,
       128 / cutlass::sizeof_bits<ElementInputA>::value, // AlignA
       128 / cutlass::sizeof_bits<ElementInputB>::value, // AlignB
       typename GemmOperation<ElementT>::Type            // Operation performed by GEMM
@@ -170,11 +169,10 @@ void CutlassMatmul(const GemmEpilogueParams& params) {
 template <typename ElementT,
           typename ElementComputeT,
           template<typename T> class UnaryFunctor,
-          bool TransposeA = false,
-          bool TransposeB = false,
+          int AlignA = 128 / cutlass::sizeof_bits<ElementT>::value,
+          int AlignB = 128 / cutlass::sizeof_bits<ElementT>::value,
           int ConfigId = DefaultConfig::kConfigId,
-          int SwizzleFactor = DefaultConfig::kSwizzleFactor,
-          bool Batched = DefaultConfig::kBatched>
+          SwizzleType ST = DefaultConfig::kSwizzleType>
 void CutlassMatmulAddUnary(const GemmEpilogueParams& params, const typename UnaryFunctor<ElementComputeT>::Arguments& unary_args) {
   using ElementAccumulator = typename CutlassDataType<ElementComputeT>::Type; // <- data type of accumulator
   using ElementComputeEpilogue = ElementAccumulator;              // <- data type of epilogue operations
@@ -197,23 +195,23 @@ void CutlassMatmulAddUnary(const GemmEpilogueParams& params, const typename Unar
 
   using GemmFunc = cutlass::gemm::device::GemmUniversal<
       ElementInputA,
-      typename MatrixLayout<TransposeA>::Type,
+      cutlass::layout::RowMajor,
       ElementInputB,
-      typename MatrixLayout<TransposeB>::Type,
+      cutlass::layout::RowMajor,
       ElementOutput,
       cutlass::layout::RowMajor,
       ElementAccumulator,
       cutlass::arch::OpClassTensorOp,
       cutlass::arch::Sm80,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::TShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::WShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::IShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::TShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::WShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::IShape,
       EpilogueOutputOp,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::SwizzleThreadBlock,
-      GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::kNumStages,
-      128 / cutlass::sizeof_bits<ElementInputA>::value, // AlignA
-      128 / cutlass::sizeof_bits<ElementInputB>::value, // AlignB
-      typename GemmOperation<ElementT>::Type  // Operation performed by GEMM
+      typename ThreadBlockSwizzle<ST>::Type,
+      GemmTuningConfigs<ElementT, ConfigId>::kNumStages,
+      AlignA,
+      AlignB,
+      typename GemmOperation<ElementT>::Type
   >;
 
   CHECK_CUTLASS(SetMaxDynamicSharedMemorySize<GemmFunc>());
@@ -265,137 +263,13 @@ void CutlassMatmulAddUnary(const GemmEpilogueParams& params, const typename Unar
 #endif
 }
 
-template <typename ElementT,
-          typename ElementComputeT,
-          int ConfigId = DefaultConfig::kConfigId,
-          int SwizzleFactor = DefaultConfig::kSwizzleFactor,
-          bool Batched = DefaultConfig::kBatched>
-void CutlassMatmulAddBroadcast(const GemmBroadcastEpilogueParams& params) {
-  using ElementAccumulator = typename CutlassDataType<ElementComputeT>::Type; // <- data type of accumulator
-  using ElementComputeEpilogue = ElementAccumulator;              // <- data type of epilogue operations
-  using ElementInputA = typename CutlassDataType<ElementT>::Type; // <- data type of elements in input matrix A
-  using ElementInputB = typename CutlassDataType<ElementT>::Type; // <- data type of elements in input matrix B
-  using ElementOutputC = typename CutlassDataType<ElementT>::Type;// <- data type of elements in output matrix D
-  using ElementOutputZ = ElementOutputC;
-  using ElementOutputT = ElementOutputC;
-
-  // Epilogue operation as LinearCombinationBiasElementwise:
-  //  Y = GEMM(AB, C)
-  //  T[i, j] = BinaryOp(Y[i, j], Broadcast[i])
-  //  Z[i, j] = Elementwise(T[i, j])
-  using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombinationBiasElementwise<
-    ElementOutputC,
-    ElementAccumulator,
-    ElementComputeEpilogue,
-    ElementOutputZ,
-    ElementOutputT,
-    128 / cutlass::sizeof_bits<ElementOutputC>::value
-  >;
-
-  // Epilogue operation as LinearCombinationResidualBlock:
-  //  Y = GEMM(AB, C1)
-  //  UnaryOp(BinaryOp2(BinaryOp1(ActivationOp(Y), residual1), residual2))
-  // using EpilogueOp = cutlass::epilogue::thread::LinearCombinationResidualBlock<
-  //   ElementOutput,                        // Element type for output matrix
-  //   ElementAccumulator,                   // Element type from internal accumulation
-  //   ElementCompute,                       // Element type from internal accumulation
-  //   ElementC,                             // Element type for C1/C2/D matrix operands
-  //   AlignmentC,                           // Memory access granularity of C and D matrix in units of elements
-  //   cutlass::epilogue::thread::Identity,  // Activation
-  //   cutlass::plus,                        // Binary operation 1
-  //   cutlass::epilogue::thread::Identity,  // Unary operation
-  //   cutlass::plus                         // Binary operation 2
-  //   >;
-
-  using GemmFunc = cutlass::gemm::device::GemmUniversalWithBroadcast<
-      ElementInputA,
-      cutlass::layout::RowMajor,
-      ElementInputB,
-      cutlass::layout::RowMajor,
-      ElementOutputC,
-      cutlass::layout::RowMajor,
-      ElementAccumulator,
-      cutlass::arch::OpClassTensorOp,
-      cutlass::arch::Sm80,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::TShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::WShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::IShape,
-      EpilogueOutputOp,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::SwizzleThreadBlock,
-      GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::kNumStages,
-      128 / cutlass::sizeof_bits<ElementInputA>::value, // AlignA
-      128 / cutlass::sizeof_bits<ElementInputB>::value, // AlignB
-      typename GemmOperation<ElementT>::Type  // Operation performed by GEMM
-  >;
-
-  CHECK_CUTLASS(SetMaxDynamicSharedMemorySize<GemmFunc>());
-
-  /// Arguments
-  cutlass::gemm::GemmCoord problem_size{params.m, params.n, params.k};
-
-  const ElementInputA *input = reinterpret_cast<const ElementInputA *>(params.input);
-  const ElementInputB *weight = reinterpret_cast<const ElementInputB *>(params.weight);
-  const ElementOutputC *bias = reinterpret_cast<const ElementOutputC *>(params.bias);
-  ElementOutputZ *output = reinterpret_cast<ElementOutputZ *>(params.output);
-  ElementOutputC *broadcast = reinterpret_cast<ElementOutputC *>(params.broadcast);
-  ElementOutputT *broadcast_out = reinterpret_cast<ElementOutputT *>(params.broadcast_out);
-
-  const int64_t batch_stride_Broadcast = params.need_broadcast ? problem_size.m() : problem_size.m() * problem_size.n();
-  const int64_t ldr_broadcast = params.need_broadcast ? 0 : problem_size.n();
-
-  ElementComputeEpilogue alpha = static_cast<ElementComputeEpilogue>(1);
-  ElementComputeEpilogue beta = static_cast<ElementComputeEpilogue>(1);
-
-  typename GemmFunc::Arguments arguments{
-      GetGemmMode(params.batch_count),
-      problem_size,                         // <- problem size of matrix multiplication
-      params.batch_count,                   // <- batch_count or k-dimension split factor
-      {alpha, beta},                        // <- epilogue params, alpha, beta
-      input,                                // <- input, ptr_A, A, shape={M, K}
-      weight,                               // <- input, ptr_B, B, shape={K, N}
-      bias,                                 // <- input, ptr_C, shape={M, N} or {1, N}
-      output,                               // <- output, ptr_D, Z, shape={M, N}
-      broadcast,                            // <- input, ptr_Vector, Broadcast, shape={M, 1}
-      broadcast_out,                        // <- output, ptr_Tensor, T
-      params.shape_args.batch_stride_A,
-      params.shape_args.batch_stride_B,
-      params.shape_args.batch_stride_C,
-      params.shape_args.batch_stride_D,
-      batch_stride_Broadcast,               // <- batch_stride_Vector, need broadcast
-      problem_size.m() * problem_size.n(),  // <- batch_stride_Tensor
-      params.shape_args.lda,
-      params.shape_args.ldb,
-      params.shape_args.ldc_bias,
-      params.shape_args.ldd,
-      ldr_broadcast,                        // <- ldr, must be zero
-      problem_size.n()                      // <- ldt
-  };
-
-  size_t workspace_size = GemmFunc::get_workspace_size(arguments);
-  void* workspace = workspace_size > 0 ? GetWorkspace(workspace_size) : nullptr;
-
-  GemmFunc device_gemm;
-
-  CHECK_CUTLASS(device_gemm.can_implement(arguments));
-  CHECK_CUTLASS(device_gemm.initialize(arguments, workspace, params.stream));
-
-  //
-  // Run the GEMM
-  //
-  CHECK_CUTLASS(device_gemm(params.stream));
-#if AP_ENABLE_DEBUG
-  CHECK_CUDA(cudaStreamSynchronize(params.stream));
-#endif
-}
-
 template <typename ElementT,
           typename ElementComputeT,
           template<typename T> class VariadicFunctor,
           int AlignA = 128 / cutlass::sizeof_bits<ElementT>::value,
           int AlignB = 128 / cutlass::sizeof_bits<ElementT>::value,
           int ConfigId = DefaultConfig::kConfigId,
-          int SwizzleFactor = DefaultConfig::kSwizzleFactor,
-          bool Batched = DefaultConfig::kBatched>
+          SwizzleType ST = DefaultConfig::kSwizzleType>
 void CutlassMatmulAddVariadic(const GemmEpilogueParams& params, const typename VariadicFunctor<ElementComputeT>::Arguments& variadic_args) {
   using ElementAccumulator = typename CutlassDataType<ElementComputeT>::Type; // <- data type of accumulator
   using ElementComputeEpilogue = ElementAccumulator;              // <- data type of epilogue operations
@@ -425,12 +299,12 @@ void CutlassMatmulAddVariadic(const GemmEpilogueParams& params, const typename V
       ElementAccumulator,
       cutlass::arch::OpClassTensorOp,
       cutlass::arch::Sm80,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::TShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::WShape,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::IShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::TShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::WShape,
+      typename GemmTuningConfigs<ElementT, ConfigId>::IShape,
       EpilogueOutputOp,
-      typename GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::SwizzleThreadBlock,
-      GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ConfigId>::kNumStages,
+      typename ThreadBlockSwizzle<ST>::Type,
+      GemmTuningConfigs<ElementT, ConfigId>::kNumStages,
       AlignA,
       AlignB,
       typename GemmOperation<ElementT>::Type
diff --git a/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h b/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h
index d9b4894..9020dd2 100644
--- a/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h
+++ b/tests/ap/matmul/cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h
@@ -1,16 +1,17 @@
 /*! \file
   \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
 
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+  The epilogue rearranges the result of a matrix product through shared memory
+  to match canonical tensor layouts in global memory. Epilogues support
+  conversion and reduction operations.
 
 */
 
 #pragma once
 
+#include "cutlass/array.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
 
 #include "cutlass/gemm/gemm.h"
 
@@ -22,267 +23,164 @@
 
 #include "cutlass/layout/permute.h"
 
-
 namespace cutlass {
 namespace epilogue {
 namespace threadblock {
 
 /// Defines sensible defaults for epilogues for SimtOps.
-template <
-  typename Shape,
-  typename WarpMmaSimt,
-  typename ElementOutput,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute,
-  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
-  int Rank = 4
->
+template <typename Shape, typename WarpMmaSimt, typename ElementOutput,
+          typename OutputOp, int ElementsPerAccess, bool ScatterD = false,
+          typename PermuteDLayout = layout::NoPermute,
+          conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
+          int Rank = 4>
 struct DefaultEpilogueWithVariadicSimt {
 
   static conv::StrideSupport const kStrideSupport = StrideSupport;
   static int const kRank = Rank;
 
-  static bool const UseCUDAStore = platform::is_same<ElementOutput, double>::value;
+  static bool const UseCUDAStore =
+      platform::is_same<ElementOutput, double>::value;
 
   /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueSimt<
-    Shape,
-    WarpMmaSimt,
-    OutputOp,
-    ElementsPerAccess
-  >;
-
-  using PackedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore
-  >;
-
-  using StridedOutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout,
-    UseCUDAStore,
-    kRank
-  >;
+  using Base =
+      DefaultEpilogueSimt<Shape, WarpMmaSimt, OutputOp, ElementsPerAccess>;
+
+  using PackedOutputTileIterator =
+      cutlass::epilogue::threadblock::PredicatedTileIterator<
+          typename Base::OutputTileThreadMap, ElementOutput, ScatterD,
+          PermuteDLayout, UseCUDAStore>;
+
+  using StridedOutputTileIterator =
+      cutlass::epilogue::threadblock::PredicatedTileIteratorConv<
+          typename Base::OutputTileThreadMap, ElementOutput, ScatterD,
+          PermuteDLayout, UseCUDAStore, kRank>;
 
   //
   // Stores the result z = (y = GEMM(A, B, C), variadic)
   //
-  using OutputTileIterator = typename platform::conditional<StrideSupport == cutlass::conv::StrideSupport::kUnity,
-                                                            PackedOutputTileIterator,
-                                                            StridedOutputTileIterator>::type;
+  using OutputTileIterator = typename platform::conditional<
+      StrideSupport == cutlass::conv::StrideSupport::kUnity,
+      PackedOutputTileIterator, StridedOutputTileIterator>::type;
 
   //
   // Define the epilogue
   //
   using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic<
-    Shape,
-    WarpMmaSimt,
-    Base::kPartitionsK,
-    OutputTileIterator,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding
-  >;
+      Shape, WarpMmaSimt, Base::kPartitionsK, OutputTileIterator,
+      typename Base::AccumulatorFragmentIterator,
+      typename Base::WarpTileIterator, typename Base::SharedLoadIterator,
+      OutputOp, typename Base::Padding>;
 };
 
 /// Defines sensible defaults for strided dgrad epilogues for SimtOps.
-template <
-  typename Shape,
-  typename WarpMmaSimt,
-  typename ElementOutput,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
+template <typename Shape, typename WarpMmaSimt, typename ElementOutput,
+          typename OutputOp, int ElementsPerAccess, bool ScatterD = false,
+          typename PermuteDLayout = layout::NoPermute>
 struct DefaultEpilogueWithVariadicSimtStridedDgrad {
 
   /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueSimtStridedDgrad<
-    Shape,
-    WarpMmaSimt,
-    OutputOp,
-    ElementsPerAccess
-  >;
+  using Base = DefaultEpilogueSimtStridedDgrad<Shape, WarpMmaSimt, OutputOp,
+                                               ElementsPerAccess>;
 
   //
   // Stores the result z = (y = GEMM(A, B, C), variadic)
   //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
-    typename Base::OutputTileThreadMap,
-    ElementOutput
-  >;
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::PredicatedTileIteratorStridedDgrad<
+          typename Base::OutputTileThreadMap, ElementOutput>;
 
   //
   // Define the epilogue
   //
   using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic<
-    Shape,
-    WarpMmaSimt,
-    Base::kPartitionsK,
-    OutputTileIterator,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding
-  >;
+      Shape, WarpMmaSimt, Base::kPartitionsK, OutputTileIterator,
+      typename Base::AccumulatorFragmentIterator,
+      typename Base::WarpTileIterator, typename Base::SharedLoadIterator,
+      OutputOp, typename Base::Padding>;
 };
 
 /// Defines sensible defaults for epilogues for TensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
+template <typename Shape, typename WarpMmaTensorOp, int PartitionsK,
+          typename ElementOutput, typename OutputOp, int ElementsPerAccess,
+          bool ScatterD = false, typename PermuteDLayout = layout::NoPermute>
 struct DefaultEpilogueWithVariadicTensorOp {
 
   /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
+  using Base = DefaultEpilogueTensorOp<Shape, WarpMmaTensorOp, PartitionsK,
+                                       OutputOp, ElementsPerAccess>;
 
   //
   // Stores the result z = (y = GEMM(A, B, C), variadic)
   //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::PredicatedTileIterator<
+          typename Base::OutputTileThreadMap, ElementOutput, ScatterD,
+          PermuteDLayout>;
 
   //
   // Define the epilogue
   //
   using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding,
-    Base::kFragmentsPerIteration
-  >;
+      Shape, WarpMmaTensorOp, PartitionsK, OutputTileIterator,
+      typename Base::AccumulatorFragmentIterator,
+      typename Base::WarpTileIterator, typename Base::SharedLoadIterator,
+      OutputOp, typename Base::Padding, Base::kFragmentsPerIteration>;
 };
 
-#if 0
 /// Defines sensible defaults for streamk epilogues for TensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  int ElementsPerAccess,
-  bool ScatterD = false,
-  typename PermuteDLayout = layout::NoPermute
->
+template <typename Shape, typename WarpMmaTensorOp, int PartitionsK,
+          typename ElementOutput, typename OutputOp, int ElementsPerAccess,
+          bool ScatterD = false, typename PermuteDLayout = layout::NoPermute>
 struct DefaultStreamkEpilogueWithVariadicTensorOp {
 
   /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
+  using Base = DefaultEpilogueTensorOp<Shape, WarpMmaTensorOp, PartitionsK,
+                                       OutputOp, ElementsPerAccess>;
 
   //
   // Stores the result z = (y = GEMM(A, B, C), variadic)
   //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput,
-    ScatterD,
-    PermuteDLayout
-  >;
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::PredicatedTileIterator<
+          typename Base::OutputTileThreadMap, ElementOutput, ScatterD,
+          PermuteDLayout>;
 
   //
   // Define the epilogue
   //
   using Epilogue = cutlass::epilogue::threadblock::EpilogueStreamkWithVariadic<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding,
-    Base::kFragmentsPerIteration
-  >;
+      Shape, WarpMmaTensorOp, PartitionsK, OutputTileIterator,
+      typename Base::AccumulatorFragmentIterator,
+      typename Base::WarpTileIterator, typename Base::SharedLoadIterator,
+      OutputOp, typename Base::Padding, Base::kFragmentsPerIteration>;
 };
-#endif
 
 /// Defines sensible defaults for epilogues for VoltaTensorOps.
-template <
-  typename Shape,
-  typename WarpMmaTensorOp,
-  int PartitionsK,
-  typename ElementOutput,
-  typename OutputOp,
-  int ElementsPerAccess
->
+template <typename Shape, typename WarpMmaTensorOp, int PartitionsK,
+          typename ElementOutput, typename OutputOp, int ElementsPerAccess>
 struct DefaultEpilogueWithVariadicVoltaTensorOp {
 
   /// Use defaults related to the existing epilogue
-  using Base = DefaultEpilogueVoltaTensorOp<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputOp,
-    ElementsPerAccess
-  >;
+  using Base = DefaultEpilogueVoltaTensorOp<Shape, WarpMmaTensorOp, PartitionsK,
+                                            OutputOp, ElementsPerAccess>;
 
   //
   // Stores the result z = (y = GEMM(A, B, C), variadic)
   //
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    typename Base::OutputTileThreadMap,
-    ElementOutput
-  >;
+  using OutputTileIterator =
+      cutlass::epilogue::threadblock::PredicatedTileIterator<
+          typename Base::OutputTileThreadMap, ElementOutput>;
 
   //
   // Define the epilogue
   //
   using Epilogue = cutlass::epilogue::threadblock::EpilogueWithVariadic<
-    Shape,
-    WarpMmaTensorOp,
-    PartitionsK,
-    OutputTileIterator,
-    typename Base::AccumulatorFragmentIterator,
-    typename Base::WarpTileIterator,
-    typename Base::SharedLoadIterator,
-    OutputOp,
-    typename Base::Padding
-  >;
+      Shape, WarpMmaTensorOp, PartitionsK, OutputTileIterator,
+      typename Base::AccumulatorFragmentIterator,
+      typename Base::WarpTileIterator, typename Base::SharedLoadIterator,
+      OutputOp, typename Base::Padding>;
 };
 
-
 } // namespace threadblock
 } // namespace epilogue
 } // namespace cutlass
diff --git a/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h b/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h
index b98db0e..8aa21a8 100644
--- a/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h
+++ b/tests/ap/matmul/cutlass_patch/gemm/kernel/default_gemm_with_variadic.h
@@ -8,194 +8,163 @@
 
 #include "cutlass/cutlass.h"
 
-#include "cutlass/gemm/kernel/gemm_universal.h"
 #include "cutlass/gemm/kernel/default_gemm_universal.h"
+#include "cutlass/gemm/kernel/gemm_universal.h"
 
 #include "cutlass_patch/epilogue/threadblock/default_epilogue_with_variadic.h"
 #include "cutlass_patch/epilogue/threadblock/epilogue_with_variadic.h"
 
-
 namespace cutlass {
 namespace gemm {
 namespace kernel {
 
-
 template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Tag indicating architecture to tune for
-  typename ArchTag,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator      - must satisfy concept of 'EpilogueWithVariadicOp'
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable = void
->
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator      - must satisfy concept of
+    /// 'EpilogueWithVariadicOp'
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    ///
+    typename Enable = void>
 struct DefaultGemmWithVariadic {
 
   using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    ArchTag,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
+      ElementA_, LayoutA_, TransformA, kAlignmentA, ElementB_, LayoutB_,
+      TransformB, kAlignmentB, ElementC_, LayoutC_, ElementAccumulator,
+      OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+      EpilogueOutputOp, ThreadblockSwizzle, Stages, Operator>::GemmKernel;
 
   // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithVariadicTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
-
-  // Compose the GEMM kernel
-  using GemmKernel = GemmUniversal<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultEpilogueWithVariadicTensorOp<
+          typename GemmBase::Epilogue::Shape,
+          typename GemmBase::Epilogue::WarpMmaOperator,
+          GemmBase::Epilogue::kPartitionsK, ElementC_, EpilogueOutputOp,
+          GemmBase::Epilogue::kElementsPerAccess>::Epilogue;
+
+  /// Universal kernel without StreamkFeature member type
+  template <class SwizzleT, class SwizzleEnable = void>
+  class SelectBase : public kernel::GemmUniversal<typename GemmBase::Mma,
+                                                  Epilogue, SwizzleT> {};
+
+  /// Universal kernel with StreamkFeature member type
+  template <class SwizzleT>
+  class SelectBase<SwizzleT, typename SwizzleT::StreamkFeature>
+      : public kernel::GemmUniversalStreamk<typename GemmBase::Mma, Epilogue,
+                                            SwizzleT> {};
+
+  /// Select kernel by ThreadblockSwizzle's support for StreamkFeature
+  using GemmKernel = SelectBase<ThreadblockSwizzle>;
 };
 
-
 /// Partial specialization: ArchTag = cutlass::arch::Sm70
 ///
 ///
 template <
-  /// Element type for A matrix operand
-  typename ElementA_,
-  /// Layout type for A matrix operand
-  typename LayoutA_,
-  /// Complex elementwise transformation on A operand
-  ComplexTransform TransformA,
-  /// Access granularity of A matrix in units of elements
-  int kAlignmentA,
-  /// Element type for B matrix operand
-  typename ElementB_,
-  /// Layout type for B matrix operand
-  typename LayoutB_,
-  /// Complex elementwise transformation on B operand
-  ComplexTransform TransformB,
-  /// Access granularity of B matrix in units of elements
-  int kAlignmentB,
-  /// Element type for C and D matrix operands
-  typename ElementC_,
-  /// Layout type for C and D matrix operands
-  typename LayoutC_,
-  /// Element type for internal accumulation
-  typename ElementAccumulator,
-  /// Operator class tag
-  typename OperatorClass,
-  /// Threadblock-level tile size (concept: GemmShape)
-  typename ThreadblockShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename WarpShape,
-  /// Warp-level tile size (concept: GemmShape)
-  typename InstructionShape,
-  /// Epilogue output operator      - must satisfy concept of 'EpilogueWithVariadicOp'
-  typename EpilogueOutputOp,
-  /// Threadblock-level swizzling operator
-  typename ThreadblockSwizzle,
-  /// Number of stages used in the pipelined mainloop
-  int Stages,
-  /// Operation performed by GEMM
-  typename Operator,
-  ///
-  typename Enable
->
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator      - must satisfy concept of
+    /// 'EpilogueWithVariadicOp'
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    ///
+    typename Enable>
 struct DefaultGemmWithVariadic<
-  ElementA_, LayoutA_, TransformA, kAlignmentA,
-  ElementB_, LayoutB_, TransformB, kAlignmentB,
-  ElementC_, LayoutC_,
-  ElementAccumulator,
-  OperatorClass,
-  cutlass::arch::Sm70,
-  ThreadblockShape,
-  WarpShape,
-  InstructionShape,
-  EpilogueOutputOp,
-  ThreadblockSwizzle,
-  Stages,
-  Operator,
-  Enable
-  > {
+    ElementA_, LayoutA_, TransformA, kAlignmentA, ElementB_, LayoutB_,
+    TransformB, kAlignmentB, ElementC_, LayoutC_, ElementAccumulator,
+    OperatorClass, cutlass::arch::Sm70, ThreadblockShape, WarpShape,
+    InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, Operator,
+    Enable> {
 
   using GemmBase = typename DefaultGemmUniversal<
-    ElementA_, LayoutA_, TransformA, kAlignmentA,
-    ElementB_, LayoutB_, TransformB, kAlignmentB,
-    ElementC_, LayoutC_, ElementAccumulator,
-    OperatorClass,
-    cutlass::arch::Sm70,
-    ThreadblockShape,
-    WarpShape,
-    InstructionShape,
-    EpilogueOutputOp,
-    ThreadblockSwizzle,
-    Stages,
-    Operator
-  >::GemmKernel;
+      ElementA_, LayoutA_, TransformA, kAlignmentA, ElementB_, LayoutB_,
+      TransformB, kAlignmentB, ElementC_, LayoutC_, ElementAccumulator,
+      OperatorClass, cutlass::arch::Sm70, ThreadblockShape, WarpShape,
+      InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages,
+      Operator>::GemmKernel;
 
   // Define epilogue
-  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithVariadicVoltaTensorOp<
-    typename GemmBase::Epilogue::Shape,
-    typename GemmBase::Epilogue::WarpMmaOperator,
-    GemmBase::Epilogue::kPartitionsK,
-    ElementC_,
-    EpilogueOutputOp,
-    GemmBase::Epilogue::kElementsPerAccess
-  >::Epilogue;
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultEpilogueWithVariadicVoltaTensorOp<
+          typename GemmBase::Epilogue::Shape,
+          typename GemmBase::Epilogue::WarpMmaOperator,
+          GemmBase::Epilogue::kPartitionsK, ElementC_, EpilogueOutputOp,
+          GemmBase::Epilogue::kElementsPerAccess>::Epilogue;
 
   // Compose the GEMM kernel
-  using GemmKernel = GemmUniversal<
-    typename GemmBase::Mma,
-    Epilogue,
-    ThreadblockSwizzle
-  >;
+  using GemmKernel =
+      GemmUniversal<typename GemmBase::Mma, Epilogue, ThreadblockSwizzle>;
 };
 
-
-}  // namespace kernel
-}  // namespace gemm
-}  // namespace cutlass
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
diff --git a/tests/ap/matmul/default_config_id.h b/tests/ap/matmul/default_config_id.h
index d2d6afa..55702d1 100644
--- a/tests/ap/matmul/default_config_id.h
+++ b/tests/ap/matmul/default_config_id.h
@@ -1,13 +1,32 @@
 #pragma once
 
 #include "all_tuning_configs.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
 
 namespace ap {
 
+enum SwizzleType { kCommon = 0, kStreamK, kBatched };
+
+template <SwizzleType ST, int SwizzleFactor = 1> struct ThreadBlockSwizzle {
+  using Type =
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<SwizzleFactor>;
+};
+
+template <int SwizzleFactor>
+struct ThreadBlockSwizzle<SwizzleType::kStreamK, SwizzleFactor> {
+  using Type = cutlass::gemm::threadblock::ThreadblockSwizzleStreamK;
+};
+
+template <int SwizzleFactor>
+struct ThreadBlockSwizzle<SwizzleType::kBatched, SwizzleFactor> {
+  using Type =
+      cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;
+};
+
 struct DefaultConfig {
-  static constexpr int kConfigId = 0;
+  static constexpr int kConfigId = 8;
   static constexpr int kSwizzleFactor = 1;
-  static constexpr bool kBatched = false;
+  static constexpr SwizzleType kSwizzleType = SwizzleType::kCommon;
 };
 
 } // namespace ap
diff --git a/tests/ap/matmul/generate_configs.py b/tests/ap/matmul/generate_configs.py
index ff6c4aa..9391129 100644
--- a/tests/ap/matmul/generate_configs.py
+++ b/tests/ap/matmul/generate_configs.py
@@ -6,89 +6,62 @@
 
 namespace ap {
 
-constexpr int kNumConfigsHalf = ${num_configs_fp16};
-constexpr int kNumConfigsFloat = ${num_configs_fp32};
-
-template <int SwizzleFactor, bool Batched> struct SwizzleWrapper {
-  using Type =
-      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<SwizzleFactor>;
+template <typename T>
+struct ConfigsInfo {
+  static constexpr int kNumTotals = ${num_configs_fp16};
 };
 
-// template <int SwizzleFactor>
-// struct SwizzleWrapper<SwizzleFactor, true> {
-//   using Type =
-//       cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;
-// };
-"""
-
-autotune_wrapper_template = """
-#define AP_AUTOTUNE_${datatype}(func, stream, ...) { \\
-  using FuncType = decltype(func<0>); \\
-  static int selected_config_id = -1; \\
-  static std::vector<std::function<FuncType>> \\
-      matmul_functions = { \\
-          ${repeat_functions} \\
-          };  \\
-  if (selected_config_id == -1) { \\
-    selected_config_id = ap::ProfileBestConfig(matmul_functions, stream, ##__VA_ARGS__); \\
-  } \\
-  matmul_functions[selected_config_id](__VA_ARGS__); \\
-}
+template <>
+struct ConfigsInfo<float> {
+  static constexpr int kNumTotals = ${num_configs_fp32};
+};
 """
 
 fp16_config_template_0 = """
-template <typename ElementT, int SwizzleFactor, bool Batched, int Id = 0>
+template <typename ElementT, int Id = 0>
 struct GemmTuningConfigs {
   using TShape = cutlass::gemm::GemmShape<${tshape}>;
   using WShape = cutlass::gemm::GemmShape<${wshape}>;
   using IShape = cutlass::gemm::GemmShape<${ishape}>;
   static constexpr int kNumStages = ${stages};
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = Id;
 };
 """
 
 fp16_config_template = """
-template <typename ElementT, int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<ElementT, SwizzleFactor, Batched, ${config_id}> {
+template <typename ElementT>
+struct GemmTuningConfigs<ElementT, ${config_id}> {
   using TShape = cutlass::gemm::GemmShape<${tshape}>;
   using WShape = cutlass::gemm::GemmShape<${wshape}>;
   using IShape = cutlass::gemm::GemmShape<${ishape}>;
   static constexpr int kNumStages = ${stages};
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = ${config_id};
 };
 """
 
 fp32_config_template_0 = """
 // Specialization for float
-template <int SwizzleFactor, bool Batched, int Id>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, Id> {
+template <int Id>
+struct GemmTuningConfigs<float, Id> {
   using TShape = cutlass::gemm::GemmShape<${tshape}>;
   using WShape = cutlass::gemm::GemmShape<${wshape}>;
   using IShape = cutlass::gemm::GemmShape<${ishape}>;
   static constexpr int kNumStages = ${stages};
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = Id;
 };
 """
 
 fp32_config_template = """
-template <int SwizzleFactor, bool Batched>
-struct GemmTuningConfigs<float, SwizzleFactor, Batched, ${config_id}> {
+template <>
+struct GemmTuningConfigs<float, ${config_id}> {
   using TShape = cutlass::gemm::GemmShape<${tshape}>;
   using WShape = cutlass::gemm::GemmShape<${wshape}>;
   using IShape = cutlass::gemm::GemmShape<${ishape}>;
   static constexpr int kNumStages = ${stages};
 
-  using SwizzleThreadBlock =
-      typename SwizzleWrapper<SwizzleFactor, Batched>::Type;
   static constexpr int kId = ${config_id};
 };
 """
@@ -194,16 +167,6 @@ def all_configs_sm80_fp32():
     return all_tuning_configs_fp32
 
 
-def generate_autotune_wrapper(datatype, num_configs):
-    repeat_func_strs = []
-    for i in range(num_configs):
-        repeat_func_strs.append(f"func<{i}>")
-    code_str = autotune_wrapper_template.replace("${datatype}", datatype).replace(
-        "${repeat_functions}", ", \\\n          ".join(repeat_func_strs)
-    )
-    return code_str
-
-
 def get_configs(all_configs_list, level=3):
     consigs_list = []
     for i in range(len(all_configs_list)):
@@ -261,14 +224,8 @@ def main():
     head_code_str = head_template.replace(
         "${num_configs_fp16}", str(num_fp16_configs)
     ).replace("${num_configs_fp32}", str(num_fp32_configs))
-    fp16_autotune_wrapper_code_str = generate_autotune_wrapper("half", num_fp16_configs)
-    fp32_autotune_wrapper_code_str = generate_autotune_wrapper(
-        "float", num_fp32_configs
-    )
     with open("all_tuning_configs.h", "w") as f:
         f.write(head_code_str)
-        f.write(fp16_autotune_wrapper_code_str)
-        f.write(fp32_autotune_wrapper_code_str)
         f.write(fp16_configs_code_str)
         f.write(fp32_configs_code_str)
         f.write(tail_code_str)
diff --git a/tests/ap/matmul/profile.h b/tests/ap/matmul/profile.h
index 72ac822..3baedd9 100644
--- a/tests/ap/matmul/profile.h
+++ b/tests/ap/matmul/profile.h
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "matmul.h"
 #include <cuda_profiler_api.h>
 #include <functional>
diff --git a/tests/ap/matmul/tests/Makefile b/tests/ap/matmul/tests/Makefile
index 5942dca..ed865b4 100644
--- a/tests/ap/matmul/tests/Makefile
+++ b/tests/ap/matmul/tests/Makefile
@@ -6,13 +6,14 @@ BUILD_DIR   := ${TESTS_DIR}/build
 
 DEBUG     := 0
 TUNING    := 1
-PROFILE   := 0
+PROFILE   := 1
 USE_HALF  := 1
 
 LIB_NAME  := matmul_kernel
 LIB       := ${BUILD_DIR}/lib${LIB_NAME}.so
 TARGET    := ${BUILD_DIR}/test_main
 TARGET_OBJ:= ${BUILD_DIR}/test_main_matmul_binary.o
+#TARGET_OBJ:= ${BUILD_DIR}/test_main_matmul_dlopen.o
 
 # COMPILER OPTIONS
 NVCC      := nvcc
@@ -21,16 +22,18 @@ NVCCFLAGS := -std=c++17 -O3
 NVCCFLAGS += -Xcompiler=-fPIC -arch=sm_80 --expt-relaxed-constexpr
 NVCCFLAGS += -I ${CUTLASS_DIR}/include -I ${CUTLASS_DIR}/tools/util/include
 NVCCFLAGS += -I ${CUDA_ROOT}/include -I ${SOURCE_DIR} -I ${TESTS_DIR}
-NVCCFLAGS += -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=${DEBUG} -DCUTLASS_EPILOGUE_ENABLE_VECTORIZE=1
+NVCCFLAGS += -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=${DEBUG} -DCUTLASS_EPILOGUE_ENABLE_VECTORIZE=0
 NVCCFLAGS += -DAP_USE_FLOAT16=${USE_HALF} -DAP_ENABLE_DEBUG=${DEBUG} -DAP_ENABLE_PROFILE=${PROFILE} -DAP_ENABLE_AUTOTUNE=${TUNING}
 
 LDFLAGS   := -L./build -l${LIB_NAME} -lcuda -lcudart
 
-# SRCS = native_kernel.cu matmul_unary_kernel.cu matmul_binary_kernel.cu kernel.cu profile.cu
+# SRCS = native_kernel.cu matmul_unary_kernel.cu matmul_binary_kernel.cu
 LIB_SRCS  := $(wildcard ${TESTS_DIR}/*.cu)
+#LIB_SRCS  := ${TESTS_DIR}/matmul_unary_kernel.cu ${TESTS_DIR}/test_util.cu
 LIB_OBJS  := $(patsubst ${TESTS_DIR}/%.cu, ${BUILD_DIR}/%.o, ${LIB_SRCS})
 
 TEST_SRCS := $(wildcard ${TESTS_DIR}/test_*.cc)
+#TEST_SRCS := ${TESTS_DIR}/test_main_matmul_unary.cc
 TEST_OBJS := $(patsubst ${TESTS_DIR}/%.cc, ${BUILD_DIR}/%.o, ${TEST_SRCS})
 
 # Compile
diff --git a/tests/ap/matmul/tests/epilogue_op.h b/tests/ap/matmul/tests/epilogue_op.h
index 214fba6..bdfdd88 100644
--- a/tests/ap/matmul/tests/epilogue_op.h
+++ b/tests/ap/matmul/tests/epilogue_op.h
@@ -5,6 +5,14 @@
 
 namespace ap {
 
+struct KernelUtils {
+#if AP_USE_FLOAT16
+  using Type = half;
+#else
+  using Type = float;
+#endif
+};
+
 // Unary Epilogue
 template <typename T> struct IdentityFunctor {
   struct Arguments {};
@@ -39,37 +47,38 @@ template <typename T> struct VariadicEpilogueFunctor {
   };
 
   // __forceinline__ __host__ __device__ int64_t
-  // Load(const Arguments &args, const MatrixCoord &coord) const {
+  // CalcOffset(const Arguments &args, const MatrixCoord &coord) const {
   //   int64_t offset = coord.batch * args.in0_shape[1] * args.in0_shape[2] +
   //                    coord.row * args.in0_shape[2] + coord.column;
   //   return offset;
   // }
 
   // __forceinline__ __host__ __device__ int64_t
-  // Load(const Arguments &args, const MatrixCoord &coord) const {
+  // CalcOffset(const Arguments &args, const MatrixCoord &coord) const {
   //   int64_t offset = coord.batch * args.in0_strides[0] +
   //                    coord.row * args.in0_strides[1] + coord.column;
   //   return offset;
   // }
 
   // __forceinline__ __host__ __device__ int64_t
-  // Load(const Arguments &args, const MatrixCoord &coord) const {
-  //   int64_t offset = coord.batch * 65536 * 32 + coord.row * 32 + coord.column;
-  //   return offset;
+  // CalcOffset(const Arguments &args, const MatrixCoord &coord) const {
+  //   int64_t offset = coord.batch * 65536 * 32 + coord.row * 32 +
+  //   coord.column; return offset;
   // }
 
   __forceinline__ __host__ __device__ int64_t
-  Load(const Arguments &args, const MatrixCoord &coord) const {
+  CalcOffset(const Arguments &args, const MatrixCoord &coord) const {
     int64_t offset = coord.column;
     return offset;
   }
 
   __forceinline__ __host__ __device__ T
   operator()(T x, const Arguments &args, const MatrixCoord &coord) const {
-    int64_t offset = Load(args, coord);
-    T y = static_cast<T>(args.in0_ptr[offset]);
-    T out = x + y;
-    return out;
+    //int64_t offset = CalcOffset(args, coord);
+    //T y = static_cast<T>(args.in0_ptr[offset]);
+    //T out = x + y;
+    //return out;
+    return x;
   }
 
   template <int N>
diff --git a/tests/ap/matmul/tests/kernel.cu b/tests/ap/matmul/tests/kernel.cu
deleted file mode 100644
index 687cce3..0000000
--- a/tests/ap/matmul/tests/kernel.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "cutlass_matmul.cuh"
-#include "epilogue_op.h"
-#include "profile.h"
-#include <vector>
-
-namespace ap {
-
-void MatmulAddBroadcastKernel(cudaStream_t *stream, const void *input,
-                              const void *weight, const void *bias,
-                              void *broadcast, void *broadcast_out,
-                              void *output,
-                              const std::vector<int64_t> &input_shape,
-                              const std::vector<int64_t> &weight_shape,
-                              const std::vector<int64_t> &bias_shape,
-                              bool need_broadcast) {
-  GemmBroadcastEpilogueParams params(*stream, input, weight, bias, broadcast,
-                                     broadcast_out, output, input_shape,
-                                     weight_shape, bias_shape, need_broadcast);
-
-  CutlassMatmulAddBroadcast<half, float>(params);
-}
-
-} // namespace ap
diff --git a/tests/ap/matmul/tests/kernel.h b/tests/ap/matmul/tests/kernel.h
index 4f10f87..de19d33 100644
--- a/tests/ap/matmul/tests/kernel.h
+++ b/tests/ap/matmul/tests/kernel.h
@@ -18,23 +18,16 @@ void MatmulAddUnaryKernel(cudaStream_t *stream, const void *input,
                           const std::vector<int64_t> &bias_shape,
                           bool transpose_b);
 
-void MatmulAddBroadcastKernel(cudaStream_t *stream, const void *input,
-                              const void *weight, const void *bias,
-                              void *broadcast, void *broadcast_out,
-                              void *output,
-                              const std::vector<int64_t> &input_shape,
-                              const std::vector<int64_t> &weight_shape,
-                              const std::vector<int64_t> &bias_shape,
-                              bool need_broadcast);
-
 void MatmulAddBinaryKernel(
     cudaStream_t *stream, const void *input, const void *weight,
     const void *bias, void *output,
     const std::vector<const void *> &epilogue_ins,
+    const std::vector<void *> &epilogue_outs,
     const std::vector<int64_t> &input_shape,
     const std::vector<int64_t> &weight_shape,
     const std::vector<int64_t> &bias_shape,
-    const std::vector<std::vector<int64_t>> &epilogue_shapes);
+    const std::vector<std::vector<int64_t>> &epilogue_in_shapes,
+    const std::vector<std::vector<int64_t>> &epilogue_out_shapes);
 
 } // namespace ap
 
diff --git a/tests/ap/matmul/tests/matmul_binary_kernel.cu b/tests/ap/matmul/tests/matmul_binary_kernel.cu
index b090f33..5b3d067 100644
--- a/tests/ap/matmul/tests/matmul_binary_kernel.cu
+++ b/tests/ap/matmul/tests/matmul_binary_kernel.cu
@@ -1,47 +1,44 @@
+#include "autotune.h"
 #include "cutlass_matmul.cuh"
 #include "default_config_id.h"
 #include "epilogue_op.h"
-#include "profile.h"
 #include <vector>
 
 namespace ap {
 
-template <int TuningConfigId>
-static void RunMatmulAddBinaryKernel(const GemmEpilogueParams &params) {
-#if AP_USE_FLOAT16
-  using ElementT = half;
-  using ElementComputeT = float;
-#else
-  using ElementT = float;
-  using ElementComputeT = float;
-#endif
+struct MatmulAddBinaryRunner {
+  template <int TuningConfigId, SwizzleType ST>
+  static void Apply(const GemmEpilogueParams &params) {
+    using ElementT = KernelUtils::Type;
+    using ElementComputeT = float;
 
-  typename VariadicEpilogueFunctor<ElementComputeT>::Arguments variadic_args;
-  if (params.epilogue_in_ptrs.size() > 0U) {
-    std::vector<int64_t> epilogue_in0_shape = params.epilogue_in_shapes[0];
-    size_t begin = 3 - epilogue_in0_shape.size();
-    int64_t stride = 1;
-    for (int i = epilogue_in0_shape.size() - 1; i >= 0; --i) {
-      variadic_args.in0_shape[begin + i] = epilogue_in0_shape[i];
-      variadic_args.in0_strides[begin + i] = stride;
-      // std::cout << "stride[" << begin + i << "]=" << stride << std::endl;
-      stride *= epilogue_in0_shape[i];
+    typename VariadicEpilogueFunctor<ElementComputeT>::Arguments variadic_args;
+    if (params.epilogue_in_ptrs.size() > 0U) {
+      std::vector<int64_t> epilogue_in0_shape = params.epilogue_in_shapes[0];
+      size_t begin = 3 - epilogue_in0_shape.size();
+      int64_t stride = 1;
+      for (int i = epilogue_in0_shape.size() - 1; i >= 0; --i) {
+        variadic_args.in0_shape[begin + i] = epilogue_in0_shape[i];
+        variadic_args.in0_strides[begin + i] = stride;
+        // std::cout << "stride[" << begin + i << "]=" << stride << std::endl;
+        stride *= epilogue_in0_shape[i];
+      }
+      for (size_t i = 0; i < begin; ++i) {
+        variadic_args.in0_shape[i] = 1;
+        variadic_args.in0_strides[i] = stride;
+        // std::cout << "stride[" << i << "]=" << stride << std::endl;
+      }
+      variadic_args.in0_ptr =
+          reinterpret_cast<const ElementT *>(params.epilogue_in_ptrs[0]);
     }
-    for (size_t i = 0; i < begin; ++i) {
-      variadic_args.in0_shape[i] = 1;
-      variadic_args.in0_strides[i] = stride;
-      // std::cout << "stride[" << i << "]=" << stride << std::endl;
-    }
-    variadic_args.in0_ptr =
-        reinterpret_cast<const ElementT *>(params.epilogue_in_ptrs[0]);
-  }
 
-  constexpr int AlignA = 128 / cutlass::sizeof_bits<ElementT>::value;
-  constexpr int AlignB = 128 / cutlass::sizeof_bits<ElementT>::value;
-  CutlassMatmulAddVariadic<ElementT, ElementComputeT, VariadicEpilogueFunctor,
-                           AlignA, AlignB, TuningConfigId>(params,
-                                                           variadic_args);
-}
+    constexpr int AlignA = 128 / cutlass::sizeof_bits<ElementT>::value;
+    constexpr int AlignB = 128 / cutlass::sizeof_bits<ElementT>::value;
+    CutlassMatmulAddVariadic<ElementT, ElementComputeT, VariadicEpilogueFunctor,
+                             AlignA, AlignB, TuningConfigId, ST>(params,
+                                                                 variadic_args);
+  }
+};
 
 void MatmulAddBinaryKernel(
     cudaStream_t *stream, const void *input, const void *weight,
@@ -55,17 +52,13 @@ void MatmulAddBinaryKernel(
     const std::vector<std::vector<int64_t>> &epilogue_out_shapes) {
   GemmEpilogueParams params(*stream, input, weight, bias, output, input_shape,
                             weight_shape, bias_shape);
-  params.SetEpilogueAndShapes(epilogue_ins, epilogue_in_shapes, epilogue_outs, epilogue_out_shapes);
+  params.SetEpilogueAndShapes(epilogue_ins, epilogue_in_shapes, epilogue_outs,
+                              epilogue_out_shapes);
 
-#if AP_ENABLE_AUTOTUNE
-#if AP_USE_FLOAT16
-  AP_AUTOTUNE_half(RunMatmulAddBinaryKernel, *stream, params);
-#else
-  AP_AUTOTUNE_float(RunMatmulAddBinaryKernel, *stream, params);
-#endif
-#else
-  RunMatmulAddBinaryKernel<DefaultConfig::kConfigId>(params);
-#endif
+  static int selected_config_id = -1;
+  selected_config_id =
+      RunWithAutotune<KernelUtils::Type, MatmulAddBinaryRunner, true>(
+          *stream, selected_config_id, params);
 }
 
 } // namespace ap
diff --git a/tests/ap/matmul/tests/matmul_kernel.cu b/tests/ap/matmul/tests/matmul_kernel.cu
index 4cefe04..23d4b63 100644
--- a/tests/ap/matmul/tests/matmul_kernel.cu
+++ b/tests/ap/matmul/tests/matmul_kernel.cu
@@ -1,29 +1,21 @@
+#include "autotune.h"
 #include "cutlass_matmul.cuh"
 #include "default_config_id.h"
 #include "epilogue_op.h"
-#include "profile.h"
 #include <vector>
 
 namespace ap {
 
-template <int TuningConfigId>
-static void RunMatmulKernel(const GemmEpilogueParams &params) {
-#if AP_USE_FLOAT16
-  using ElementT = half;
-  using ElementComputeT = float;
-#else
-  using ElementT = float;
-  using ElementComputeT = float;
-#endif
+struct MatmulRunner {
+  template <int TuningConfigId>
+  static void Apply(const GemmEpilogueParams &params) {
+    using ElementT = KernelUtils::Type;
+    using ElementComputeT = float;
 
-  if (params.transpose_b) {
-    CutlassMatmul<ElementT, ElementComputeT, false, true, TuningConfigId>(
-        params);
-  } else {
     CutlassMatmul<ElementT, ElementComputeT, false, false, TuningConfigId>(
         params);
   }
-}
+};
 
 void MatmulKernel(cudaStream_t *stream, const void *input, const void *weight,
                   void *output, const std::vector<int64_t> &input_shape,
@@ -32,15 +24,9 @@ void MatmulKernel(cudaStream_t *stream, const void *input, const void *weight,
                             input_shape, weight_shape, std::vector<int64_t>{},
                             false, transpose_b);
 
-#if AP_ENABLE_AUTOTUNE
-#if AP_USE_FLOAT16
-  AP_AUTOTUNE_half(RunMatmulKernel, *stream, params);
-#else
-  AP_AUTOTUNE_float(RunMatmulKernel, *stream, params);
-#endif
-#else
-  RunMatmulKernel<DefaultConfig::kConfigId>(params);
-#endif
+  static int selected_config_id = -1;
+  selected_config_id = RunWithAutotune<KernelUtils::Type, MatmulRunner>(
+      *stream, selected_config_id, params);
 }
 
 } // namespace ap
diff --git a/tests/ap/matmul/tests/matmul_unary_kernel.cu b/tests/ap/matmul/tests/matmul_unary_kernel.cu
index 4cce75c..d7d4e02 100644
--- a/tests/ap/matmul/tests/matmul_unary_kernel.cu
+++ b/tests/ap/matmul/tests/matmul_unary_kernel.cu
@@ -1,7 +1,7 @@
+#include "autotune.h"
 #include "cutlass_matmul.cuh"
 #include "default_config_id.h"
 #include "epilogue_op.h"
-#include "profile.h"
 #include <vector>
 
 namespace ap {
@@ -10,27 +10,23 @@ template <typename T>
 // using UnaryEpilogueFunctor = ScaleFunctor<T>;
 using UnaryEpilogueFunctor = IdentityFunctor<T>;
 
-template <int TuningConfigId>
-static void RunMatmulAddUnaryKernel(const GemmEpilogueParams &params) {
-#if AP_USE_FLOAT16
-  using ElementT = half;
-  using ElementComputeT = float;
-#else
-  using ElementT = float;
-  using ElementComputeT = float;
-#endif
+struct MatmulAddUnaryRunner {
+  template <int TuningConfigId, SwizzleType ST>
+  static void Apply(const GemmEpilogueParams &params) {
+    using ElementT = KernelUtils::Type;
+    using ElementComputeT = float;
 
-  // typename UnaryEpilogueFunctor<ElementComputeT>::Arguments unary_args{0.1};
-  typename UnaryEpilogueFunctor<ElementComputeT>::Arguments unary_args;
+    // typename UnaryEpilogueFunctor<ElementComputeT>::Arguments
+    // unary_args{0.1};
+    typename UnaryEpilogueFunctor<ElementComputeT>::Arguments unary_args;
 
-  if (params.transpose_b) {
+    constexpr int AlignA = 128 / cutlass::sizeof_bits<ElementT>::value;
+    constexpr int AlignB = 128 / cutlass::sizeof_bits<ElementT>::value;
     CutlassMatmulAddUnary<ElementT, ElementComputeT, UnaryEpilogueFunctor,
-                          false, true, TuningConfigId>(params, unary_args);
-  } else {
-    CutlassMatmulAddUnary<ElementT, ElementComputeT, UnaryEpilogueFunctor,
-                          false, false, TuningConfigId>(params, unary_args);
+                          AlignA, AlignB, TuningConfigId, ST>(params,
+                                                              unary_args);
   }
-}
+};
 
 void MatmulAddUnaryKernel(cudaStream_t *stream, const void *input,
                           const void *weight, const void *bias, void *output,
@@ -41,15 +37,10 @@ void MatmulAddUnaryKernel(cudaStream_t *stream, const void *input,
   GemmEpilogueParams params(*stream, input, weight, bias, output, input_shape,
                             weight_shape, bias_shape, false, transpose_b);
 
-#if AP_ENABLE_AUTOTUNE
-#if AP_USE_FLOAT16
-  AP_AUTOTUNE_half(RunMatmulAddUnaryKernel, *stream, params);
-#else
-  AP_AUTOTUNE_float(RunMatmulAddUnaryKernel, *stream, params);
-#endif
-#else
-  RunMatmulAddUnaryKernel<DefaultConfig::kConfigId>(params);
-#endif
+  static int selected_config_id = -1;
+  selected_config_id =
+      RunWithAutotune<KernelUtils::Type, MatmulAddUnaryRunner, true>(
+          *stream, selected_config_id, params);
 }
 
 } // namespace ap
diff --git a/tests/ap/matmul/tests/run.sh b/tests/ap/matmul/tests/run.sh
index 58e4cb8..400dbff 100755
--- a/tests/ap/matmul/tests/run.sh
+++ b/tests/ap/matmul/tests/run.sh
@@ -1,15 +1,19 @@
 #!/bin/bash
 
-export CUDA_VISIBLE_DEVICES="7"
+export CUDA_VISIBLE_DEVICES="6"
 
 #export LD_LIBRARY_PATH=/usr/local/cuda/compat:/usr/lib64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/work/abstract_pass/Athena/tests/ap/matmul/tests/build:$LD_LIBRARY_PATH
-export PATH=/opt/nvidia/nsight-systems/2023.4.1/bin:$PATH
+#export PATH=/opt/nvidia/nsight-systems/2023.4.1/bin:$PATH
+export PATH=/opt/nvidia/nsight-systems/2025.1.1/bin:$PATH
 
-nsys_args="nsys profile --stats true -w true -t cuda,nvtx,osrt,cudnn,cublas --capture-range=cudaProfilerApi -x true --force-overwrite true -o cutlass_matmul"
+AP_WORKSPACE_PATH=/work/abstract_pass/Athena/tests/ap/ap_workspace/5855040424717010002
+export LD_LIBRARY_PATH=${AP_WORKSPACE_PATH}/main:${AP_WORKSPACE_PATH}/api_wrapper:$LD_LIBRARY_PATH
 
+#nsys_args="nsys profile --stats true -w true -t cuda,nvtx,osrt,cudnn,cublas --capture-range=cudaProfilerApi -x true --force-overwrite true -o cutlass_matmul"
+nsys_args="nsys profile --stats true -w true -t cuda,nvtx --gpu-metrics-devices=cuda-visible --gpuctxsw=true --capture-range=cudaProfilerApi -x true --force-overwrite true -o cutlass_matmul"
 
 #AP_LIB_DIR=/work/abstract_pass/Athena/tests/ap/ap_workspace/15859713568798564682/main
 #export LD_LIBRARY_PATH=${AP_LIB_DIR}:$LD_LIBRARY_PATH
 ldd ./build/test_main
-${nsys_args} ./build/test_main
+${nsys_args} ./build/test_main 1 4096 7168 16384
diff --git a/tests/ap/matmul/tests/test_main_matmul_binary.cc b/tests/ap/matmul/tests/test_main_matmul_binary.cc
index 674871d..1c8c9ed 100644
--- a/tests/ap/matmul/tests/test_main_matmul_binary.cc
+++ b/tests/ap/matmul/tests/test_main_matmul_binary.cc
@@ -4,17 +4,17 @@
 #include "test_util.h"
 
 template <typename T>
-void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n,
-                         int k, bool add_bias, bool has_epilogue) {
-  bool transpose_b = false;
+void TestMain(cudaStream_t stream, int batch_count, int m, int n, int k,
+              bool add_bias, bool has_epilogue) {
+  bool random = true;
 
   std::vector<int64_t> input_shape{batch_count, m, k};
   std::vector<int64_t> weight_shape{k, n};
   std::vector<int64_t> bias_shape;
   std::vector<int64_t> output_shape{batch_count, m, n};
 
-  T *input = AllocateAndInit<T>(stream, input_shape, false, 1.);
-  T *weight = AllocateAndInit<T>(stream, weight_shape, false, 1.);
+  T *input = AllocateAndInit<T>(stream, input_shape, random, 1.);
+  T *weight = AllocateAndInit<T>(stream, weight_shape, random, 1.);
 
   T *bias = nullptr;
   if (add_bias) {
@@ -25,11 +25,13 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n,
     for (size_t i = 0; i < bias_ref.size(); ++i) {
       bias_ref[i] = static_cast<float>(1000 * (i % 11));
     }
-    bias = AllocateAndInit<T>(stream, bias_shape, false, 0., bias_ref);
+    bias = AllocateAndInit<T>(stream, bias_shape, random, 0., bias_ref);
   }
 
   std::vector<const void *> epilogue_ins;
-  std::vector<std::vector<int64_t>> epilogue_shapes;
+  std::vector<void *> epilogue_outs;
+  std::vector<std::vector<int64_t>> epilogue_in_shapes;
+  std::vector<std::vector<int64_t>> epilogue_out_shapes;
 
   T *another = nullptr;
   if (has_epilogue) {
@@ -43,7 +45,7 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n,
     another = AllocateAndInit<T>(stream, another_shape, false, 0., another_ref);
 
     epilogue_ins = {another};
-    epilogue_shapes = {another_shape};
+    epilogue_in_shapes = {another_shape};
   }
 
   T *output = AllocateAndInit<T>(stream, output_shape, false, 0.);
@@ -53,8 +55,9 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n,
       cudaMemsetAsync(output, 0, sizeof(T) * Product(output_shape), stream));
 
   KERNEL_PROFILE(ap::MatmulAddBinaryKernel(
-      &stream, input, weight, bias, output, epilogue_ins, input_shape,
-      weight_shape, bias_shape, epilogue_shapes));
+      &stream, input, weight, bias, output, epilogue_ins, epilogue_outs,
+      input_shape, weight_shape, bias_shape, epilogue_in_shapes,
+      epilogue_out_shapes));
 
   Print<T>(stream, reinterpret_cast<T *>(output), batch_count, m, n);
 
@@ -67,6 +70,7 @@ void TestMatmulAddBinary(cudaStream_t stream, int batch_count, int m, int n,
 }
 
 int main(int argc, const char *argv[]) {
+  std::cout << "TestName: test_main_matmul_binary" << std::endl;
   ProblemSizeArgs args = ParseArgs(argc, argv);
 
   cudaStream_t stream;
@@ -76,11 +80,11 @@ int main(int argc, const char *argv[]) {
   bool has_epilogue = true;
 
 #if AP_USE_FLOAT16
-  TestMatmulAddBinary<half>(stream, args.batch_count, args.m, args.n, args.k,
-                            add_bias, has_epilogue);
+  TestMain<half>(stream, args.batch_count, args.m, args.n, args.k, add_bias,
+                 has_epilogue);
 #else
-  TestMatmulAddBinary<float>(stream, args.batch_count, args.m, args.n, args.k,
-                             add_bias, has_epilogue);
+  TestMain<float>(stream, args.batch_count, args.m, args.n, args.k, add_bias,
+                  has_epilogue);
 #endif
 
   CHECK_CUDA(cudaStreamDestroy(stream));
diff --git a/tests/ap/matmul/tests/test_main_matmul_broadcast.cc b/tests/ap/matmul/tests/test_main_matmul_broadcast.cc
deleted file mode 100644
index bf51615..0000000
--- a/tests/ap/matmul/tests/test_main_matmul_broadcast.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-#include <iostream>
-
-#include "kernel.h"
-#include "test_util.h"
-
-template <typename T> void TestMatmulAddBroadcast(cudaStream_t stream) {
-  int batch_count = 1;
-  int m = 256;
-  int n = 512;
-  int k = 256;
-  bool need_broadcast = false;
-
-  std::cout << "we are running for problem: [" << m << ", " << n << ", " << k
-            << "]" << std::endl;
-
-  std::vector<int64_t> input_shape{batch_count, m, k};
-  std::vector<int64_t> weight_shape{k, n};
-  std::vector<int64_t> bias_shape{n};
-  std::vector<int64_t> output_shape{batch_count, m, n};
-
-  T *input = AllocateAndInit<T>(stream, input_shape, false, 1.);
-  T *weight = AllocateAndInit<T>(stream, weight_shape, false, 1.);
-
-  std::vector<float> bias_ref;
-  bias_ref.resize(Product(bias_shape));
-  for (size_t i = 0; i < bias_ref.size(); ++i) {
-    bias_ref[i] = static_cast<float>(1000 * (i % 10));
-  }
-  T *bias = AllocateAndInit<T>(stream, bias_shape, false, 0., bias_ref);
-
-  std::vector<int64_t> broadcast_shape = need_broadcast ? {m} : {m, n};
-  std::vector<float> broadcast_ref;
-  broadcast_ref.resize(Product(broadcast_shape));
-  if (need_broadcast) {
-    for (size_t i = 0; i < broadcast_ref.size(); ++i) {
-      broadcast_ref[i] = static_cast<float>(10000 * (i % 5));
-    }
-  } else {
-    for (size_t i = 0; i < m; ++i) {
-      for (size_t j = 0; j < n; ++j) {
-        broadcast_ref[i * n + j] = static_cast<float>(10000 * (i % 5));
-      }
-    }
-  }
-  T *broadcast =
-      AllocateAndInit<T>(stream, broadcast_shape, false, 0., broadcast_ref);
-
-  T *output = AllocateAndInit<T>(stream, output_shape, false, 0.);
-  T *broadcast_out = AllocateAndInit<T>(stream, broadcast_shape, false, 0.);
-  CHECK_CUDA(cudaStreamSynchronize(stream));
-
-  CHECK_CUDA(
-      cudaMemsetAsync(output, 0, sizeof(T) * Product(output_shape), stream));
-  CHECK_CUDA(cudaMemsetAsync(broadcast_out, 0,
-                             sizeof(T) * Product(broadcast_shape), stream));
-  ap::MatmulAddBroadcastKernel(&stream, input, weight, bias, broadcast,
-                               broadcast_out, output, input_shape, weight_shape,
-                               bias_shape, need_broadcast);
-
-  Print<T>(stream, reinterpret_cast<T *>(output), batch_count, m, n);
-
-  cudaFree(input);
-  cudaFree(weight);
-  cudaFree(bias);
-  cudaFree(output);
-  cudaFree(broadcast);
-  cudaFree(broadcast_out);
-}
-
-int main(int argc, const char *arg[]) {
-  cudaStream_t stream;
-  CHECK_CUDA(cudaStreamCreate(&stream));
-
-  TestMatmulAddBroadcast<half>(stream);
-
-  CHECK_CUDA(cudaStreamDestroy(stream));
-  return 0;
-}
diff --git a/tests/ap/matmul/tests/test_main_matmul_dlopen.cc b/tests/ap/matmul/tests/test_main_matmul_dlopen.cc
new file mode 100644
index 0000000..efbded5
--- /dev/null
+++ b/tests/ap/matmul/tests/test_main_matmul_dlopen.cc
@@ -0,0 +1,170 @@
+#include <dlfcn.h>
+#include <iostream>
+#include <string>
+
+#include "test_util.h"
+
+typedef void (*MatmulFunc)(void *, void *, void *, void *, int64_t, int64_t,
+                           int64_t, int64_t, void *);
+typedef void (*ApiWrapperFunc)(void *, void *, void **);
+
+struct DlHandle {
+  explicit DlHandle(const std::string &main_so_path,
+                    const std::string &api_wrapper_so_path) {
+    std::cout << "main_so_path: " << main_so_path << std::endl;
+    std::cout << "api_wrapper_so_path: " << api_wrapper_so_path << std::endl;
+    main_handle = dlopen(main_so_path.c_str(), RTLD_LAZY);
+    if (!main_handle) {
+      std::cerr << "Cannot open library: " << dlerror() << std::endl;
+    }
+    api_wrapper_handle = dlopen(api_wrapper_so_path.c_str(), RTLD_LAZY);
+    if (!api_wrapper_handle) {
+      std::cerr << "Cannot open library: " << dlerror() << std::endl;
+    }
+  }
+
+  template <typename... Args> void Call(Args &&...args) {
+    MatmulFunc func = (MatmulFunc)dlsym(main_handle, "MatmulVariadicKernel");
+    const char *func_error = dlerror();
+    if (func_error) {
+      std::cerr << "Cannot load symbol 'MatmulVariadicKernel': " << func_error
+                << std::endl;
+    } else {
+      func(std::forward<Args>(args)...);
+    }
+  }
+
+  void CallWithApiWrapper(void **args) {
+    void *main_func = dlsym(main_handle, "MatmulVariadicKernel");
+    const char *main_error = dlerror();
+    if (main_error) {
+      std::cerr << "Cannot load symbol 'MatmulVariadicKernel': " << main_error
+                << std::endl;
+    }
+
+    ApiWrapperFunc api_wrapper_func =
+        (ApiWrapperFunc)dlsym(api_wrapper_handle, "MatmulVariadicKernel");
+    const char *api_wrapper_error = dlerror();
+    if (api_wrapper_error) {
+      std::cerr << "Cannot load symbol 'MatmulVariadicKernel': "
+                << api_wrapper_error << std::endl;
+    }
+
+    void *ret;
+    api_wrapper_func(ret, main_func, args);
+  }
+
+  ~DlHandle() {
+    dlclose(main_handle);
+    dlclose(api_wrapper_handle);
+  }
+
+  void *main_handle;
+  void *api_wrapper_handle;
+};
+
+template <typename T>
+void TestMain(const std::string &main_so_path,
+              const std::string &api_wrapper_so_path, cudaStream_t stream,
+              int batch_count, int m, int n, int k, bool add_bias) {
+  bool random = true;
+
+  std::vector<int64_t> input_shape{batch_count, m, k};
+  std::vector<int64_t> weight_shape{k, n};
+  std::vector<int64_t> bias_shape;
+  std::vector<int64_t> output_shape{batch_count, m, n};
+
+  T *input = AllocateAndInit<T>(stream, input_shape, random, 10.);
+  T *weight = AllocateAndInit<T>(stream, weight_shape, random, 10.);
+
+  T *bias = nullptr;
+  if (add_bias) {
+    // bias_shape = {n};
+    bias_shape = {batch_count, m, n};
+    std::vector<float> bias_ref;
+    bias_ref.resize(Product(bias_shape));
+    for (size_t i = 0; i < bias_ref.size(); ++i) {
+      bias_ref[i] = static_cast<float>(1000 * (i % 11));
+    }
+    bias = AllocateAndInit<T>(stream, bias_shape, random, 0., bias_ref);
+  }
+
+  T *output0 = AllocateAndInit<T>(stream, output_shape, false, 0.);
+  T *output1 = AllocateAndInit<T>(stream, output_shape, false, 0.);
+  CHECK_CUDA(cudaStreamSynchronize(stream));
+
+  CHECK_CUDA(
+      cudaMemsetAsync(output0, 0, sizeof(T) * Product(output_shape), stream));
+  CHECK_CUDA(
+      cudaMemsetAsync(output1, 0, sizeof(T) * Product(output_shape), stream));
+
+  void *stream_ptr = &stream;
+  int64_t input0_dim0 = batch_count;
+  int64_t input0_dim1 = m;
+  int64_t input0_dim2 = k;
+  int64_t input1_dim1 = n;
+  T *in_ptr_0 = bias;
+
+  DlHandle handle(main_so_path, api_wrapper_so_path);
+  // KERNEL_PROFILE(handle.Call(stream_ptr, input, weight, output, input0_dim0,
+  //                            input0_dim1, input0_dim2, input1_dim1,
+  //                            in_ptr_0));
+
+  ap::GpuTimer gpu_timer(true);
+  for (int i = 0; i < 1010; ++i) {
+    if (i == 10) {
+      CHECK_CUDA(cudaStreamSynchronize(stream));
+      gpu_timer.Start(stream);
+    }
+    std::vector<void *> args;
+    args.push_back(&stream_ptr);
+    args.push_back(&input);
+    args.push_back(&weight);
+    if (i % 2 == 0) {
+      args.push_back(&output0);
+    } else {
+      args.push_back(&output1);
+    }
+    args.push_back(&input0_dim0);
+    args.push_back(&input0_dim1);
+    args.push_back(&input0_dim2);
+    args.push_back(&input1_dim1);
+    args.push_back(&in_ptr_0);
+    // KERNEL_PROFILE(handle.CallWithApiWrapper(args.data()));
+    handle.CallWithApiWrapper(args.data());
+  }
+  gpu_timer.Stop(stream);
+
+  Print<T>(stream, reinterpret_cast<T *>(output0), batch_count, m, n);
+
+  cudaFree(input);
+  cudaFree(weight);
+  if (add_bias) {
+    cudaFree(bias);
+  }
+  cudaFree(output0);
+  cudaFree(output1);
+}
+
+int main(int argc, const char *argv[]) {
+  std::cout << "TestName: test_main_matmul_dlopen" << std::endl;
+  ProblemSizeArgs args = ParseArgs(argc, argv);
+
+  cudaStream_t stream;
+  CHECK_CUDA(cudaStreamCreate(&stream));
+
+  std::string main_so_path = "libmatmul_variadic_kernel.so";
+  std::string api_wrapper_so_path = "api_wrapper.so";
+  bool add_bias = false;
+
+#if AP_USE_FLOAT16
+  TestMain<half>(main_so_path, api_wrapper_so_path, stream, args.batch_count,
+                 args.m, args.n, args.k, add_bias);
+#else
+  TestMain<float>(main_so_path, api_wrapper_so_path, stream, args.batch_count,
+                  args.m, args.n, args.k, add_bias);
+#endif
+
+  CHECK_CUDA(cudaStreamDestroy(stream));
+  return 0;
+}
diff --git a/tests/ap/matmul/tests/test_main_matmul_unary.cc b/tests/ap/matmul/tests/test_main_matmul_unary.cc
index c9a5c74..5999cfe 100644
--- a/tests/ap/matmul/tests/test_main_matmul_unary.cc
+++ b/tests/ap/matmul/tests/test_main_matmul_unary.cc
@@ -4,8 +4,9 @@
 #include "test_util.h"
 
 template <typename T>
-void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n,
-                        int k, bool add_bias) {
+void TestMain(cudaStream_t stream, int batch_count, int m, int n, int k,
+              bool add_bias) {
+  bool random = true;
   bool transpose_b = false;
 
   std::vector<int64_t> input_shape{batch_count, m, k};
@@ -13,8 +14,8 @@ void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n,
   std::vector<int64_t> bias_shape;
   std::vector<int64_t> output_shape{batch_count, m, n};
 
-  T *input = AllocateAndInit<T>(stream, input_shape, false, 1.);
-  T *weight = AllocateAndInit<T>(stream, weight_shape, false, 1.);
+  T *input = AllocateAndInit<T>(stream, input_shape, random, 1.);
+  T *weight = AllocateAndInit<T>(stream, weight_shape, random, 1.);
 
   T *bias = nullptr;
   if (add_bias) {
@@ -24,7 +25,7 @@ void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n,
     for (size_t i = 0; i < bias_ref.size(); ++i) {
       bias_ref[i] = static_cast<float>(1000 * (i % 11));
     }
-    bias = AllocateAndInit<T>(stream, bias_shape, false, 0., bias_ref);
+    bias = AllocateAndInit<T>(stream, bias_shape, random, 0., bias_ref);
   }
 
   T *output = AllocateAndInit<T>(stream, output_shape, false, 0.);
@@ -48,19 +49,18 @@ void TestMatmulAddUnary(cudaStream_t stream, int batch_count, int m, int n,
 }
 
 int main(int argc, const char *argv[]) {
+  std::cout << "TestName: test_main_matmul_unary" << std::endl;
   ProblemSizeArgs args = ParseArgs(argc, argv);
 
   cudaStream_t stream;
   CHECK_CUDA(cudaStreamCreate(&stream));
 
-  bool add_bias = true;
+  bool add_bias = false;
 
 #if AP_USE_FLOAT16
-  TestMatmulAddUnary<half>(stream, args.batch_count, args.m, args.n, args.k,
-                           add_bias);
+  TestMain<half>(stream, args.batch_count, args.m, args.n, args.k, add_bias);
 #else
-  TestMatmulAddUnary<float>(stream, args.batch_count, args.m, args.n, args.k,
-                            add_bias);
+  TestMain<float>(stream, args.batch_count, args.m, args.n, args.k, add_bias);
 #endif
 
   CHECK_CUDA(cudaStreamDestroy(stream));
diff --git a/tests/ap/matmul_variadic_tpl.py b/tests/ap/matmul_variadic_tpl.py
index 74819db..f2eb9e3 100644
--- a/tests/ap/matmul_variadic_tpl.py
+++ b/tests/ap/matmul_variadic_tpl.py
@@ -198,9 +198,9 @@ def make_project(
 #include <cuda_bf16.h>
 #include <vector>
 
+#include "autotune.h"
 #include "cutlass_matmul.cuh"
 #include "math_function.h"
-#include "profile.h"
 
 namespace ap {
 
@@ -212,28 +212,29 @@ def make_project(
 
   // Note: need to support vectorized operation
   __forceinline__ __host__ __device__
-  T operator()(T x, const Arguments& args, const MatrixCoord& coord) const {
-    T out;
-    ${AP_EPILOGUE_COMPUTATION_STATEMENTS}
-    return out;
+  T operator()(T x, const Arguments& args) const {
+    return x;
   }
 };
 
-template <int TuningConfigId>
-static void RunMatmulWithVariadicKernel(const GemmEpilogueParams &params, ${AP_KERNEL_ARGS_DECLARE}) {
-  using ElementT = ${output_dtype};
-  using ElementComputeT = float;
+struct MatmulWithVariadicRunner {
+  template <int TuningConfigId>
+  static void Apply(const GemmEpilogueParams &params, ${AP_KERNEL_ARGS_DECLARE}) {
+    using ElementT = ${output_dtype};
+    using ElementComputeT = float;
 
-  typename VariadicEpilogueFunctor<ElementComputeT>::Arguments epilogue_args;
+    typename VariadicEpilogueFunctor<ElementComputeT>::Arguments epilogue_args;
 
-  ${AP_EPILOGUE_ARGUMENTS_INIT}
+    ${AP_EPILOGUE_ARGUMENTS_INIT}
 
-  constexpr int AlignA = Alignment<ElementT, ${k_value}>::kValue;
-  constexpr int AlignB = Alignment<ElementT, ${n_value}>::kValue;
+    constexpr int AlignA = Alignment<ElementT, ${k_value}>::kValue;
+    constexpr int AlignB = Alignment<ElementT, ${n_value}>::kValue;
+    //std::cout << "AlignA: " << AlignA << ", AlignB: " << AlignB << std::endl;
 
-  CutlassMatmulAddVariadic<ElementT, ElementComputeT, VariadicEpilogueFunctor,
-                           AlignA, AlignB, TuningConfigId>(params, epilogue_args);
-}
+    CutlassMatmulAddUnary<ElementT, ElementComputeT, VariadicEpilogueFunctor,
+                             AlignA, AlignB, TuningConfigId>(params, epilogue_args);
+  }
+};
 
 } // namespace ap
 
@@ -250,11 +251,8 @@ def make_project(
   ap::GemmEpilogueParams params(
       *cuda_stream_ptr, ${input0}, ${input1}, nullptr, ${output}, ${input0}_shape, ${input1}_shape, std::vector<int64_t>{});
 
-#if AP_ENABLE_AUTOTUNE
-  AP_AUTOTUNE_${output_dtype}(ap::RunMatmulWithVariadicKernel, *cuda_stream_ptr, params, ${AP_KERNEL_ARGS_CALL});
-#else
-  ap::RunMatmulWithVariadicKernel<ap::DefaultConfig::kConfigId>(params, ${AP_KERNEL_ARGS_CALL});
-#endif
+  static int selected_config_id = -1;
+  selected_config_id = ap::RunWithAutotune<${output_dtype}, ap::MatmulWithVariadicRunner>(*cuda_stream_ptr, selected_config_id, params, ${AP_KERNEL_ARGS_CALL});
 }
 }
   """
@@ -290,7 +288,7 @@ def make_project(
             )
             .replace(
                 "${AP_EPILOGUE_ARGUMENTS_INIT}",
-                self.get_epilogue_arguments_init_str("epilogue_args", indent="  "),
+                self.get_epilogue_arguments_init_str("epilogue_args", indent="    "),
             )
             .replace("${kernel_name}", self.kernel_name)
             .replace("${input0}", self.get_kernel_arg_id_var_name(input0_karg))
@@ -313,7 +311,7 @@ def make_project(
             compile_cmd
             + " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1 -DCUTLASS_DEBUG_TRACE_LEVEL=0"
         )
-        compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=1 -DAP_ENABLE_DEBUG=0"
+        compile_cmd = compile_cmd + " -DAP_ENABLE_AUTOTUNE=0 -DAP_ENABLE_DEBUG=1"
         compile_cmd = (
             compile_cmd
             + f" --shared {self.library_name}.cu -o lib{self.library_name}.so"
diff --git a/tests/ap/paddle-tests/test_pcc.py b/tests/ap/paddle-tests/test_pcc.py
index 29ec020..5a47cd3 100644
--- a/tests/ap/paddle-tests/test_pcc.py
+++ b/tests/ap/paddle-tests/test_pcc.py
@@ -32,8 +32,8 @@ def __init__(self, fn):
         super().__init__()
         self.fn = fn
 
-    def forward(self, x, y, b):
-        out = self.fn(x, y, b)
+    def forward(self, x, y, b, r):
+        out = self.fn(x, y, b, r)
         return out
 
 
@@ -43,34 +43,41 @@ def setUp(self):
         self.prepare_data()
 
     def prepare_data(self):
-        self.dtype = "bfloat16"
+        self.dtype = "float16"
 
-        self.x_shape = [4, 65536, 128]
+        self.x_shape = [1, 4096, 16384]
+        #self.x_shape = [1, 4096, 2048]
         self.x = paddle.randn(self.x_shape, dtype=self.dtype)
         self.x.stop_gradient = False
 
-        self.y_shape = [128, 32]
+        self.y_shape = [16384, 7168]
+        #self.y_shape = [2048, 7168]
         self.y = paddle.randn(self.y_shape, dtype=self.dtype)
         self.y.stop_gradient = False
 
-        self.b_shape = [32]
+        self.b_shape = [1, 4096, 7168]
         self.b = paddle.randn(self.b_shape, dtype=self.dtype)
         self.b.stop_gradient = False
 
+        self.r = paddle.randn(self.b_shape, dtype=self.dtype)
+        self.r.stop_gradient = False
+
     def run_with_dy2st(self, profile):
-        def matmul_add_relu(x, y, b):
+        def matmul_add_relu(x, y, b, r):
             out = paddle.matmul(x, y)
-            return paddle.nn.functional.relu(out + b)
+            #return paddle.nn.functional.relu(out + b)
+            return out + b #+ r
 
         net = CINNSubGraphNet(matmul_add_relu)
         input_spec = [
             InputSpec(shape=self.x_shape, dtype=self.dtype),
             InputSpec(shape=self.y_shape, dtype=self.dtype),
             InputSpec(shape=self.b_shape, dtype=self.dtype),
+            InputSpec(shape=self.b_shape, dtype=self.dtype),
         ]
         net = utils.apply_to_static(net, False, input_spec)
         net.eval()
-        out = utils.run_with_profile(profile, net, self.x, self.y, self.b)
+        out = utils.run_with_profile(profile, net, self.x, self.y, self.b, self.r)
         return out
 
     def run_with_pcc(self, profile):
@@ -83,25 +90,27 @@ def run_with_pcc(self, profile):
         def matmul_add_relu(
             x: pct.Tensor([B, M, K], DType),
             y: pct.Tensor([K, N], DType),
-            b: pct.Tensor([N], DType),
+            b: pct.Tensor([B, M, N], DType),
+            #r: pct.Tensor([B, M, N], DType),
         ):
             def epilogue(out):
-                return paddle.nn.functional.relu(out + b)
+                #return paddle.nn.functional.relu(out + b)
+                return out + b #+ r
 
             out = paddle.matmul(x, y)
             return epilogue(out)
 
         parent_dir = os.path.dirname(os.path.abspath(__file__))
         fused_matmul = pcc.compile(matmul_add_relu, ap_path=parent_dir)
-        out = fused_matmul(self.x, self.y, self.b)
+        out = utils.run_with_profile(profile, fused_matmul, self.x, self.y, self.b)#, self.r)
         return out
 
     def test_matmul_add_relu(self):
-        profile = False
+        profile = True
         ap_out = self.run_with_pcc(profile=profile)
-        dy2st_out = self.run_with_dy2st(profile=profile)
-        if not profile:
-            utils.check_result(self.dtype, ap_out, dy2st_out)
+        #dy2st_out = self.run_with_dy2st(profile=profile)
+        #if not profile:
+        #    utils.check_result(self.dtype, ap_out, dy2st_out)
 
 
 if __name__ == "__main__":