From bf96f71ebbee2a2490ad268fe815efbe960aa953 Mon Sep 17 00:00:00 2001
From: SHshenhao <shen.hao.1985@gmail.com>
Date: Mon, 16 Oct 2023 15:55:40 +0800
Subject: [PATCH] add_dp_rt

---
 .../runtime/core/DIPIRawGeneratorImpl.cpp     |  93 ++
 .../runtime/core/DIPURawGeneratorImpl.h       |  37 +
 src/turbomind/runtime/device/basedef.h        |  88 ++
 src/turbomind/runtime/device/rawdeviceapis.h  | 107 +++
 src/turbomind/runtime/device/rawdiclapis.h    |  84 ++
 .../runtime/diopirt/diopirt_impl.cpp          |  96 +++
 src/turbomind/runtime/diopirt/diopirt_impl.h  |  65 ++
 src/turbomind/runtime/rthelper.h              |   5 +
 .../vendor/CUDA/cmake/CUDAComputeArch.cmake   | 266 ++++++
 .../runtime/vendor/CUDA/cmake/FindNCCL.cmake  |  49 ++
 .../runtime/vendor/CUDA/deviceimpl.cpp        | 255 ++++++
 .../vendor/CUDA/rawcommuniatorimpl.cpp        | 146 ++++
 src/turbomind/runtime/vendor/CUDA/vendorapi.h |  28 +
 src/turbomind/utils/Tensor.cc                 |   6 +-
 src/turbomind/utils/allocator.h               |  33 +-
 src/turbomind/utils/cuda_utils.cc             |  24 +-
 src/turbomind/utils/cuda_utils.h              | 134 ++-
 src/turbomind/utils/logger.cc                 |   7 +-
 src/turbomind/utils/memory_utils.cu           | 804 ++++++++++--------
 src/turbomind/utils/memory_utils.h            |  84 +-
 src/turbomind/utils/nccl_utils.cc             | 132 ++-
 src/turbomind/utils/nccl_utils.h              |  32 +-
 22 files changed, 2047 insertions(+), 528 deletions(-)
 create mode 100644 src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp
 create mode 100644 src/turbomind/runtime/core/DIPURawGeneratorImpl.h
 create mode 100644 src/turbomind/runtime/device/basedef.h
 create mode 100644 src/turbomind/runtime/device/rawdeviceapis.h
 create mode 100644 src/turbomind/runtime/device/rawdiclapis.h
 create mode 100644 src/turbomind/runtime/diopirt/diopirt_impl.cpp
 create mode 100644 src/turbomind/runtime/diopirt/diopirt_impl.h
 create mode 100644 src/turbomind/runtime/rthelper.h
 create mode 100644 src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake
 create mode 100644 src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake
 create mode 100644 src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp
 create mode 100644 src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp
 create mode 100644 src/turbomind/runtime/vendor/CUDA/vendorapi.h

diff --git a/src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp b/src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp
new file mode 100644
index 0000000000..f11571b3f4
--- /dev/null
+++ b/src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp
@@ -0,0 +1,93 @@
+// Copyright (c) 2023, DeepLink.
+#include "DIPURawGeneratorImpl.h"
+#include "../device/rawdeviceapis.h>
+
+namespace dipu {
+/**
+ * DIPURawGeneratorImpl class implementation
+ */
+DIPURawGeneratorImpl::DIPURawGeneratorImpl() :state_need_reset_(true) {
+  seed_ = 0;
+}
+
+/**
+ * Private clone method implementation
+ *
+ * See Note [Acquire lock when using random generators]
+ */
+std::shared_ptr<DIPURawGeneratorImpl> DIPURawGeneratorImpl::clone() const {
+  auto gen = new DIPURawGeneratorImpl();
+  gen->set_current_seed(this->seed_);
+  auto state = this->state_;
+  const auto& state_clone = this->clone_state();
+  gen->set_state(state_clone);
+  gen->set_state_flag(this->state_need_reset_);
+  return std::shared_ptr<DIPURawGeneratorImpl>(&gen);
+}
+
+/**
+ * Sets the seed to be used by MTGP
+ *
+ * See Note [Acquire lock when using random generators]
+ */
+void DIPURawGeneratorImpl::set_current_seed(uint64_t seed) {
+  seed_ = seed;
+  state_need_reset_ = true;
+}
+
+/**
+ * Gets the current seed of DIPUGeneratorImpl.
+ */
+uint64_t DIPURawGeneratorImpl::current_seed() const {
+  return seed_;
+}
+
+/**
+ * Gets a nondeterministic random number from /dev/urandom or time,
+ * seeds the CPUGeneratorImpl with it and then returns that number.
+ *
+ */
+uint64_t DIPURawGeneratorImpl::seed() {
+  //TODO:随机生成seed
+  uint64_t random = 42;
+  this->set_current_seed(random);
+  return random;
+}
+
+/**
+ * get state
+ *
+ * See Note [Acquire lock when using random generators]
+  */
+RandState DIPURawGeneratorImpl::get_state() const {
+  if (state_need_reset_) {
+    update_state();
+  }
+  auto state_clone = this->clone_state();
+  return state_clone;
+}
+
+RandState DIPURawGeneratorImpl::clone_state(const RandState& state) const {
+  uint8_t newnum = *(state.state_);
+  RandState newState{&newnum, state.size_};
+  return newState;
+}
+
+/**
+* set state
+*
+* See Note [Acquire lock when using random generators]
+*/
+void DIPURawGeneratorImpl::set_state(const RandState& state) {
+  this->state_ = this->clone_state(state);
+}
+
+/**
+ * set state flag
+ * See Note [Acquire lock when using random generators]
+  */
+void DIPURawGeneratorImpl::set_state_flag(bool flag) {
+  state_need_reset_ = flag;
+}
+
+}  // namespace dipu
diff --git a/src/turbomind/runtime/core/DIPURawGeneratorImpl.h b/src/turbomind/runtime/core/DIPURawGeneratorImpl.h
new file mode 100644
index 0000000000..bc09c32f58
--- /dev/null
+++ b/src/turbomind/runtime/core/DIPURawGeneratorImpl.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023, DeepLink.
+#pragma once
+
+#include <memory>
+#include <cstdint>
+
+#include "basedef.h"
+
+namespace dipu {
+struct RandState {
+    std::share_ptr<uint8_t*> state_ = nullptr;
+    size_t size_ = 0;
+};
+
+class DIPURawGeneratorImpl {
+public:
+  // Constructors
+  explicit DIPURawGeneratorImpl();
+  ~DIPURawGeneratorImpl() = default;
+
+  std::shared_ptr<DIPURawGeneratorImpl> clone() const;
+  void set_current_seed(uint64_t seed);
+  uint64_t current_seed() const;
+  uint64_t seed();
+  RandState get_state() const;
+  RandState clone_state(const RandState& state) const;
+  virtual void set_state(const RandState& state) {};
+
+  void set_state_flag(bool flag);
+  virtual void update_state() const {};
+
+  uint64_t seed_;
+  mutable RandState state_;
+  mutable bool state_need_reset_;
+}
+
+}  // namespace dipu
diff --git a/src/turbomind/runtime/device/basedef.h b/src/turbomind/runtime/device/basedef.h
new file mode 100644
index 0000000000..5bb6ad90de
--- /dev/null
+++ b/src/turbomind/runtime/device/basedef.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2023, DeepLink.
+#pragma once
+
+// todo: move out deice dir to diopi
+namespace dipu {
+
+#define DIPU_API __attribute__ ((visibility ("default")))
+
+#define DIPU_WEAK  __attribute__((weak))
+
+// "default", "hidden", "protected" or "internal
+#define DIPU_HIDDEN __attribute__ ((visibility ("hidden")))
+
+typedef int32_t enum_t;
+
+#define DIPU_STRING(x) #x
+#define DIPU_CODELOC __FILE__ " (" DIPU_STRING(__LINE__) ")"
+
+
+#define DIPU_LOGE(fmt, ...)          \
+  printf(                           \
+      "[ERROR]%s,%s:%u:" #fmt "\n", \
+      __FUNCTION__,                 \
+      __FILE__,                     \
+      __LINE__,                     \
+      ##__VA_ARGS__)
+
+#define DIPU_LOGW(fmt, ...)         \
+  printf(                          \
+      "[WARN]%s,%s:%u:" #fmt "\n", \
+      __FUNCTION__,                \
+      __FILE__,                    \
+      __LINE__,                    \
+      ##__VA_ARGS__)
+
+
+namespace devapis {
+
+enum class VendorDeviceType : enum_t {
+  MLU,  //camb
+  NPU,  //ascend
+  CUDA, //cuda
+  GCU,  //gcu
+  SUPA, //Biren
+  DROPLET, //droplet
+};
+
+enum class EventStatus: enum_t {
+  PENDING,
+  RUNNING,
+  DEFERRED,
+  READY
+};
+
+enum class OpStatus: enum_t {
+  SUCCESS,
+  ERR_UNKNOWN,
+  ERR_NOMEM,
+};
+
+enum class MemCPKind: enum_t {
+  D2H,
+  H2D,
+  D2D,
+};
+
+typedef enum {
+  /*! The operation was successful. */
+  DICL_SUCCESS = 0x0,
+
+  /*! undefined error */
+  DICL_ERR_UNDEF = 0x01000,
+
+} diclResult_t;
+
+struct DIPUDeviceProperties {
+  std::string name;
+  size_t totalGlobalMem = 0;
+  int32_t major = 0;
+  int32_t minor = 0;
+  int32_t multiProcessorCount = 0;
+};
+
+using deviceId_t = int64_t;
+
+
+} // end namespace devapis
+} // end namespace dipu
\ No newline at end of file
diff --git a/src/turbomind/runtime/device/rawdeviceapis.h b/src/turbomind/runtime/device/rawdeviceapis.h
new file mode 100644
index 0000000000..9d85ff8f0a
--- /dev/null
+++ b/src/turbomind/runtime/device/rawdeviceapis.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2023, DeepLink.
+#pragma once
+
+#include <cstring>
+
+#include "./vendor/vendorapi.h"
+#include "./basedef.h"
+
+namespace dipu {
+
+extern devapis::VendorDeviceType VENDOR_TYPE;
+namespace devapis {
+
+DIPU_API void initializeVendor();
+
+DIPU_API void finalizeVendor();
+
+DIPU_API deviceId_t current_device();
+
+DIPU_API DIPUDeviceProperties getDeviceProperties(int32_t device_index);
+
+// set current device given device according to id
+DIPU_API void setDevice(deviceId_t devId);
+
+DIPU_API void resetDevice(deviceId_t devId = 0);
+
+DIPU_API void syncDevice();
+
+// check last launch succ or not, throw if fail
+DIPU_API void checkLastError();
+
+DIPU_API int getDeviceCount();
+
+DIPU_API void getDriverVersion(int* version);
+
+DIPU_API void getRuntimeVersion(int* version);
+
+DIPU_API void createStream(deviceStream_t* stream, bool prior=false);
+
+DIPU_API void destroyStream(deviceStream_t stream);
+DIPU_API void destroyStream(deviceStream_t stream, deviceId_t devId);
+
+DIPU_API void releaseStream();
+
+DIPU_API void syncStream(deviceStream_t stream);
+
+DIPU_API bool streamNotNull(deviceStream_t stream);
+
+DIPU_API void streamWaitEvent(deviceStream_t stream, deviceEvent_t event);
+
+// same as query last event status in stream.(every op has a event)
+DIPU_API bool isStreamEmpty(deviceStream_t stream);
+
+// =====================
+//  device event related
+// =====================
+
+DIPU_API void createEvent(deviceEvent_t* event);
+
+DIPU_API void destroyEvent(deviceEvent_t event);
+
+DIPU_API void waitEvent(deviceEvent_t event);
+
+DIPU_API void recordEvent(deviceEvent_t event, deviceStream_t stream);
+
+DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end);
+
+DIPU_API EventStatus getEventStatus(deviceEvent_t event);
+
+// =====================
+//  mem related
+// =====================
+DIPU_API void mallocHost(void** p, size_t nbytes);
+
+DIPU_API void freeHost(void* p);
+
+DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion= true);
+
+DIPU_API void freeDevice(void* p);
+
+DIPU_API bool isPinnedPtr(const void *p);
+
+// (asynchronous) set val
+DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size);
+
+// (synchronous) copy from device to a device
+DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src);
+
+// (synchronous) copy from host to a device
+DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src);
+
+// (synchronous) copy from a device to host
+DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src);
+
+// (asynchronous) copy from device to a device
+DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
+        deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src);
+
+// (asynchronous) copy from host to a device
+DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes,
+        /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src);
+
+// (asynchronous) copy from a device to host
+DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes,
+        /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src);
+}  // end namespace devapis
+}  // end namespace dipu
\ No newline at end of file
diff --git a/src/turbomind/runtime/device/rawdiclapis.h b/src/turbomind/runtime/device/rawdiclapis.h
new file mode 100644
index 0000000000..d697f2f7aa
--- /dev/null
+++ b/src/turbomind/runtime/device/rawdiclapis.h
@@ -0,0 +1,84 @@
+#pragma once
+
+
+#include "vendor/vendorapi.h“
+#include "deviceapis.h"
+#include "../diopirt/diopirt_impl.h"
+
+namespace dipu {
+
+// need add return status.
+namespace devapis {
+  typedef enum  
+  {
+    MIN,
+    MAX,
+    SUM,
+    PRODUCT,
+    AVG,
+  } DiclReduceOp;
+
+  template<typename T>
+  static DiclDataType getDiclDataType() {
+    DiclDataType dType;
+    if (std::is_same<T, float>::value) {
+        dType = DiclDataType::TYPE_FP32;
+    }
+    else if (std::is_same<T, int>::value) {
+        dType = DiclDataType::TYPE_INT32;
+    }
+    else if (std::is_same<T, bool>::value) {
+        dType = DiclDataType::TYPE_BOOL;
+    }
+    else {
+        printf("[ERROR] DICL only support float, int, and bool. \n");
+        exit(-1);
+    }
+    return dType;
+  };
+
+  extern const int DICL_UNIQUE_ID_BYTES_SIZE;
+
+  // todo:: dipu only export devproxy but not devapis (which move o diopi)
+  DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm);
+
+  DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId);
+
+  DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId = -1);
+
+  // DIPU_API void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist);
+
+  DIPU_API diclResult_t diclCommDestroy(diclComm_t comm);
+
+  // DIPU_API diclResult_t diclCommFinalize(diclComm_t comm);
+
+  // DIPU_API diclResult_t diclCommAbort(diclComm_t comm);
+
+  DIPU_API diclResult_t diclRawAllReduce(const void *sendbuff, void *recvbuff, size_t count, DiclDataType datatype,
+                              const DiclReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawBroadcast(const void *sendbuff, void* recvbuff, size_t count, DiclDataType datatype,
+                              int root, diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawAllGather(const void *sendBuf, void *recvBuf, size_t count, DiclDataType datatype,
+                              diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawReduce(const void* sendbuff, void* recvbuff, size_t count, DiclDataType datatype,
+                            const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, DiclDataType dataType, 
+                                  const ReduceOp& op, diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawSend(void* sendbuff, size_t count, DiclDataType datatype, int peer,
+                          diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawRecv(void* recvbuff, size_t count, DiclDataType datatype, int peer,
+                          diclComm_t comm, deviceStream_t stream);
+
+  DIPU_API diclResult_t diclRawGroupStart();
+
+  DIPU_API diclResult_t diclRawGroupEnd();
+
+} // namespace devapis
+
+} // namespace dipu
\ No newline at end of file
diff --git a/src/turbomind/runtime/diopirt/diopirt_impl.cpp b/src/turbomind/runtime/diopirt/diopirt_impl.cpp
new file mode 100644
index 0000000000..19126e45ef
--- /dev/null
+++ b/src/turbomind/runtime/diopirt/diopirt_impl.cpp
@@ -0,0 +1,96 @@
+#include <stdio.h>
+
+#include "diopirt_impl.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace dipu {
+
+namespace diopi_helper {
+
+::diopiTensorHandle_t toDiopiTensorHandle(turbomind::Tensor& tensor) {
+    return tensor.data == nullptr ? nullptr : reinterpret_cast<::diopiTensorHandle_t>(&tensor);
+}
+
+::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor& tensor) {
+    return tensor.data == nullptr ? nullptr : reinterpret_cast<::diopiConstTensorHandle_t>(&tensor);
+}
+
+::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor* tensor) {
+    return tensor == nullptr ? nullptr : toDiopiTensorHandle(*tensor);
+}
+
+// ::diopiGeneratorHandle_t toDiopiGeneratorHandle(::Generator& generator) {
+//     if (!generator.has_value()) return nullptr;
+//         return toDiopiGeneratorHandle(generator.value());
+// }
+
+}  // namespace diopi_helper
+
+}  // namespace dipu
+
+
+bool diopiTensor::resetShape(const diopiSize_t* size) {
+    int64_t numel = 1;
+    for (int64_t i = 0; i < size->len; ++i) {
+        numel *= size->data[i];
+    }
+    if (numel != numel_) return false;
+
+    shape_.resize(size->len);
+    for (int64_t i = size->len - 1; i >= 0; --i) {
+        shape_[i] = size->data[i];
+    }
+    return true;
+}
+
+DIOPI_RT_API diopiError_t diopiGetTensorData(diopiTensorHandle_t th, void** pptr) {
+    *pptr = th->getPtr<void*>();
+    return diopiSuccess;
+}
+
+DIOPI_RT_API diopiError_t diopiGetTensorDataConst(diopiConstTensorHandle_t th, const void** pptr) {
+    *pptr = th->getPtr<void*>();
+    return diopiSuccess;
+}
+
+DIOPI_RT_API diopiError_t diopiGetTensorShape(diopiConstTensorHandle_t th, diopiSize_t* size) {
+    diopiSize_t thSize = {th->getPtr<int64_t>(), std::static_cast<int64_t>(size())};
+    *size = thSize;
+    return diopiSuccess;
+}
+
+DIOPI_RT_API diopiError_t diopiGetTensorStride(diopiConstTensorHandle_t th, diopiSize_t* stride) {
+    diopiSize_t thStride = {th->getPtr<int64_t>(), std::static_cast<int64_t>(size())};
+    *size = thStride;
+    return diopiSuccess;
+}
+
+DIOPI_RT_API diopiError_t diopiGetTensorDtype(diopiConstTensorHandle_t th, diopiDtype_t* dtype) {
+    *dtype = toDiopiDataType[th->dtype()];
+    return diopiSuccess;
+}
+
+// DIOPI_RT_API diopiError_t diopiGetTensorDevice(diopiConstTensorHandle_t th, diopiDevice_t* device) {
+//     *device = th->device();
+//     return diopiSuccess;
+// }
+
+DIOPI_RT_API diopiError_t diopiGetTensorNumel(diopiConstTensorHandle_t th, int64_t* numel) {
+    diopiSize_t thNumel = {th->getPtr<int64_t>(), std::static_cast<int64_t>(size())};
+    *numel = thNumel;
+    return diopiSuccess;
+}
+
+DIOPI_RT_API diopiError_t diopiGetTensorElemSize(diopiConstTensorHandle_t th, int64_t* elemSize) {
+    *elemSize = std::static_cast<int64_t>(th->sizeBytes());
+    return diopiSuccess;
+}
\ No newline at end of file
diff --git a/src/turbomind/runtime/diopirt/diopirt_impl.h b/src/turbomind/runtime/diopirt/diopirt_impl.h
new file mode 100644
index 0000000000..7754020aa9
--- /dev/null
+++ b/src/turbomind/runtime/diopirt/diopirt_impl.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2023, DeepLink.
+#pragma once
+
+#include <diopi/diopirt.h>
+// #include <diopi/functions.h>
+
+#include "../rthelper.h"
+
+#include "../utils/tensor.h"
+
+using deviceStream_t = dipu::deviceStream_t;
+
+extern "C" {
+struct diopiContext {
+    deviceStream_t stream;
+    // 1. use arrays to hold tensor that avoid tensor deleting when leaving scope
+    // 2. The address of each array must be fixed, so use list instead of vector
+    std::list<at::Tensor> arrays;
+
+    explicit diopiContext(const deviceStream_t& s) : stream(s) {}
+};
+
+}  // extern "C"
+
+namespace dipu {
+
+namespace diopi_helper {
+
+::diopiTensorHandle_t toDiopiTensorHandle(turbomind::Tensor& tensor);
+::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor& tensor);
+::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor* tensor);
+
+::diopiGeneratorHandle_t toDiopiGeneratorHandle(::Generator& generator);
+
+static std::map<turbomind::DataType, diopiDtype_t> toDiopiDataType = {
+    {turbomind::DataType::TYPE_INVALID, diopiDtype_t::diopi_dtype_unsupported},
+    {turbomind::DataType::TYPE_BOOL, diopiDtype_t::diopi_dtype_bool},
+    {turbomind::DataType::TYPE_INT8, diopiDtype_t::diopi_dtype_int8},
+    {turbomind::DataType::TYPE_INT32, diopiDtype_t::diopi_dtype_int32},
+    {turbomind::DataType::TYPE_INT64, diopiDtype_t::diopi_dtype_int64},
+    {turbomind::DataType::TYPE_UINT8, diopiDtype_t::diopi_dtype_uint8},
+    {turbomind::DataType::TYPE_UINT32, diopiDtype_t::diopi_dtype_uint32},
+    {turbomind::DataType::TYPE_UINT64, diopiDtype_t::diopi_dtype_uint64},
+    {turbomind::DataType::TYPE_FP16, diopiDtype_t::diopi_dtype_float16},
+    {turbomind::DataType::TYPE_FP32, diopiDtype_t::diopi_dtype_float32},
+    {turbomind::DataType::TYPE_FP64, diopiDtype_t::diopi_dtype_float64}
+};
+
+static std::map<diopiDtype_t, turbomind::DataType> toTmDataType = {
+    {diopiDtype_t::diopi_dtype_unsupported, turbomind::DataType::TYPE_INVALID},
+    {diopiDtype_t::diopi_dtype_bool, turbomind::DataType::TYPE_BOOL},
+    {diopiDtype_t::diopi_dtype_int8, turbomind::DataType::TYPE_INT8},
+    {diopiDtype_t::diopi_dtype_int32, turbomind::DataType::TYPE_INT32},
+    {diopiDtype_t::diopi_dtype_int64, turbomind::DataType::TYPE_INT64},
+    {diopiDtype_t::diopi_dtype_uint8, turbomind::DataType::TYPE_UINT8},
+    {diopiDtype_t::diopi_dtype_uint32, turbomind::DataType::TYPE_UINT32},
+    {diopiDtype_t::diopi_dtype_uint64, turbomind::DataType::TYPE_UINT64},
+    {diopiDtype_t::diopi_dtype_float16, turbomind::DataType::TYPE_FP16},
+    {diopiDtype_t::diopi_dtype_float32, turbomind::DataType::TYPE_FP32},
+    {diopiDtype_t::diopi_dtype_float64, turbomind::DataType::TYPE_FP64}
+};
+
+}  // namespace diopi_helper
+
+}  // namespace dipu
\ No newline at end of file
diff --git a/src/turbomind/runtime/rthelper.h b/src/turbomind/runtime/rthelper.h
new file mode 100644
index 0000000000..94bd84eef4
--- /dev/null
+++ b/src/turbomind/runtime/rthelper.h
@@ -0,0 +1,5 @@
+// Copyright (c) 2023, DeepLink.
+#include <csrc_dipu/runtime/device/basedef.h>
+#include <csrc_dipu/runtime/device/rawdeviceapis.h>
+#include <csrc_dipu/runtime/device/rawdiclapis.h>
+#include <csrc_dipu/runtime/core/DIPURawGeneratorImpl.h>
\ No newline at end of file
diff --git a/src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake b/src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake
new file mode 100644
index 0000000000..df1b7d667f
--- /dev/null
+++ b/src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake
@@ -0,0 +1,266 @@
+# Synopsis:
+#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
+#   -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
+#      target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
+#       - "Auto" detects local machine GPU compute arch at runtime.
+#       - "Common" and "All" cover common and entire subsets of architectures
+#      ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
+#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
+#      NUM: Any number. Only those pairs are currently accepted by NVCC though:
+#            2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
+#      Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
+#      Additionally, sets ${out_variable}_readable to the resulting numeric list
+#      Example:
+#        CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
+#        LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
+#
+#      More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
+#
+
+# This list will be used for CUDA_ARCH_NAME = All option
+set(CUDA_KNOWN_GPU_ARCHITECTURES  "Kepler" "Maxwell")
+
+# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default)
+set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0")
+
+if (NOT (CUDA_VERSION VERSION_LESS "7.0"))
+    list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
+    if (NOT (CUDA_VERSION VERSION_LESS "8.0"))
+        list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal")
+        list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1")
+        if (NOT (CUDA_VERSION VERSION_LESS "9.0"))
+            list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta")
+            list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0")
+            if (NOT (CUDA_VERSION VERSION_LESS "10.0"))
+                list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing")
+                list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5")
+                if (NOT (CUDA_VERSION VERSION_LESS "11.0"))
+                    list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere")
+                    # https://forums.developer.nvidia.com/t/nvcc-fatal-unsupported-gpu-architecture-compute-86/161424
+                    if(NOT (CUDA_VERSION VERSION_LESS "11.2"))
+                        list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0" "8.6+PTX")
+                    else()
+                        list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX")
+                    endif()
+                else()
+                    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5+PTX")
+                endif()
+            else()
+                list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0+PTX")
+            endif()
+        else()
+            list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.1+PTX")
+        endif()
+    else()
+        list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX")
+    endif()
+else()
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.0+PTX")
+endif()
+
+
+################################################################################################
+# A function for automatic detection of GPUs installed  (if autodetection is enabled)
+# Usage:
+#   CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE)
+#
+function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE)
+    if(NOT CUDA_GPU_DETECT_OUTPUT)
+        set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+        file(WRITE ${cufile} ""
+            "#include <cstdio>\n"
+            "int main()\n"
+            "{\n"
+            "  int count = 0;\n"
+            "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+            "  if (count == 0) return -1;\n"
+            "  for (int device = 0; device < count; ++device)\n"
+            "  {\n"
+            "    cudaDeviceProp prop;\n"
+            "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+            "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+            "  }\n"
+            "  return 0;\n"
+            "}\n")
+
+        execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}"
+                        "-ccbin" ${CMAKE_CXX_COMPILER}
+                        WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                        RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                        ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+        if(nvcc_res EQUAL 0)
+            # only keep the last line of nvcc_out
+            STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+            STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+            list(GET nvcc_out -1 nvcc_out)
+            string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+            set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
+        endif()
+    endif()
+
+    if(NOT CUDA_GPU_DETECT_OUTPUT)
+        message(STATUS "Automatic GPU detection failed. Building for common architectures.")
+        set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE)
+    else()
+        list(SORT CUDA_GPU_DETECT_OUTPUT)
+        set(CUDA_GPU_DETECT_OUTPUT ${CUDA_GPU_DETECT_OUTPUT}+PTX)
+        set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE)
+    endif()
+endfunction()
+
+function(FILTER_NOHALF_ARCH OUT_VARIABLE)
+    set(ARCH_LIST_IN "${ARGN}")
+    set(ARCH_VALID_HALF)
+    set(HALF_LOWEST 6.0)
+
+    foreach(arch_num ${ARCH_LIST_IN})
+        if ((arch_num GREATER HALF_LOWEST) OR (arch_num EQUAL HALF_LOWEST))
+            list(APPEND ARCH_VALID_HALF ${arch_num})
+        else()
+            message(STATUS "FILTER_NOHALF_ARCH: remove CUDA arch:${arch_num}!")
+        endif()
+    endforeach()
+    set(${OUT_VARIABLE} ${ARCH_VALID_HALF} PARENT_SCOPE)
+
+endfunction()
+
+################################################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list
+# Usage:
+#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable 
+#       option flags: ONE OF "All/Common/Auto" and HalfFilter or not, HalfFilter means filter-out
+#           arch which not support Half-precision
+#       Direct: direct list of CUDA compute archs, mutually exclusive with options
+#   example1: CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto)
+#   example2: CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto HalfFilter)
+#   example3: CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Direct 6.0 6.1 7.0+PTX)
+function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
+
+    set(CUDA_ARCH_LIST)
+    set(options All Auto Common HalfFilter)
+    set(oneValueArgs)
+    set(multiValueArgs Direct)
+    cmake_parse_arguments(P "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if(P_All)
+        set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES})
+    elseif(P_Common)
+        set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES})
+    elseif(P_Auto)
+        CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
+        message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
+        set(CUDA_ARCH_LIST ${CUDA_ARCH_LIST})
+    elseif(P_Direct)
+        set(CUDA_ARCH_LIST ${P_Direct})
+    endif()
+
+    if(NOT CUDA_ARCH_LIST)
+        CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST)
+        message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}")
+    endif()
+
+    set(cuda_arch_bin)
+    set(cuda_arch_ptx)
+    # Now process the list and look for names
+    string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")
+    list(REMOVE_DUPLICATES CUDA_ARCH_LIST)
+    foreach(arch_name ${CUDA_ARCH_LIST})
+        set(arch_bin)
+        set(add_ptx FALSE)
+        # Check to see if we are compiling PTX
+        if(arch_name MATCHES "(.*)\\+PTX$")
+            set(add_ptx TRUE)
+            set(arch_name ${CMAKE_MATCH_1})
+        endif()
+        if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
+            set(arch_bin ${CMAKE_MATCH_1})
+            set(arch_ptx ${arch_bin})
+        else()
+            # Look for it in our list of known architectures
+            if(${arch_name} STREQUAL "Kepler+Tegra")
+                set(arch_bin 3.2)
+            elseif(${arch_name} STREQUAL "Kepler+Tesla")
+                set(arch_bin 3.7)
+            elseif(${arch_name} STREQUAL "Kepler")
+                set(arch_bin 3.0 3.5)
+                set(arch_ptx 3.5)
+            elseif(${arch_name} STREQUAL "Maxwell+Tegra")
+                set(arch_bin 5.3)
+            elseif(${arch_name} STREQUAL "Maxwell")
+                set(arch_bin 5.0 5.2)
+                set(arch_ptx 5.2)
+            elseif(${arch_name} STREQUAL "Pascal")
+                set(arch_bin 6.0 6.1)
+                set(arch_ptx 6.1)
+            elseif(${arch_name} STREQUAL "Volta")
+                set(arch_bin 7.0)
+                set(arch_ptx 7.0)
+            elseif(${arch_name} STREQUAL "Turing")
+                set(arch_bin 7.5)
+                set(arch_ptx 7.5)
+            elseif(${arch_name} STREQUAL "Ampere")
+                set(arch_bin 8.0)
+                set(arch_ptx 8.0)
+            else()
+                message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
+            endif()
+        endif()
+        if(NOT arch_bin)
+            message(SEND_ERROR "arch_bin wasn't set for some reason")
+        endif()
+        list(APPEND cuda_arch_bin ${arch_bin})
+        if(add_ptx)
+            if (NOT arch_ptx)
+                set(arch_ptx ${arch_bin})
+            endif()
+            list(APPEND cuda_arch_ptx ${arch_ptx})
+        endif()
+    endforeach()
+
+    if(P_HalfFilter)
+        FILTER_NOHALF_ARCH(cuda_arch_bin ${cuda_arch_bin})
+        FILTER_NOHALF_ARCH(cuda_arch_ptx ${cuda_arch_ptx})
+    endif()
+
+    # remove dots and convert to lists
+    string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+    string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}")
+    string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+    string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+
+    if(cuda_arch_bin)
+        list(REMOVE_DUPLICATES cuda_arch_bin)
+    endif()
+    if(cuda_arch_ptx)
+        list(REMOVE_DUPLICATES cuda_arch_ptx)
+    endif()
+
+    set(nvcc_flags "")
+    set(nvcc_archs_readable "")
+
+    # Tell NVCC to add binaries for the specified GPUs
+    foreach(arch ${cuda_arch_bin})
+        if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+            # User explicitly specified ARCH for the concrete CODE
+            list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+            list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+        else()
+            # User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE
+            list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+            list(APPEND nvcc_archs_readable sm_${arch})
+        endif()
+    endforeach()
+
+    # Tell NVCC to add PTX intermediate code for the specified architectures
+    foreach(arch ${cuda_arch_ptx})
+        list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+        list(APPEND nvcc_archs_readable compute_${arch})
+    endforeach()
+
+    string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+    set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+    set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
diff --git a/src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake b/src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake
new file mode 100644
index 0000000000..aa8028098c
--- /dev/null
+++ b/src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake
@@ -0,0 +1,49 @@
+# CMake script to locate NCCL
+
+include(FindPackageHandleStandardArgs)
+
+find_path(NCCL_INCLUDE_DIR
+        NAMES nccl.h
+        HINTS ${NCCL_ROOT}/include
+        $ENV{NCCL_ROOT}/include
+        /usr/include
+        /usr/local/include
+        )
+
+find_library(NCCL_LIBRARIES
+        NAMES nccl
+        HINTS ${NCCL_ROOT}/lib
+        $ENV{NCCL_ROOT}/lib
+        /usr/lib
+        /usr/local/lib
+        )
+
+if (NCCL_INCLUDE_DIR)
+    file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NCCL_MAJOR * +([0-9]+)"
+            NCCL_VERSION_MAJOR "${NCCL_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NCCL_MAJOR * +([0-9]+)" "\\1"
+            NCCL_VERSION_MAJOR "${NCCL_VERSION_MAJOR}")
+    string(REGEX MATCH "define NCCL_MINOR * +([0-9]+)"
+            NCCL_VERSION_MINOR "${NCCL_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NCCL_MINOR * +([0-9]+)" "\\1"
+            NCCL_VERSION_MINOR "${NCCL_VERSION_MINOR}")
+    string(REGEX MATCH "define NCCL_PATCH * +([0-9]+)"
+            NCCL_VERSION_PATCH "${NCCL_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NCCL_PATCH * +([0-9]+)" "\\1"
+            NCCL_VERSION_PATCH "${NCCL_VERSION_PATCH}")
+endif (NCCL_INCLUDE_DIR)
+
+if(NOT NCCL_VERSION_MAJOR)
+    set(NCCL_FOUND FALSE)
+    set(NCCL_VERSION "???")
+else()
+    set(NCCL_FOUND TRUE)
+    set(NCCL_VERSION_STRING "${NCCL_VERSION_MAJOR}.${NCCL_VERSION_MINOR}.${NCCL_VERSION_PATCH}")
+endif()
+
+find_package_handle_standard_args(NCCL DEFAULT_MSG
+        NCCL_INCLUDE_DIR
+        NCCL_LIBRARIES)
+
+mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
diff --git a/src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp b/src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp
new file mode 100644
index 0000000000..6567d8c87a
--- /dev/null
+++ b/src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp
@@ -0,0 +1,255 @@
+// Copyright (c) 2023, DeepLink.
+#include <cuda_runtime_api.h>
+#include "../../device/rawdeviceapis.h"
+
+namespace dipu {
+DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::CUDA;
+
+// extern int patchCachingAllocator();
+
+namespace devapis {
+
+using cuda_deviceId = int;
+// =====================
+//  Device class related
+// =====================
+
+void initializeVendor() {
+    // patchCachingAllocator();
+}
+
+void finalizeVendor() {
+
+}
+
+
+deviceId_t current_device() {
+  cuda_deviceId devId_;
+  DIPU_CALLCUDA(::cudaGetDevice(&devId_))
+  return static_cast<deviceId_t>(devId_);
+}
+
+DIPUDeviceProperties getDeviceProperties(int32_t device_index) {
+  ::cudaDeviceProp device_prop;
+  DIPU_CALLCUDA(cudaGetDeviceProperties(&device_prop, device_index))
+
+  DIPUDeviceProperties prop;
+  prop.name = device_prop.name;
+  prop.totalGlobalMem = device_prop.totalGlobalMem;
+  prop.major = device_prop.major;
+  prop.minor = device_prop.minor;
+  prop.multiProcessorCount = device_prop.multiProcessorCount;
+  return prop;
+}
+
+// in cuda_runtime_api.h
+// set current device given device according to id
+void setDevice(deviceId_t devId) {
+    cuda_deviceId devId_ = static_cast<deviceId_t>(devId);
+    DIPU_CALLCUDA(::cudaSetDevice(devId_))
+}
+
+void resetDevice(deviceId_t devId) {
+    DIPU_CALLCUDA(::cudaDeviceReset())
+}
+
+void syncDevice() {
+    DIPU_CALLCUDA(::cudaDeviceSynchronize())
+}
+
+// check last launch succ or not, throw if fail
+void checkLastError() {
+    DIPU_CALLCUDA(::cudaGetLastError())
+}
+
+int getDeviceCount() {
+  int num = -1;
+  DIPU_CALLCUDA(::cudaGetDeviceCount(reinterpret_cast<int*>(&num)))
+  return num;
+}
+
+void getDriverVersion(int* version) {
+    DIPU_CALLCUDA(::cudaDriverGetVersion(version))
+}
+
+void getRuntimeVersion(int* version) {
+    DIPU_CALLCUDA(::cudaRuntimeGetVersion(version))
+}
+
+// =====================
+//  device stream related
+// =====================
+void createStream(deviceStream_t* stream, bool prior) {
+    if (prior) {
+        DIPU_CALLCUDA(::cudaStreamCreateWithPriority(stream, cudaStreamDefault, -1))
+    } else {
+        DIPU_CALLCUDA(::cudaStreamCreate(stream))
+    }
+}
+
+void destroyStream(deviceStream_t stream) {
+    DIPU_CALLCUDA(::cudaStreamDestroy(stream))
+}
+
+void destroyStream(deviceStream_t stream, deviceId_t devId) {
+    setDevice(devId);
+    destroyStream(stream);
+}
+
+void releaseStream() {
+    return;
+}
+
+bool streamNotNull(deviceStream_t stream) {
+    return (stream != nullptr && stream != cudaStreamLegacy && stream != cudaStreamPerThread);
+}
+
+void syncStream(deviceStream_t stream) {
+    DIPU_CALLCUDA(::cudaStreamSynchronize(stream));
+}
+
+void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) {
+    DIPU_CALLCUDA(::cudaStreamWaitEvent(stream, event, 0))
+}
+
+bool isStreamEmpty(deviceStream_t stream) {
+  auto err = cudaStreamQuery(stream);
+  if (err == ::cudaSuccess) {
+    return true;
+  }
+  return false;
+}
+
+
+// =====================
+//  device event related
+// =====================
+
+void createEvent(deviceEvent_t* event) {
+    DIPU_CALLCUDA(::cudaEventCreateWithFlags(event, cudaEventDisableTiming))
+}
+
+void destroyEvent(deviceEvent_t event) {
+    DIPU_CALLCUDA(::cudaEventDestroy(event))
+}
+
+void waitEvent(deviceEvent_t event) {
+    DIPU_CALLCUDA(::cudaEventSynchronize(event))
+}
+
+void recordEvent(deviceEvent_t event, deviceStream_t stream) {
+    DIPU_CALLCUDA(::cudaEventRecord(event, stream))
+}
+
+void eventElapsedTime(float* time, deviceEvent_t start, deviceEvent_t end) {
+    DIPU_CALLCUDA(cudaEventElapsedTime(time, start, end))
+}
+
+EventStatus getEventStatus(deviceEvent_t event) {
+    ::cudaError_t ret = ::cudaEventQuery(event);
+    if (ret == ::cudaSuccess) {
+        return devapis::EventStatus::READY;
+    } else if (ret == ::cudaErrorNotReady) {
+        ::cudaGetLastError(); /* reset internal error state*/
+        return devapis::EventStatus::PENDING;
+    } else {
+        TORCH_CHECK(false, "unexpected event status in getEventStatus, ret = ", ret);
+    }
+}
+
+// =====================
+//  mem related
+// =====================
+void mallocHost(void** p, size_t nbytes) {
+    DIPU_CALLCUDA(::cudaMallocHost(p, nbytes))
+}
+
+void freeHost(void* p) {
+    DIPU_CALLCUDA(::cudaFreeHost(p))
+}
+
+OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) {
+    ::cudaError_t r = ::cudaMalloc(p, nbytes);
+    if (r != ::cudaSuccess) {
+        if(throwExcepion) {
+            ::cudaGetLastError(); /* reset internal error state*/
+            TORCH_CHECK(false, "alloc failed in mallocDevice, ret = ", r, " size= ", nbytes);
+        }
+        else if(r == ::cudaErrorMemoryAllocation) {
+            return OpStatus::ERR_NOMEM;
+        }
+        else {
+            return OpStatus::ERR_UNKNOWN;
+        }
+    }
+    return OpStatus::SUCCESS;
+}
+
+void freeDevice(void* p) {
+    DIPU_CALLCUDA(::cudaFree(p))
+}
+
+bool isPinnedPtr(const void *p) {
+    ::cudaPointerAttributes attr;
+    DIPU_CALLCUDA(::cudaPointerGetAttributes(&attr, p))
+    return attr.type == cudaMemoryTypeHost;
+}
+
+void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) {
+    DIPU_CALLCUDA(::cudaMemsetAsync(ptr, val, size, stream))
+}
+
+void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) {
+    if (dstDevId == srcDevId) {
+        DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToDevice))
+    } else {
+        DIPU_CALLCUDA(::cudaMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes))
+    }
+}
+
+// (synchronous) copy from host to a CUDA device
+void memCopyH2D(size_t nbytes, void* dst, const void* src) {
+    DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyHostToDevice))
+}
+
+// (synchronous) copy from a CUDA device to host
+void memCopyD2H(size_t nbytes, void* dst, const void* src) {
+    DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToHost))
+}
+
+// (asynchronous) copy from device to a device
+void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes,
+        deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) {
+  if (dstDevId == srcDevId) {
+      DIPU_CALLCUDA(::cudaMemcpyAsync(
+          dst, src, nbytes, cudaMemcpyDeviceToDevice, stream))
+  } else {
+      DIPU_CALLCUDA(::cudaMemcpyPeerAsync(
+          dst, dstDevId, src, srcDevId, nbytes, stream))
+  }
+}
+
+// (asynchronous) copy from host to a device
+void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) {
+    DIPU_CALLCUDA(::cudaMemcpyAsync(
+            dst, src, nbytes, cudaMemcpyHostToDevice, stream))
+}
+
+// (asynchronous) copy from a device to host
+void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void* dst,  const void* src) {
+  DIPU_CALLCUDA(::cudaMemcpyAsync(
+    dst, src, nbytes, cudaMemcpyDeviceToHost, stream));
+}
+
+}   // end namespace devapis
+
+}  // namespace parrots
+
+
+
+
+
+
+
+
+
diff --git a/src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp b/src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp
new file mode 100644
index 0000000000..50a7b31fdd
--- /dev/null
+++ b/src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp
@@ -0,0 +1,146 @@
+#include <cstring>
+#include "../../device/rawdiclapis.h"
+
+namespace dipu {
+
+namespace devapis {
+  // NCCL op mapping
+  static std::map<DiclReduceOp, ncclRedOp_t> ncclOp = {
+    {DiclReduceOp::MIN, ncclMin},
+    {DiclReduceOp::MAX, ncclMax},
+    {DiclReduceOp::SUM, ncclSum},
+    {DiclReduceOp::PRODUCT, ncclProd},
+  #ifdef NCCL_HAS_AVG
+    {DiclReduceOp::AVG, ncclAvg},
+  #endif
+  };
+
+  static ncclRedOp_t getNcclOp(DiclReduceOp op) {
+    auto it = ncclOp.find(op);
+    if (it != myMap.end()) {
+        int value = it->second;
+    } else {
+        std::cout << "Key '" << key << "' not found." << std::endl;
+        exit();
+    }
+    return 0;
+  }
+
+  static std::map<DiclDataType, ncclDataType_t> ncclDataType = {
+    // {DiclDataType::TYPE_INVALID, },
+    {DiclDataType::TYPE_BOOL, ncclUint8},
+    {DiclDataType::TYPE_UINT8, ncclUint8},
+    // {DiclDataType::TYPE_UINT16, },
+    // {DiclDataType::TYPE_UINT32, },
+    // {DiclDataType::TYPE_UINT64, },
+    {DiclDataType::TYPE_INT8, ncclInt8},
+    // {DiclDataType::TYPE_INT16, },
+    {DiclDataType::TYPE_INT32, ncclInt32},
+    {DiclDataType::TYPE_INT64, ncclInt64},
+    {DiclDataType::TYPE_FP16, ncclHalf},
+    {DiclDataType::TYPE_FP32, ncclFloat},
+    {DiclDataType::TYPE_FP64, ncclDouble},
+    // {DiclDataType::TYPE_BYTES, },
+  #if HAS_NCCL_BF16_DATATYPE
+    {DiclDataType::TYPE_BF16, ncclBfloat16},
+  #endif
+  };
+
+// Macro to print and abort on a non-successful NCCL return value.
+#define NCCL_THROW(cmd)                            \
+  do {                                                   \
+    ncclResult_t result = cmd;                           \
+    if (result != ncclSuccess) {                         \
+      std::string err = ncclGetErrorString(result); \
+      fprintf(                                           \
+          stderr,                                        \
+          "NCCL error in: %s:%d, %s\n",                  \
+          __FILE__,                                      \
+          __LINE__,                                      \
+          err.c_str());                                  \
+      TORCH_CHECK(false, err);                           \
+    }                                                    \
+  } while (0)
+
+
+  const int DICL_UNIQUE_ID_BYTES_SIZE = NCCL_UNIQUE_ID_BYTES;
+
+  DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) {
+    ncclResult_t ncclAsyncErr_;
+    NCCL_THROW(ncclCommGetAsyncError(comm, &ncclAsyncErr_));
+    if (ncclAsyncErr_ != ncclSuccess) {
+      return DICL_SUCCESS;
+    } else {
+      return DICL_ERR_UNDEF;
+    }
+  }
+
+  DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) {
+    NCCL_THROW(ncclGetUniqueId(uniqueId));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId,
+                                          int rank, int localDeviceId) {
+    NCCL_THROW(ncclCommInitRank(comm, nranks, uniqueId, rank));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclCommDestroy(ncclComm_t comm) {
+    NCCL_THROW(ncclCommDestroy(comm));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawAllReduce(const void *sendbuff, void *recvbuff, size_t count, dipuDataType datatype,
+                              const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) {
+    NCCL_THROW(ncclAllReduce(sendbuff, recvbuff, count, ncclDataType[datatype], ncclOp[reduceOp], comm, stream));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawBroadcast(const void *sendbuff, void* recvbuff, size_t count, dipuDataType datatype,
+                              int root, diclComm_t comm, deviceStream_t stream) {
+    NCCL_THROW(ncclBroadcast(sendbuff, recvbuff, count, ncclDataType[datatype], root, comm, stream));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawAllGather(const void *sendBuf, void *recvBuf, size_t count, dipuDataType datatype,
+                              diclComm_t comm, deviceStream_t stream) {
+    NCCL_THROW(ncclAllGather(sendBuf, recvBuf, count, ncclDataType[datatype], comm, stream));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawReduce(const void* sendbuff, void* recvbuff, size_t count, dipuDataType datatype,
+                            const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) {
+    NCCL_THROW(ncclReduce(sendbuff, recvbuff, count, ncclDataType[datatype], ncclOp[reduceOp], root, comm, stream));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, dipuDataType dataType, 
+                                  const ReduceOp& op, diclComm_t comm, deviceStream_t stream) {
+    throw std::runtime_error("mlu Not implement diclReduceScatter");
+  }
+
+  DIPU_API diclResult_t diclRawSend(void* sendbuff, size_t count, dipuDataType datatype, int peer,
+                          diclComm_t comm, deviceStream_t stream){
+    NCCL_THROW(ncclSend(sendbuff, count, ncclDataType[datatype], peer, comm, stream));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawRecv(void* recvbuff, size_t count, dipuDataType datatype, int peer,
+                          diclComm_t comm, deviceStream_t stream) {
+    NCCL_THROW(ncclRecv(recvbuff, count, ncclDataType[datatype], peer, comm, stream));
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawGroupStart() {
+    NCCL_THROW(ncclGroupStart());
+    return DICL_SUCCESS;
+  }
+
+  DIPU_API diclResult_t diclRawGroupEnd() {
+    NCCL_THROW(ncclGroupEnd());
+    return DICL_SUCCESS;
+  }
+
+} // end namespace devapis
+} // end namespace dipu
diff --git a/src/turbomind/runtime/vendor/CUDA/vendorapi.h b/src/turbomind/runtime/vendor/CUDA/vendorapi.h
new file mode 100644
index 0000000000..87a214847d
--- /dev/null
+++ b/src/turbomind/runtime/vendor/CUDA/vendorapi.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2023, DeepLink.
+
+#pragma once
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nccl.h>
+
+namespace dipu {
+
+#define DIPU_CALLCUDA(Expr)                                                     \
+{                                                                               \
+    cudaError_t ret = Expr;                                                     \
+    TORCH_CHECK(ret == ::cudaSuccess, "call cuda error, expr = ", #Expr, ", ret = ", ret); \
+}
+
+using deviceStream_t = cudaStream_t;
+#define deviceDefaultStreamLiteral cudaStreamLegacy
+using deviceEvent_t = cudaEvent_t;
+
+using diclComm_t = ncclComm_t;
+using commUniqueId = ncclUniqueId;
+
+}
+
+
+
+
+
diff --git a/src/turbomind/utils/Tensor.cc b/src/turbomind/utils/Tensor.cc
index 1ae72bc5e4..38298a987b 100644
--- a/src/turbomind/utils/Tensor.cc
+++ b/src/turbomind/utils/Tensor.cc
@@ -153,9 +153,9 @@ Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where)
     size_t n_elems = fread(data_cpu, Tensor::getTypeSize(type), size, f_ptr);
     FT_CHECK_WITH_INFO(n_elems == size, "reading tensor failed");
     if (where == MEMORY_GPU) {
-        cudaMalloc(&data, size * Tensor::getTypeSize(type));
-        cudaMemcpy(data, data_cpu, size * Tensor::getTypeSize(type), cudaMemcpyHostToDevice);
-        free(data_cpu);
+        dipu::devapis::mallocDevice(&data, size * Tensor::getTypeSize(type));
+        dipu::devapis::memCopyH2D(size * Tensor::getTypeSize(type), data, data_cpu);
+        dipu::devapis::freeDevice(data_cpu);
     }
 
     fclose(f_ptr);
diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h
index 1ba191d211..b8dc6e319f 100644
--- a/src/turbomind/utils/allocator.h
+++ b/src/turbomind/utils/allocator.h
@@ -21,7 +21,9 @@
 
 #include "cuda_utils.h"
 #include "src/turbomind/macro.h"
+#ifndef DIOPI_ENABLE
 #include <cuda_runtime.h>
+#endif  // DIOPI_ENABLE
 #include <unordered_map>
 #include <vector>
 
@@ -70,8 +72,8 @@ class IAllocator {
 
     virtual void*        malloc(size_t size, const bool is_set_zero = true, bool is_host = false) = 0;
     virtual void         free(void** ptr, bool is_host = false) const                             = 0;
-    virtual void         setStream(cudaStream_t stream)                                           = 0;
-    virtual cudaStream_t returnStream()                                                           = 0;
+    virtual void         setStream(deviceStream_t stream)                                           = 0;
+    virtual deviceStream_t returnStream()                                                           = 0;
     virtual void         memSet(void* ptr, const int val, const size_t size)                      = 0;
 
     template<typename T>
@@ -132,7 +134,7 @@ class Allocator<AllocatorType::CUDA>: public IAllocator {
     };
 
     const int                                                 device_id_;
-    cudaStream_t                                              stream_ = 0;  // initialize as default stream
+    deviceStream_t                                            stream_ = 0;  // initialize as default stream
     std::unordered_map<void*, std::pair<size_t, MemoryType>>* pointer_mapping_;
 
     bool isExist(void* address) const
@@ -201,12 +203,12 @@ class Allocator<AllocatorType::CUDA>: public IAllocator {
         delete pointer_mapping_;
     }
 
-    void setStream(cudaStream_t stream)
+    void setStream(deviceStream_t stream)
     {
         stream_ = stream;
     }
 
-    cudaStream_t returnStream()
+    deviceStream_t returnStream()
     {
         return stream_;
     };
@@ -222,17 +224,13 @@ class Allocator<AllocatorType::CUDA>: public IAllocator {
 
         check_cuda_error(getSetDevice(device_id_, &o_device));
         if (is_host) {
-            check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32));
+            check_cuda_error(dipu::devproxy::mallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32));
         }
         else {
-#if defined(CUDA_MEMORY_POOL_DISABLED)
-            check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32));
-#else
-            check_cuda_error(cudaMallocAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, stream_));
-#endif
+            check_cuda_error(dipu::devproxy::mallocDevice(&ptr, (size_t)(ceil(size / 32.)) * 32));
         }
         if (is_set_zero) {
-            check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
+            check_cuda_error(dipu::devproxy::memSetAsync(stream_, ptr, 0, (size_t)(ceil(size / 32.)) * 32));
         }
         check_cuda_error(getSetDevice(o_device));
         TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
@@ -253,15 +251,10 @@ class Allocator<AllocatorType::CUDA>: public IAllocator {
                 TM_LOG_DEBUG("Free buffer %p", address);
                 check_cuda_error(getSetDevice(device_id_, &o_device));
                 if (is_host) {
-                    check_cuda_error(cudaFreeHost(*ptr));
+                    check_cuda_error(dipu::devproxy::freeHost(*ptr));
                 }
                 else {
-#if defined(CUDA_MEMORY_POOL_DISABLED)
-                    check_cuda_error(cudaFree(*ptr));
-#else
-                    check_cuda_error(cudaFreeAsync(*ptr, stream_));
-                    cudaStreamSynchronize(stream_);
-#endif
+                    check_cuda_error(dipu::devproxy::freeDevice(*ptr));
                 }
                 check_cuda_error(getSetDevice(o_device));
                 pointer_mapping_->erase(address);
@@ -276,7 +269,7 @@ class Allocator<AllocatorType::CUDA>: public IAllocator {
 
     void memSet(void* ptr, const int val, const size_t size)
     {
-        check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_));
+        check_cuda_error(dipu::devproxy::memSetAsync(stream_, ptr, val, size));
     }
 };
 
diff --git a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc
index 45fa06a6d5..442abb7dfb 100644
--- a/src/turbomind/utils/cuda_utils.cc
+++ b/src/turbomind/utils/cuda_utils.cc
@@ -21,7 +21,7 @@
 namespace turbomind {
 
 /* **************************** debug tools ********************************* */
-
+#ifndef DIOPI_ENABLE
 template<typename T>
 void print_to_file(const T* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode)
 {
@@ -331,40 +331,31 @@ template void check_abs_mean_val(const half* result, const int size);
 #ifdef ENABLE_BF16
 template void check_abs_mean_val(const __nv_bfloat16* result, const int size);
 #endif
+#endif  // DIOPI_ENABLE
 
 /* ***************************** common utils ****************************** */
 
-cudaError_t getSetDevice(int i_device, int* o_device)
+void getSetDevice(int i_device, int* o_device)
 {
     int         current_dev_id = 0;
     cudaError_t err            = cudaSuccess;
 
     if (o_device != NULL) {
-        err = cudaGetDevice(&current_dev_id);
-        if (err != cudaSuccess) {
-            return err;
-        }
+        current_dev_id = dipu::devapis::current_device();
         if (current_dev_id == i_device) {
             *o_device = i_device;
         }
         else {
-            err = cudaSetDevice(i_device);
-            if (err != cudaSuccess) {
-                return err;
-            }
+            err = dipu::devapis::setDevice(i_device);
             *o_device = current_dev_id;
         }
     }
     else {
-        err = cudaSetDevice(i_device);
-        if (err != cudaSuccess) {
-            return err;
-        }
+        dipu::devapis::setDevice(i_device);
     }
-
-    return cudaSuccess;
 }
 
+#ifndef DIOPI_ENABLE
 FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
 {
     FtCudaDataType model_file_type;
@@ -391,6 +382,7 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
     }
     return model_file_type;
 }
+#endif  // DIOPI_ENABLE
 
 /* ************************** end of common utils ************************** */
 }  // namespace turbomind
diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h
index be0b85d69a..1950a5840a 100644
--- a/src/turbomind/utils/cuda_utils.h
+++ b/src/turbomind/utils/cuda_utils.h
@@ -116,6 +116,7 @@ static const char* _cudaGetErrorEnum(cublasStatus_t error)
     return "<unknown>";
 }
 
+#ifndef DIOPI_ENABLE
 template<typename T>
 void check(T result, char const* const func, const char* const file, int const line)
 {
@@ -126,7 +127,19 @@ void check(T result, char const* const func, const char* const file, int const l
 }
 
 #define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
-#define check_cuda_error_2(val, file, line) check((val), #val, file, line)
+#else
+#define check_cuda_error(val)                                                              \
+{                                                                                          \            
+    try {                                                                                  \
+        (val);                                                                             \
+        dipu::devapis::checkLastError();                                                   \
+    }catch(const std::exception& ex) {                                                     \
+        throw std::runtime_error(std::string("[TM][ERROR] DEVICE runtime error: ") + (ex.what()) + " " \
+                                 + __FILE__ + ":" + std::to_string(__LINE__) + " \n");                 \
+    }                                                                                      \
+}                                                                                          \
+#endif  // DIOPI_ENABLE
+// #define check_cuda_error_2(val, file, line) check((val), #val, file, line)
 
 inline void syncAndCheck(const char* const file, int const line)
 {
@@ -135,28 +148,47 @@ inline void syncAndCheck(const char* const file, int const line)
     if (level_name != nullptr) {
         static std::string level = std::string(level_name);
         if (level == "DEBUG") {
-            cudaDeviceSynchronize();
+            dipu::devapis::syncDevice();
+#ifndef DIOPI_ENABLE
             cudaError_t result = cudaGetLastError();
             if (result) {
                 throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result))
                                          + " " + file + ":" + std::to_string(line) + " \n");
             }
+#else
+            try {
+                dipu::devapis::checkLastError();
+            }catch(const std::exception& ex) {
+                throw std::runtime_error(std::string("[TM][ERROR] DEVICE runtime error: ") + (ex.what())
+                                         + " " + file + ":" + std::to_string(line) + " \n");
+            }
+#endif  // DIOPI_ENABLE
             TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line));
         }
     }
 
 #ifndef NDEBUG
-    cudaDeviceSynchronize();
+    dipu::devapis::syncDevice();
+#ifndef DIOPI_ENABLE
     cudaError_t result = cudaGetLastError();
     if (result) {
         throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " "
                                  + file + ":" + std::to_string(line) + " \n");
     }
+#else
+    try {
+        dipu::devapis::checkLastError();
+    }catch(const std::exception& ex) {
+        throw std::runtime_error(std::string("[TM][ERROR] DEVICE runtime error: ") + (ex.what()) + " "
+                                 + file + ":" + std::to_string(line) + " \n")
+    }
+#endif  // DIOPI_ENABLE
 #endif
 }
 
 #define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__)
 
+#ifndef DIOPI_ENABLE
 #define checkCUDNN(expression)                                                                                         \
     {                                                                                                                  \
         cudnnStatus_t status = (expression);                                                                           \
@@ -166,6 +198,7 @@ inline void syncAndCheck(const char* const file, int const line)
             std::exit(EXIT_FAILURE);                                                                                   \
         }                                                                                                              \
     }
+#endif  // DIOPI_ENABLE
 
 template<typename T>
 void print_to_file(const T*           result,
@@ -235,50 +268,53 @@ inline void myAssert(bool result, const char* const file, int const line, std::s
 #endif
 
 /*************Time Handling**************/
-class CudaTimer {
-private:
-    cudaEvent_t  event_start_;
-    cudaEvent_t  event_stop_;
-    cudaStream_t stream_;
-
-public:
-    explicit CudaTimer(cudaStream_t stream = 0)
-    {
-        stream_ = stream;
-    }
-    void start()
-    {
-        check_cuda_error(cudaEventCreate(&event_start_));
-        check_cuda_error(cudaEventCreate(&event_stop_));
-        check_cuda_error(cudaEventRecord(event_start_, stream_));
-    }
-    float stop()
-    {
-        float time;
-        check_cuda_error(cudaEventRecord(event_stop_, stream_));
-        check_cuda_error(cudaEventSynchronize(event_stop_));
-        check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_));
-        check_cuda_error(cudaEventDestroy(event_start_));
-        check_cuda_error(cudaEventDestroy(event_stop_));
-        return time;
-    }
-    ~CudaTimer() {}
-};
+// class CudaTimer {
+// private:
+//     cudaEvent_t  event_start_;
+//     cudaEvent_t  event_stop_;
+//     cudaStream_t stream_;
+
+// public:
+//     explicit CudaTimer(cudaStream_t stream = 0)
+//     {
+//         stream_ = stream;
+//     }
+//     void start()
+//     {
+//         check_cuda_error(cudaEventCreate(&event_start_));
+//         check_cuda_error(cudaEventCreate(&event_stop_));
+//         check_cuda_error(cudaEventRecord(event_start_, stream_));
+//     }
+//     float stop()
+//     {
+//         float time;
+//         check_cuda_error(cudaEventRecord(event_stop_, stream_));
+//         check_cuda_error(cudaEventSynchronize(event_stop_));
+//         check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_));
+//         check_cuda_error(cudaEventDestroy(event_start_));
+//         check_cuda_error(cudaEventDestroy(event_stop_));
+//         return time;
+//     }
+//     ~CudaTimer() {}
+// };
 
 /* ***************************** common utils ****************************** */
 
 inline void print_mem_usage(std::string time = "after allocation")
 {
+#ifndef DIOPI_ENABLE
     size_t free_bytes, total_bytes;
     check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes));
     float free  = static_cast<float>(free_bytes) / 1024.0 / 1024.0 / 1024.0;
     float total = static_cast<float>(total_bytes) / 1024.0 / 1024.0 / 1024.0;
     float used  = total - free;
     printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used);
+#endif  // DIOPI_ENABLE
 }
 
 inline int getSMVersion()
 {
+#ifndef DIOPI_ENABLE
     int device{-1};
     check_cuda_error(cudaGetDevice(&device));
     int sm_major = 0;
@@ -286,25 +322,28 @@ inline int getSMVersion()
     check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
     check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
     return sm_major * 10 + sm_minor;
+#endif  // DIOPI_ENABLE
 }
 
 inline int getMaxSharedMemoryPerBlock()
 {
+#ifndef DIOPI_ENABLE
     int device{-1};
     check_cuda_error(cudaGetDevice(&device));
     int max_shared_memory_size = 0;
     check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, cudaDevAttrMaxSharedMemoryPerBlock, device));
     return max_shared_memory_size;
+#endif  // DIOPI_ENABLE
 }
 
-inline std::string getDeviceName()
-{
-    int device{-1};
-    check_cuda_error(cudaGetDevice(&device));
-    cudaDeviceProp props;
-    check_cuda_error(cudaGetDeviceProperties(&props, device));
-    return std::string(props.name);
-}
+// inline std::string getDeviceName()
+// {
+//     int device{-1};
+//     check_cuda_error(cudaGetDevice(&device));
+//     cudaDeviceProp props;
+//     check_cuda_error(cudaGetDeviceProperties(&props, device));
+//     return std::string(props.name);
+// }
 
 inline int div_up(int a, int n)
 {
@@ -315,18 +354,16 @@ cudaError_t getSetDevice(int i_device, int* o_device = NULL);
 
 inline int getDevice()
 {
-    int current_dev_id = 0;
-    check_cuda_error(cudaGetDevice(&current_dev_id));
-    return current_dev_id;
+    return dipu::devapis::current_device();
 }
 
 inline int getDeviceCount()
 {
-    int count = 0;
-    check_cuda_error(cudaGetDeviceCount(&count));
+    int count = dipu::devapis::getDeviceCount();
     return count;
 }
 
+// suggest: move to CublasWrapper
 template<typename T>
 CublasDataType getCublasDataType()
 {
@@ -347,6 +384,7 @@ CublasDataType getCublasDataType()
     }
 }
 
+// suggest: move to CublasWrapper
 template<typename T>
 cudaDataType_t getCudaDataType()
 {
@@ -389,7 +427,9 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name);
 // clang-format off
 template<typename T> struct packed_type;
 template <>          struct packed_type<float>         { using type = float; }; // we don't need to pack float by default
+#ifndef DIOPI_ENABLE
 template <>          struct packed_type<half>          { using type = half2; };
+#endif  // DIOPI_ENABLE
 
 #ifdef ENABLE_BF16
 template<>
@@ -400,15 +440,18 @@ struct packed_type<__nv_bfloat16> {
 
 template<typename T> struct num_elems;
 template <>          struct num_elems<float>           { static constexpr int value = 1; };
+#ifndef DIOPI_ENABLE
 template <>          struct num_elems<float2>          { static constexpr int value = 2; };
 template <>          struct num_elems<float4>          { static constexpr int value = 4; };
 template <>          struct num_elems<half>            { static constexpr int value = 1; };
 template <>          struct num_elems<half2>           { static constexpr int value = 2; };
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template <>          struct num_elems<__nv_bfloat16>   { static constexpr int value = 1; };
 template <>          struct num_elems<__nv_bfloat162>  { static constexpr int value = 2; };
 #endif
 
+#ifndef DIOPI_ENABLE
 template<typename T, int num> struct packed_as;
 template<typename T>          struct packed_as<T, 1>              { using type = T; };
 template<>                    struct packed_as<half,  2>          { using type = half2; };
@@ -482,6 +525,7 @@ void compareTwoTensor(
     delete[] h_pred;
     delete[] h_ref;
 }
+#endif  // DIOPI_ENABLE
 
 /* ************************** end of common utils ************************** */
 }  // namespace turbomind
diff --git a/src/turbomind/utils/logger.cc b/src/turbomind/utils/logger.cc
index 9788a8fad7..22a01af35a 100644
--- a/src/turbomind/utils/logger.cc
+++ b/src/turbomind/utils/logger.cc
@@ -15,8 +15,11 @@
  */
 
 #include "src/turbomind/utils/logger.h"
+#ifndef DIOPI_ENABLE
 #include <cuda_runtime.h>
-
+#else
+#include "../runtime/device/rawdeviceapis.h"
+#endif  // DIOPI_ENABLE
 namespace turbomind {
 
 Logger::Logger()
@@ -26,7 +29,7 @@ Logger::Logger()
         (is_first_rank_only_char != nullptr && std::string(is_first_rank_only_char) == "ON") ? true : false;
 
     int device_id;
-    cudaGetDevice(&device_id);
+    device_id = dipu::current_device();
 
     char* level_name = std::getenv("TM_LOG_LEVEL");
     if (level_name != nullptr) {
diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu
index 93547f364f..99a476fce2 100644
--- a/src/turbomind/utils/memory_utils.cu
+++ b/src/turbomind/utils/memory_utils.cu
@@ -16,10 +16,8 @@
 
 #include "src/turbomind/macro.h"
 #include "src/turbomind/utils/Tensor.h"
-#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include <curand_kernel.h>
 #include <sys/stat.h>
 #include <unordered_map>
 
@@ -29,18 +27,22 @@ template<typename T>
 void deviceMalloc(T** ptr, size_t size, bool is_random_initialize)
 {
     FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid.");
-    check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size));
+    check_cuda_error(dipu::devapis::mallocDevice((void**)(ptr), sizeof(T) * size));
+#ifndef DIOPI_ENABLE
     if (is_random_initialize) {
         cudaRandomUniform(*ptr, size);
     }
+#endif  // DIOPI_ENABLE
 }
 
 template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize);
+#ifndef DIOPI_ENABLE
 template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize);
 #ifdef ENABLE_BF16
 template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize);
 #endif
 template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize);
+#endif  // DIOPI_ENABLE
 template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize);
 template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize);
 template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize);
@@ -52,11 +54,16 @@ template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_init
 template<typename T>
 void deviceMemSetZero(T* ptr, size_t size)
 {
-    check_cuda_error(cudaMemset(static_cast<void*>(ptr), 0, sizeof(T) * size));
+    T* arr = new T[size];
+    std::fill(arr, arr + size, 0);
+    check_cuda_error(dipu::devapis::memCopyH2D(sizeof(T) * size, ptr, arr));
+    delete[] arr;
 }
 
 template void deviceMemSetZero(float* ptr, size_t size);
+#ifndef DIOPI_ENABLE
 template void deviceMemSetZero(half* ptr, size_t size);
+#endif  // DIOPI_ENABLE
 template void deviceMemSetZero(int* ptr, size_t size);
 template void deviceMemSetZero(uint32_t* ptr, size_t size);
 template void deviceMemSetZero(bool* ptr, size_t size);
@@ -71,13 +78,15 @@ template<typename T>
 void deviceFree(T*& ptr)
 {
     if (ptr != NULL) {
-        check_cuda_error(cudaFree(ptr));
+        check_cuda_error(dipu::devapis::freeDevice(ptr));
         ptr = NULL;
     }
 }
 
 template void deviceFree(float*& ptr);
+#ifndef DIOPI_ENABLE
 template void deviceFree(half*& ptr);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void deviceFree(__nv_bfloat16*& ptr);
 #endif
@@ -91,16 +100,18 @@ template void deviceFree(__nv_fp8_e4m3*& ptr);
 #endif
 
 template<typename T>
-void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream)
+void deviceFill(T* devptr, size_t size, T value, dipu::devapis::deviceStream_t stream)
 {
     T* arr = new T[size];
     std::fill(arr, arr + size, value);
-    check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream));
+    check_cuda_error(dipu::devapis::memCopyH2DAsync(sizeof(T) * size, devptr, arr));
     delete[] arr;
 }
 
-template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream);
+template void deviceFill(float* devptr, size_t size, float value, dipu::devapis::deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream);
 #endif
@@ -110,11 +121,13 @@ template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t str
 template<typename T>
 void cudaD2Hcpy(T* tgt, const T* src, const size_t size)
 {
-    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost));
+    check_cuda_error(devapis::memCopyD2H(sizeof(T) * size, tgt, src);
 }
 
 template void cudaD2Hcpy(float* tgt, const float* src, size_t size);
+#ifndef DIOPI_ENABLE
 template void cudaD2Hcpy(half* tgt, const half* src, size_t size);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void cudaD2Hcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size);
 #endif
@@ -133,11 +146,13 @@ void cudaH2Dcpy(T* tgt, const T* src, const size_t size)
     if (tgt == nullptr || src == nullptr) {
         TM_LOG_ERROR("cudaH2Dcpy: dst=%p src=%p, size=%d", tgt, src, (int)(sizeof(T) * size));
     }
-    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice));
+    check_cuda_error(dipu::devapis::memCopyH2D(sizeof(T) * size, tgt, src));
 }
 
 template void cudaH2Dcpy(float* tgt, const float* src, size_t size);
+#ifndef DIOPI_ENABLE
 template void cudaH2Dcpy(half* tgt, const half* src, size_t size);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void cudaH2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size);
 #endif
@@ -153,11 +168,13 @@ template void cudaH2Dcpy(int8_t* tgt, const int8_t* src, size_t size);
 template<typename T>
 void cudaD2Dcpy(T* tgt, const T* src, const size_t size)
 {
-    check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice));
+    check_cuda_error(dipu::devapis::memCopyD2D(sizeof(T) * size, devapis::current_device(), tgt, devapis::current_device(), src));
 }
 
 template void cudaD2Dcpy(float* tgt, const float* src, size_t size);
+#ifndef DIOPI_ENABLE
 template void cudaD2Dcpy(half* tgt, const half* src, size_t size);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void cudaD2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size);
 #endif
@@ -169,6 +186,7 @@ template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t si
 #endif
 template void cudaD2Dcpy(unsigned long long* tgt, const unsigned long long* src, size_t size);
 
+#ifndef DIOPI_ENABLE
 template<typename T_OUT, typename T_IN>
 __global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size)
 {
@@ -200,18 +218,77 @@ template void
 invokeCudaCast(__nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream);
 template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream);
 #endif
+#endif  // DIOPI_ENABLE
+
+template<typename T>
+void cudaH2DCpy(T* tgt, const T* src, const size_t size, dipu::devapis::deviceStream_t stream)
+{
+    if (stream != NULL) {
+        check_cuda_error(dipu::devapis::memCopyH2DAsync(stream, sizeof(T) * size, tgt, src));
+    }
+    else {
+        check_cuda_error(dipu::devapis::memCopyH2D(sizeof(T) * size, tgt, src));
+    }
+}
+
+template<typename T>
+void cudaD2DCpy(T* tgt, const T* src, const size_t size, dipu::devapis::deviceStream_t stream)
+{
+    if (stream != NULL) {
+        check_cuda_error(dipu::devapis::memCopyD2DAsync(stream, sizeof(T) * size, dipu::devapis::current_device(), tgt, dipu::devapis::current_device(), src));
+    }
+    else {
+        check_cuda_error(dipu::devapis::memCopyD2D(sizeof(T) * size, dipu::devapis::current_device(), tgt, dipu::devapis::current_device(), src));
+    }
+}
 
 template<typename T>
-void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream)
+void cudaD2HCpy(T* tgt, const T* src, const size_t size, dipu::devapis::deviceStream_t stream)
 {
     if (stream != NULL) {
-        check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream));
+        check_cuda_error(dipu::devapis::memCopyD2HAsync(stream, sizeof(T) * size, tgt, src));
     }
     else {
-        check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault));
+        check_cuda_error(dipu::devapis::memCopyD2H(sizeof(T) * size, tgt, src));
     }
 }
 
+template void cudaH2DCpy(float* tgt, const float* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(float* tgt, const float* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(float* tgt, const float* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(int* tgt, const int* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(int* tgt, const int* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(int* tgt, const int* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(bool* tgt, const bool* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(bool* tgt, const bool* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(bool* tgt, const bool* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(int8_t* tgt, const int8_t* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(int8_t* tgt, const int8_t* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(int8_t* tgt, const int8_t* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(uint* tgt, const uint* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(uint* tgt, const uint* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(uint* tgt, const uint* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(unsigned long long* tgt, const unsigned long long* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(unsigned long long* tgt, const unsigned long long* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(unsigned long long* tgt, const unsigned long long* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(char* tgt, const char* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(char* tgt, const char* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(char* tgt, const char* src, const size_t size, dipu::devapis::deviceStream_t stream);
+
+template void cudaH2DCpy(float const** tgt, float const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(float const** tgt, float const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(float const** tgt, float const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(bool const** tgt, bool const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(bool const** tgt, bool const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(bool const** tgt, bool const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(int8_t const** tgt, int8_t const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(int8_t const** tgt, int8_t const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(int8_t const** tgt, int8_t const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaH2DCpy(unsigned long long const** tgt, unsigned long long const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2DCpy(unsigned long long const** tgt, unsigned long long const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+template void cudaD2HCpy(unsigned long long const** tgt, unsigned long long const* const* src, const size_t size, dipu::devapis::deviceStream_t stream);
+
+#ifndef DIOPI_ENABLE
 template void cudaAutoCpy(float* tgt, const float* src, size_t size, cudaStream_t stream);
 template void cudaAutoCpy(half* tgt, const half* src, size_t size, cudaStream_t stream);
 #ifdef ENABLE_BF16
@@ -234,7 +311,9 @@ template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size,
 template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream);
 template void
 cudaAutoCpy(unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream);
+#endif  // DIOPI_ENABLE
 
+#ifndef DIOPI_ENABLE
 template<typename T>
 __global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, const int seq_offset)
 {
@@ -298,6 +377,7 @@ template void cudaRandomUniform(char* buffer, const size_t size);
 #ifdef ENABLE_FP8
 template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
 #endif
+#endif  // DIOPI_ENABLE
 
 // loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or
 // the product of the elements in shape is 0, this function will return an empty vector.
@@ -459,15 +539,17 @@ int loadWeightFromBinFunc(T*                        ptr,
         return 0;
     }
 
-    if (std::is_same<T, T_IN>::value == true) {
+    if (std::is_same<T, T_IN>::value == true) {// now <float, float> only
         cudaH2Dcpy(ptr, (T*)host_array.data(), host_array.size());
     }
     else {
+#ifndef DIOPI_ENABLE
         T_IN* ptr_2 = nullptr;
         deviceMalloc(&ptr_2, host_array.size(), false);
         cudaH2Dcpy(ptr_2, host_array.data(), host_array.size());
         invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size());
         deviceFree(ptr_2);
+#endif  // DIOPI_ENABLE
     }
     return 0;
 }
@@ -476,6 +558,7 @@ template int loadWeightFromBinFunc<float, float>(float*                    ptr,
                                                  std::vector<size_t>       shape,
                                                  std::string               filename,
                                                  std::vector<ConcateSlice> slices);
+#ifndef DIOPI_ENABLE
 template int loadWeightFromBinFunc<half, float>(half*                     ptr,
                                                 std::vector<size_t>       shape,
                                                 std::string               filename,
@@ -488,6 +571,7 @@ template int loadWeightFromBinFunc<half, half>(half*                     ptr,
                                                std::vector<size_t>       shape,
                                                std::string               filename,
                                                std::vector<ConcateSlice> slices);
+#endif  // DIOPI_ENABLE
 template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t*                   ptr,
                                                    std::vector<size_t>       shape,
                                                    std::string               filename,
@@ -536,6 +620,7 @@ int loadWeightFromBin(T*                        ptr,
         case FtCudaDataType::FP32:
             loadWeightFromBinFunc<T, float>(ptr, shape, filename, slices);
             break;
+#ifndef DIOPI_ENABLE
         case FtCudaDataType::FP16:
             loadWeightFromBinFunc<T, half>(ptr, shape, filename, slices);
             break;
@@ -552,6 +637,7 @@ int loadWeightFromBin(T*                        ptr,
             loadWeightFromBinFunc<T, float>(ptr, shape, filename, slices);
             break;
 #endif
+#endif  // DIOPI_ENABLE
         default:
             TM_LOG_ERROR("Does not support FtCudaDataType=%d", model_file_type);
             FT_CHECK(false);
@@ -559,6 +645,7 @@ int loadWeightFromBin(T*                        ptr,
     return 0;
 }
 
+#ifndef DIOPI_ENABLE
 template<>
 int loadWeightFromBin(int*                      ptr,
                       std::vector<size_t>       shape,
@@ -569,12 +656,14 @@ int loadWeightFromBin(int*                      ptr,
     loadWeightFromBinFunc<int, int>(ptr, shape, filename, slices);
     return 0;
 }
+#endif  // DIOPI_ENABLE
 
 template int loadWeightFromBin(float*                    ptr,
                                std::vector<size_t>       shape,
                                std::string               filename,
                                FtCudaDataType            model_file_type,
                                std::vector<ConcateSlice> slices);
+#ifndef DIOPI_ENABLE
 template int loadWeightFromBin(half*                     ptr,
                                std::vector<size_t>       shape,
                                std::string               filename,
@@ -599,12 +688,14 @@ template int loadWeightFromBin(__nv_fp8_e4m3*            ptr,
                                FtCudaDataType            model_file_type,
                                std::vector<ConcateSlice> slices);
 #endif
+#endif  // DIOPI_ENABLE
 template int loadWeightFromBin(int*                      ptr,
                                std::vector<size_t>       shape,
                                std::string               filename,
                                FtCudaDataType            model_file_type,
                                std::vector<ConcateSlice> slices);
 
+#ifndef DIOPI_ENABLE
 template<typename T_IN, typename T_OUT>
 __global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size)
 {
@@ -637,324 +728,325 @@ template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, const int* src, const
 template void invokeCudaD2DcpyConvert(float* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t stream);
 template void invokeCudaD2DcpyConvert(int* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t stream);
 #endif  // ENABLE_BF16
-
-template<typename T_IN, typename T_OUT>
-__global__ void
-cudaD2DScaleCpyConvert(T_OUT* dst, const T_IN* src, const float* scale, bool invert_scale, const size_t size)
-{
-    const float scale_value = invert_scale ? 1.0f / scale[0] : scale[0];
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
-        dst[tid] = cuda_cast<T_OUT>(cuda_cast<float>(src[tid]) * scale_value);
-    }
-}
-
-template<typename T_IN, typename T_OUT>
-void invokeCudaD2DScaleCpyConvert(
-    T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream)
-{
-    cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size);
-}
+#endif  // DIOPI_ENABLE
+
+// template<typename T_IN, typename T_OUT>
+// __global__ void
+// cudaD2DScaleCpyConvert(T_OUT* dst, const T_IN* src, const float* scale, bool invert_scale, const size_t size)
+// {
+//     const float scale_value = invert_scale ? 1.0f / scale[0] : scale[0];
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
+//         dst[tid] = cuda_cast<T_OUT>(cuda_cast<float>(src[tid]) * scale_value);
+//     }
+// }
+
+// template<typename T_IN, typename T_OUT>
+// void invokeCudaD2DScaleCpyConvert(
+//     T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream)
+// {
+//     cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size);
+// }
 
 // clang-format off
-template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-#endif  // ENABLE_BF16
-#ifdef ENABLE_FP8
-template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
-#endif  // ENABLE_FP8
+// template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// #ifdef ENABLE_BF16
+// template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// #endif  // ENABLE_BF16
+// #ifdef ENABLE_FP8
+// template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream);
+// #endif  // ENABLE_FP8
 // clang-format on
 
-void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream)
-{
-    invokeCudaD2DcpyConvert(dst, src, size, stream);
-}
-
-void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream)
-{
-    invokeCudaD2DcpyConvert(dst, src, size, stream);
-}
-
-template<typename T>
-void saveToBinary(const T* ptr, const size_t size, std::string filename)
-{
-
-    std::vector<T> h_ptr(size);
-    cudaD2Hcpy(h_ptr.data(), ptr, size);
-    std::vector<float> float_ptr(size);
-    for (size_t i = 0; i < size; i++) {
-        float_ptr[i] = (float)h_ptr[i];
-    }
-
-    std::ofstream out(filename, std::ios::out | std::ios::binary);
-    FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
-
-    out.write((char*)float_ptr.data(), size * sizeof(float));
-}
-
-template void saveToBinary(const float* ptr, const size_t size, std::string filename);
-template void saveToBinary(const half* ptr, const size_t size, std::string filename);
-#ifdef ENABLE_BF16
-template void saveToBinary(const __nv_bfloat16* ptr, const size_t size, std::string filename);
-#endif  // ENABLE_BF16
-
-template<>
-void saveToBinary(const int* ptr, const size_t size, std::string filename)
-{
-    std::vector<int> h_ptr(size);
-    cudaD2Hcpy(h_ptr.data(), ptr, size);
-    std::ofstream out(filename, std::ios::out | std::ios::binary);
-    FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
-    out.write((char*)h_ptr.data(), size * sizeof(int));
-}
-
-template<typename T_IN, typename T_fake_type>
-__global__ void fakeCast(T_IN* input_ptr, const size_t size)
-{
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
-        T_fake_type tmp_val = (T_fake_type)((float)input_ptr[i]);
-        input_ptr[i]        = (T_IN)((float)tmp_val);
-    }
-}
-
-template<typename T_IN, typename T_fake_type>
-void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream)
-{
-    dim3 block(256);
-    dim3 grid((size + 255) / 256);
-    fakeCast<T_IN, T_fake_type><<<grid, block, 0, stream>>>(input_ptr, size);
-}
-
-#ifdef ENABLE_FP8
-__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
-        dst[tid] = (float)(src[tid]);
-    }
-}
-
-void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
-        dst[tid] = (half)((float)(src[tid]));
-    }
-}
-
-void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
-        dst[tid] = (__nv_fp8_e4m3)src[tid];
-    }
-}
-
-void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
-        dst[tid] = (__nv_fp8_e4m3)src[tid];
-    }
-}
-
-void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
-        dst[tid] = (__nv_fp8_e4m3)src[tid];
-    }
-}
-
-void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream)
-{
-    cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
-}
-
-#endif  // ENABLE_FP8
-
-template<typename T_OUT, typename T_IN>
-__global__ void transpose(T_OUT* dst, T_IN* src, const int dim0, const int dim1)
-{
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) {
-        const int src_col_id                = tid % dim1;
-        const int src_row_id                = tid / dim1;
-        dst[src_col_id * dim0 + src_row_id] = (T_OUT)(src[tid]);
-    }
-}
-
-template<typename T>
-void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    cudaD2Dcpy(workspace, data, dim0 * dim1);
-    transpose<<<256, 256>>>(data, workspace, dim0, dim1);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1);
-#endif  // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose(__nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1);
-#endif  // ENABLE_BF16
-template void invokeInPlaceTranspose(float* data, float* workspace, const int dim0, const int dim1);
-
-template<typename T_OUT, typename T_IN>
-__global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2, const int dim3)
-{
-    // src permutation: [0, 1, 2, 3]
-    // dst permutation: [0, 2, 1, 3]
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
-         tid += blockDim.x * gridDim.x) {
-        int       tmp_idx   = tid;
-        const int dim_3_idx = tmp_idx % dim3;
-        tmp_idx             = (tmp_idx - dim_3_idx) / dim3;
-        const int dim_2_idx = tmp_idx % dim2;
-        tmp_idx             = (tmp_idx - dim_2_idx) / dim2;
-        const int dim_1_idx = tmp_idx % dim1;
-        tmp_idx             = (tmp_idx - dim_1_idx) / dim1;
-        const int dim_0_idx = tmp_idx % dim0;
-        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
-    }
-}
-
-template<typename T>
-void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    // Note that this kernel is used for pre-processing and not very efficient.
-    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
-    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose0213(
-    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-#endif  // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-#endif  // ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-
-template<typename T_OUT, typename T_IN>
-__global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2)
-{
-    // src permutation: [0, 1, 2]
-    // dst permutation: [1, 0, 2]
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x) {
-        int       tmp_idx                                           = tid;
-        const int dim_2_idx                                         = tmp_idx % dim2;
-        tmp_idx                                                     = (tmp_idx - dim_2_idx) / dim2;
-        const int dim_1_idx                                         = tmp_idx % dim1;
-        tmp_idx                                                     = (tmp_idx - dim_1_idx) / dim1;
-        const int dim_0_idx                                         = tmp_idx % dim0;
-        dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid];
-    }
-}
-
-template<typename T>
-void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    // Note that this kernel is used for pre-processing and not very efficient.
-    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2);
-    transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose102(
-    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2);
-#endif  // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose102(
-    __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2);
-#endif  // ENABLE_BF16
-template void invokeInPlaceTranspose102(float* data, float* workspace, const int dim0, const int dim1, const int dim2);
-
-template<typename T>
-void __global__ multiplyScale(T* tensor, float scale, const size_t size)
-{
-    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) {
-        tensor[index] = (T)(((float)tensor[index]) * scale);
-    }
-}
-
-template<typename T>
-void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
-{
-    int block = 256;
-    int grid  = (size + 255) / 256;
-    multiplyScale<<<grid, block, 0, stream>>>(tensor, scale, size);
-}
-
-template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
-template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_FP8
-template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-
-template<typename T>
-void __global__ divideScale(T* tensor, float scale, const size_t size)
-{
-    for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) {
-        tensor[index] = (T)(((float)tensor[index]) / scale);
-    }
-}
-
-template<typename T>
-void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
-{
-    int block = 256;
-    int grid  = (size + 255) / 256;
-    divideScale<<<grid, block, 0, stream>>>(tensor, scale, size);
-}
-
-template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
-template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_BF16
-template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_FP8
-template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
-#endif
-#ifdef ENABLE_BF16
-template void invokeFakeCast<float, __nv_bfloat16>(float* input_ptr, const size_t size, cudaStream_t stream);
-template void
-invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<half, __nv_bfloat16>(half* input_ptr, const size_t size, cudaStream_t stream);
-#endif
-template void invokeFakeCast<float, half>(float* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<float, float>(float* input_ptr, const size_t size, cudaStream_t stream);
-#ifdef ENABLE_FP8
-template void invokeFakeCast<float, __nv_fp8_e4m3>(float* input_ptr, const size_t size, cudaStream_t stream);
-template void invokeFakeCast<half, __nv_fp8_e4m3>(half* input_ptr, const size_t size, cudaStream_t stream);
-template void
-invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
-#endif
+// void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream)
+// {
+//     invokeCudaD2DcpyConvert(dst, src, size, stream);
+// }
+
+// void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream)
+// {
+//     invokeCudaD2DcpyConvert(dst, src, size, stream);
+// }
+
+// template<typename T>
+// void saveToBinary(const T* ptr, const size_t size, std::string filename)
+// {
+
+//     std::vector<T> h_ptr(size);
+//     cudaD2Hcpy(h_ptr.data(), ptr, size);
+//     std::vector<float> float_ptr(size);
+//     for (size_t i = 0; i < size; i++) {
+//         float_ptr[i] = (float)h_ptr[i];
+//     }
+
+//     std::ofstream out(filename, std::ios::out | std::ios::binary);
+//     FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
+
+//     out.write((char*)float_ptr.data(), size * sizeof(float));
+// }
+
+// template void saveToBinary(const float* ptr, const size_t size, std::string filename);
+// template void saveToBinary(const half* ptr, const size_t size, std::string filename);
+// #ifdef ENABLE_BF16
+// template void saveToBinary(const __nv_bfloat16* ptr, const size_t size, std::string filename);
+// #endif  // ENABLE_BF16
+
+// template<>
+// void saveToBinary(const int* ptr, const size_t size, std::string filename)
+// {
+//     std::vector<int> h_ptr(size);
+//     cudaD2Hcpy(h_ptr.data(), ptr, size);
+//     std::ofstream out(filename, std::ios::out | std::ios::binary);
+//     FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename);
+//     out.write((char*)h_ptr.data(), size * sizeof(int));
+// }
+
+// template<typename T_IN, typename T_fake_type>
+// __global__ void fakeCast(T_IN* input_ptr, const size_t size)
+// {
+//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+//         T_fake_type tmp_val = (T_fake_type)((float)input_ptr[i]);
+//         input_ptr[i]        = (T_IN)((float)tmp_val);
+//     }
+// }
+
+// template<typename T_IN, typename T_fake_type>
+// void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream)
+// {
+//     dim3 block(256);
+//     dim3 grid((size + 255) / 256);
+//     fakeCast<T_IN, T_fake_type><<<grid, block, 0, stream>>>(input_ptr, size);
+// }
+
+// #ifdef ENABLE_FP8
+// __global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size)
+// {
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
+//         dst[tid] = (float)(src[tid]);
+//     }
+// }
+
+// void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
+// {
+//     cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size);
+// }
+
+// __global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size)
+// {
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
+//         dst[tid] = (half)((float)(src[tid]));
+//     }
+// }
+
+// void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream)
+// {
+//     cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size);
+// }
+
+// __global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size)
+// {
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
+//         dst[tid] = (__nv_fp8_e4m3)src[tid];
+//     }
+// }
+
+// void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream)
+// {
+//     cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
+// }
+
+// __global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size)
+// {
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
+//         dst[tid] = (__nv_fp8_e4m3)src[tid];
+//     }
+// }
+
+// void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream)
+// {
+//     cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size);
+// }
+
+// __global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size)
+// {
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
+//         dst[tid] = (__nv_fp8_e4m3)src[tid];
+//     }
+// }
+
+// void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream)
+// {
+//     cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size);
+// }
+
+// #endif  // ENABLE_FP8
+
+// template<typename T_OUT, typename T_IN>
+// __global__ void transpose(T_OUT* dst, T_IN* src, const int dim0, const int dim1)
+// {
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) {
+//         const int src_col_id                = tid % dim1;
+//         const int src_row_id                = tid / dim1;
+//         dst[src_col_id * dim0 + src_row_id] = (T_OUT)(src[tid]);
+//     }
+// }
+
+// template<typename T>
+// void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1)
+// {
+//     // copy data to workspace, and then transpose from workspace to data
+//     cudaD2Dcpy(workspace, data, dim0 * dim1);
+//     transpose<<<256, 256>>>(data, workspace, dim0, dim1);
+// }
+
+// #ifdef ENABLE_FP8
+// template void invokeInPlaceTranspose(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1);
+// #endif  // ENABLE_FP8
+// #ifdef ENABLE_BF16
+// template void invokeInPlaceTranspose(__nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1);
+// #endif  // ENABLE_BF16
+// template void invokeInPlaceTranspose(float* data, float* workspace, const int dim0, const int dim1);
+
+// template<typename T_OUT, typename T_IN>
+// __global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2, const int dim3)
+// {
+//     // src permutation: [0, 1, 2, 3]
+//     // dst permutation: [0, 2, 1, 3]
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
+//          tid += blockDim.x * gridDim.x) {
+//         int       tmp_idx   = tid;
+//         const int dim_3_idx = tmp_idx % dim3;
+//         tmp_idx             = (tmp_idx - dim_3_idx) / dim3;
+//         const int dim_2_idx = tmp_idx % dim2;
+//         tmp_idx             = (tmp_idx - dim_2_idx) / dim2;
+//         const int dim_1_idx = tmp_idx % dim1;
+//         tmp_idx             = (tmp_idx - dim_1_idx) / dim1;
+//         const int dim_0_idx = tmp_idx % dim0;
+//         dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
+//     }
+// }
+
+// template<typename T>
+// void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3)
+// {
+//     // copy data to workspace, and then transpose from workspace to data
+//     // Note that this kernel is used for pre-processing and not very efficient.
+//     cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
+//     transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
+// }
+
+// #ifdef ENABLE_FP8
+// template void invokeInPlaceTranspose0213(
+//     __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
+// #endif  // ENABLE_FP8
+// #ifdef ENABLE_BF16
+// template void invokeInPlaceTranspose0213(
+//     __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
+// #endif  // ENABLE_BF16
+// template void invokeInPlaceTranspose0213(
+//     float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
+
+// template<typename T_OUT, typename T_IN>
+// __global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2)
+// {
+//     // src permutation: [0, 1, 2]
+//     // dst permutation: [1, 0, 2]
+//     for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x) {
+//         int       tmp_idx                                           = tid;
+//         const int dim_2_idx                                         = tmp_idx % dim2;
+//         tmp_idx                                                     = (tmp_idx - dim_2_idx) / dim2;
+//         const int dim_1_idx                                         = tmp_idx % dim1;
+//         tmp_idx                                                     = (tmp_idx - dim_1_idx) / dim1;
+//         const int dim_0_idx                                         = tmp_idx % dim0;
+//         dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid];
+//     }
+// }
+
+// template<typename T>
+// void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2)
+// {
+//     // copy data to workspace, and then transpose from workspace to data
+//     // Note that this kernel is used for pre-processing and not very efficient.
+//     cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2);
+//     transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2);
+// }
+
+// #ifdef ENABLE_FP8
+// template void invokeInPlaceTranspose102(
+//     __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2);
+// #endif  // ENABLE_FP8
+// #ifdef ENABLE_BF16
+// template void invokeInPlaceTranspose102(
+//     __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2);
+// #endif  // ENABLE_BF16
+// template void invokeInPlaceTranspose102(float* data, float* workspace, const int dim0, const int dim1, const int dim2);
+
+// template<typename T>
+// void __global__ multiplyScale(T* tensor, float scale, const size_t size)
+// {
+//     for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) {
+//         tensor[index] = (T)(((float)tensor[index]) * scale);
+//     }
+// }
+
+// template<typename T>
+// void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
+// {
+//     int block = 256;
+//     int grid  = (size + 255) / 256;
+//     multiplyScale<<<grid, block, 0, stream>>>(tensor, scale, size);
+// }
+
+// template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
+// template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
+// #ifdef ENABLE_BF16
+// template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
+// #endif
+// #ifdef ENABLE_FP8
+// template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
+// #endif
+
+// template<typename T>
+// void __global__ divideScale(T* tensor, float scale, const size_t size)
+// {
+//     for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) {
+//         tensor[index] = (T)(((float)tensor[index]) / scale);
+//     }
+// }
+
+// template<typename T>
+// void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream)
+// {
+//     int block = 256;
+//     int grid  = (size + 255) / 256;
+//     divideScale<<<grid, block, 0, stream>>>(tensor, scale, size);
+// }
+
+// template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream);
+// template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream);
+// #ifdef ENABLE_BF16
+// template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream);
+// #endif
+// #ifdef ENABLE_FP8
+// template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream);
+// #endif
+// #ifdef ENABLE_BF16
+// template void invokeFakeCast<float, __nv_bfloat16>(float* input_ptr, const size_t size, cudaStream_t stream);
+// template void
+// invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
+// template void invokeFakeCast<half, __nv_bfloat16>(half* input_ptr, const size_t size, cudaStream_t stream);
+// #endif
+// template void invokeFakeCast<float, half>(float* input_ptr, const size_t size, cudaStream_t stream);
+// template void invokeFakeCast<float, float>(float* input_ptr, const size_t size, cudaStream_t stream);
+// #ifdef ENABLE_FP8
+// template void invokeFakeCast<float, __nv_fp8_e4m3>(float* input_ptr, const size_t size, cudaStream_t stream);
+// template void invokeFakeCast<half, __nv_fp8_e4m3>(half* input_ptr, const size_t size, cudaStream_t stream);
+// template void
+// invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream);
+// #endif
 
 size_t cuda_datatype_size(FtCudaDataType dt)
 {
@@ -969,32 +1061,32 @@ size_t cuda_datatype_size(FtCudaDataType dt)
     return sizes.at(dt);
 }
 
-template<typename T>
-__global__ void check_range(T* buffer, size_t size, T min, T max, bool* d_within_range)
-{
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
-        const T val = buffer[i];
-        if (val < min || val > max) {
-            *d_within_range = false;
-        }
-    }
-}
-
-template<typename T>
-bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream)
-{
-    cudaMemsetAsync(d_within_range, true, sizeof(bool), stream);
-
-    dim3 block(256);
-    dim3 grid((size + 255) / 256);
-    check_range<T><<<grid, block, 0, stream>>>(buffer, size, min, max, d_within_range);
-
-    bool result;
-    cudaD2Hcpy(&result, d_within_range, 1);
-    return result;
-}
-
-template bool
-invokeCheckRange<int>(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);
+// template<typename T>
+// __global__ void check_range(T* buffer, size_t size, T min, T max, bool* d_within_range)
+// {
+//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
+//         const T val = buffer[i];
+//         if (val < min || val > max) {
+//             *d_within_range = false;
+//         }
+//     }
+// }
+
+// template<typename T>
+// bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream)
+// {
+//     cudaMemsetAsync(d_within_range, true, sizeof(bool), stream);
+
+//     dim3 block(256);
+//     dim3 grid((size + 255) / 256);
+//     check_range<T><<<grid, block, 0, stream>>>(buffer, size, min, max, d_within_range);
+
+//     bool result;
+//     cudaD2Hcpy(&result, d_within_range, 1);
+//     return result;
+// }
+
+// template bool
+// invokeCheckRange<int>(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);
 
 }  // namespace turbomind
diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h
index e51c903905..24cdae18aa 100644
--- a/src/turbomind/utils/memory_utils.h
+++ b/src/turbomind/utils/memory_utils.h
@@ -17,8 +17,6 @@
 #pragma once
 
 #include "src/turbomind/utils/Tensor.h"
-#include "src/turbomind/utils/cuda_fp8_utils.h"
-#include "src/turbomind/utils/cuda_utils.h"
 
 namespace turbomind {
 
@@ -71,8 +69,10 @@ std::vector<float> loadArrayFromBin(std::vector<size_t>       shape,
 //                                               std::string         filename,
 //                                               FtCudaDataType      model_file_type = FtCudaDataType::FP32);
 
+#ifndef DIOPI_ENABLE
 void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream);
 void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_FP8
 void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream);
 void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream);
@@ -84,51 +84,55 @@ void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const si
 void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream);
 #endif  // ENABLE_BF16
 
+#ifndef DIOPI_ENABLE
 template<typename T_OUT, typename T_IN>
 void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream);
+#endif  // DIOPI_ENABLE
+
+// template<typename T, size_t n_dims>
+// __inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims])
+// {
+//     size_t flat_idx = 0;
+//     for (size_t i = 0; i < n_dims; i++) {
+//         flat_idx += idx[i];
+//         if (i + 1 < n_dims)
+//             flat_idx *= dims[i + 1];
+//     }
+//     return flat_idx;
+// }
+
+// template<typename T1, size_t n_dims, typename T2>
+// __inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims])
+// {
+//     for (int i = n_dims - 1; i >= 0; i--) {
+//         idx[i] = flat_idx % dims[i];
+//         flat_idx /= dims[i];
+//     }
+// }
 
-template<typename T, size_t n_dims>
-__inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims])
-{
-    size_t flat_idx = 0;
-    for (size_t i = 0; i < n_dims; i++) {
-        flat_idx += idx[i];
-        if (i + 1 < n_dims)
-            flat_idx *= dims[i + 1];
-    }
-    return flat_idx;
-}
-
-template<typename T1, size_t n_dims, typename T2>
-__inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims])
-{
-    for (int i = n_dims - 1; i >= 0; i--) {
-        idx[i] = flat_idx % dims[i];
-        flat_idx /= dims[i];
-    }
-}
-
-template<typename T>
-void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1);
+// template<typename T>
+// void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1);
 
-template<typename T>
-void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
+// template<typename T>
+// void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
 
-template<typename T>
-void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2);
+// template<typename T>
+// void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2);
 
-template<typename T>
-void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream);
+// template<typename T>
+// void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream);
 
-template<typename T>
-void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream);
+// template<typename T>
+// void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream);
 
+#ifndef DIOPI_ENABLE
 template<typename T_IN, typename T_OUT>
 void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream = 0);
+#endif  // DIOPI_ENABLE
 
-template<typename T_IN, typename T_OUT>
-void invokeCudaD2DScaleCpyConvert(
-    T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0);
+// template<typename T_IN, typename T_OUT>
+// void invokeCudaD2DScaleCpyConvert(
+//     T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0);
 
 inline bool checkIfFileExist(const std::string& file_path)
 {
@@ -143,12 +147,12 @@ inline bool checkIfFileExist(const std::string& file_path)
 template<typename T>
 void saveToBinary(const T* ptr, const size_t size, std::string filename);
 
-template<typename T_IN, typename T_fake_type>
-void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream);
+// template<typename T_IN, typename T_fake_type>
+// void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream);
 
 size_t cuda_datatype_size(FtCudaDataType dt);
 
-template<typename T>
-bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream);
+// template<typename T>
+// bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream);
 
 }  // namespace turbomind
diff --git a/src/turbomind/utils/nccl_utils.cc b/src/turbomind/utils/nccl_utils.cc
index bd669ac227..1e0b8b57f8 100644
--- a/src/turbomind/utils/nccl_utils.cc
+++ b/src/turbomind/utils/nccl_utils.cc
@@ -21,6 +21,7 @@
 namespace turbomind {
 
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
 template<typename T>
 ncclDataType_t getNcclDataType()
 {
@@ -52,129 +53,172 @@ ncclDataType_t getNcclDataType()
     }
     return nccl_data_type;
 }
+#endif // DIOPI_ENABLE
 #endif
 
 template<typename T>
-void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream)
+void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream)
 {
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     ncclDataType_t nccl_data_type = getNcclDataType<T>();
     NCCLCHECK(ncclGroupStart());
     NCCLCHECK(ncclAllReduce(
         (const void*)send_buf, (void*)recv_buf, data_size, nccl_data_type, ncclSum, nccl_param.nccl_comm_, stream));
     NCCLCHECK(ncclGroupEnd());
+#else
+    dipu::devapis::diclRawGroupStart();
+    dipu::devapis::diclRawAllReduce(
+        (const void*)send_buf, (void*)recv_buf, data_size, dipu::devapis::getDiclDataType<T>(), dipu::devapis::DiclReduceOp::SUM, nccl_param.nccl_comm_, stream
+    )
+    dipu::devapis::diclRawGroupEnd();
+#endif // DIOPI_ENABLE
 #endif
+
     TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
 template<typename T>
 void ftNcclAllGather(
-    const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, cudaStream_t stream)
+    const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, deviceStream_t stream)
 {
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     ncclDataType_t nccl_data_type = getNcclDataType<T>();
     NCCLCHECK(ncclGroupStart());
     NCCLCHECK(
         ncclAllGather(send_buf + rank * data_size, recv_buf, data_size, nccl_data_type, nccl_param.nccl_comm_, stream));
     NCCLCHECK(ncclGroupEnd());
+#else
+    dipu::devapis::diclRawGroupStart();
+    dipu::devapis::diclRawAllGather(
+        (const void*)send_buf, (void*)recv_buf, data_size, dipu::devapis::getDiclDataType<T>(), dipu::devapis::DiclReduceOp::SUM, nccl_param.nccl_comm_, stream
+    )
+    dipu::devapis::diclRawGroupEnd();
+#endif // DIOPI_ENABLE
 #endif
     TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
 template<typename T>
-void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream)
+void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream)
 {
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     ncclDataType_t nccl_data_type = getNcclDataType<T>();
     NCCLCHECK(ncclSend(send_buf, data_size, nccl_data_type, peer, nccl_param.nccl_comm_, stream));
+#else
+    dipu::devapis::diclRawSend(send_buf, data_size, dipu::devapis::getDiclDataType<T>(), peer, nccl_param.nccl_comm_, stream);
+#endif // DIOPI_ENABLE
 #endif
     TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
 template void
-ftNcclSend(const float* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclSend(const float* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void
-ftNcclSend(const half* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclSend(const half* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void ftNcclSend(
-    const __nv_bfloat16* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+    const __nv_bfloat16* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
 #endif
 template void
-ftNcclSend(const int* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclSend(const int* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
 template void
-ftNcclSend(const bool* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclSend(const bool* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void
-ftNcclSend(const char* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclSend(const char* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 
 template<typename T>
-void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream)
+void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream)
 {
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     ncclDataType_t nccl_data_type = getNcclDataType<T>();
     NCCLCHECK(ncclRecv(recv_buf, data_size, nccl_data_type, peer, nccl_param.nccl_comm_, stream));
+#else
+    dipu::devapis::diclRawRecv(recv_buf, data_size, dipu::devapis::getDiclDataType<T>(), peer, nccl_param.nccl_comm_, stream);
+#endif  // DIOPI_ENABLE
 #endif
     TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
 template void
-ftNcclRecv(float* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclRecv(float* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void
-ftNcclRecv(half* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclRecv(half* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void
-ftNcclRecv(__nv_bfloat16* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclRecv(__nv_bfloat16* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
 #endif
-template void ftNcclRecv(int* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+template void ftNcclRecv(int* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
 template void
-ftNcclRecv(bool* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclRecv(bool* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void
-ftNcclRecv(char* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+ftNcclRecv(char* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 
 template<typename T>
-void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream)
+void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream)
 {
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     ncclDataType_t nccl_data_type = getNcclDataType<T>();
     NCCLCHECK(ncclBcast(buff, data_size, nccl_data_type, root, nccl_param.nccl_comm_, stream));
+#else
+    dipu::devapis::getDiclDataType<T>(), root, nccl_param.nccl_comm_, stream);
+#endif  // DIOPI_ENABLE
 #endif
     TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
+#ifndef DIOPI_ENABLE
 template void
-ftNcclBroadCast(char* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+ftNcclBroadCast(char* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 template void
-ftNcclBroadCast(bool* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+ftNcclBroadCast(bool* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
 template void
-ftNcclBroadCast(int* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+ftNcclBroadCast(int* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
 template void
-ftNcclBroadCast(float* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+ftNcclBroadCast(float* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void
-ftNcclBroadCast(half* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+ftNcclBroadCast(half* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void
-ftNcclBroadCast(__nv_bfloat16* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+ftNcclBroadCast(__nv_bfloat16* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
 #endif
 
 template void ftNcclAllReduceSum(
-    const float* send_buf, float* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream);
-
+    const float* send_buf, float* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void ftNcclAllReduceSum(
-    const half* send_buf, half* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream);
-
+    const half* send_buf, half* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 template void ftNcclAllReduceSum(
-    const int32_t* send_buf, int32_t* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream);
+    const int32_t* send_buf, int32_t* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream);
 
 #ifdef ENABLE_BF16
 template void ftNcclAllReduceSum(const __nv_bfloat16* send_buf,
                                  __nv_bfloat16*       recv_buf,
                                  const int            data_size,
                                  NcclParam            nccl_param,
-                                 cudaStream_t         stream);
+                                 deviceStream_t         stream);
 #endif
 
 template void ftNcclAllGather(const float* send_buf,
@@ -182,35 +226,43 @@ template void ftNcclAllGather(const float* send_buf,
                               const int    data_size,
                               const int    rank,
                               NcclParam    nccl_param,
-                              cudaStream_t stream);
-
+                              deviceStream_t stream);
+#ifndef DIOPI_ENABLE
 template void ftNcclAllGather(const half*  send_buf,
                               half*        recv_buf,
                               const int    data_size,
                               const int    rank,
                               NcclParam    nccl_param,
-                              cudaStream_t stream);
-
+                              deviceStream_t stream);
+#endif  // DIOPI_ENABLE
 #ifdef ENABLE_BF16
 template void ftNcclAllGather(const __nv_bfloat16* send_buf,
                               __nv_bfloat16*       recv_buf,
                               const int            data_size,
                               const int            rank,
                               NcclParam            nccl_param,
-                              cudaStream_t         stream);
+                              deviceStream_t         stream);
 #endif
 
 void ftNcclGroupStart()
 {
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     NCCLCHECK(ncclGroupStart());
+#else
+    dipu::devapis::diclRawGroupStart()
+#endif  // DIOPI_ENABLE
 #endif
 }
 
 void ftNcclGroupEnd()
 {
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     NCCLCHECK(ncclGroupEnd());
+#else
+    dipu::devapis::diclRawGroupEnd()
+#endif  // DIOPI_ENABLE
 #endif
 }
 
@@ -277,7 +329,11 @@ void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para, cud
 void ftNcclGetUniqueId(NcclUid& uid)
 {
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
     NCCLCHECK(ncclGetUniqueId(&uid.nccl_uid_));
+#else
+    dipu::devapis::diclGetUniqueId(&uid.nccl_uid_);
+#endif  // DIOPI_ENABLE
 #endif
 }
 
@@ -293,7 +349,11 @@ void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size,
     param.rank_       = rank;
     param.world_size_ = world_size;
     param.nccl_uid_   = uid.nccl_uid_;
+#ifndef DIOPI_ENABLE
     NCCLCHECK(ncclCommInitRank(&param.nccl_comm_, param.world_size_, param.nccl_uid_, param.rank_));
+#else
+    dipu::devapis::diclCommInitRank(&param.nccl_comm_, param.world_size_, param.nccl_uid_, param.rank_);
+#endif  // DIOPI_ENABLE
 #endif
     TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
@@ -302,7 +362,11 @@ void ftNcclParamDestroy(NcclParam& param)
 {
 #ifdef BUILD_MULTI_GPU
     if (param.nccl_comm_ != nullptr) {
+#ifndef DIOPI_ENABLE
         ncclCommDestroy(param.nccl_comm_);
+#else
+        dipu::devapis::diclCommDestroy(param.nccl_comm_);
+#endif 
     }
 #endif
 }
diff --git a/src/turbomind/utils/nccl_utils.h b/src/turbomind/utils/nccl_utils.h
index 9827297c5a..6d796212db 100644
--- a/src/turbomind/utils/nccl_utils.h
+++ b/src/turbomind/utils/nccl_utils.h
@@ -19,20 +19,27 @@
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/logger.h"
 
-#include <cuda_runtime.h>
 #ifdef BUILD_MULTI_GPU
+#ifndef DIOPI_ENABLE
 #include <nccl.h>
+#else
+#include "rawdiclapis.h"
+#endif  // DIOPI_ENABLE
 #endif
 #include <stdio.h>
 #include <string>
 
+#ifndef DIOPI_ENABLE
+#include <cuda_runtime.h>
 #if defined(NCCL_VERSION_CODE) && (NCCL_VERSION_CODE >= 21003)
 #define ENABLE_BF16_NCCL
 #endif
+#endif  // DIOPI_ENABLE
 
 namespace turbomind {
 #ifdef BUILD_MULTI_GPU
-#define NCCLCHECK(cmd)                                                                                                 \
+#define NCCLCHECK(cmd)
+#ifndef DIOPI_ENABLE                                                                                                 \
     do {                                                                                                               \
         ncclResult_t r = cmd;                                                                                          \
         if (r != ncclSuccess) {                                                                                        \
@@ -41,6 +48,7 @@ namespace turbomind {
             exit(EXIT_FAILURE);                                                                                        \
         }                                                                                                              \
     } while (0)
+#endif  // DIOPI_ENABLE
 #else
 #define NCCLCHECK(cmd) printf("[WARNING} No NCCL");
 #endif
@@ -50,7 +58,7 @@ struct NcclUid {
     NcclUid(){};
     NcclUid(NcclUid const& uid){};
 #else
-    ncclUniqueId nccl_uid_;
+    dipu::commUniqueId nccl_uid_;
     NcclUid(){};
     NcclUid(NcclUid const& uid): nccl_uid_(uid.nccl_uid_){};
 #endif
@@ -61,8 +69,8 @@ struct NcclParam {
     int world_size_{1};
     int group_id_{0};
 #ifdef BUILD_MULTI_GPU
-    ncclUniqueId nccl_uid_{};
-    ncclComm_t   nccl_comm_ = nullptr;
+    dipu::commUniqueId nccl_uid_{};
+    dipu::diclComm_t   nccl_comm_ = nullptr;
 #endif
 
 #ifdef BUILD_MULTI_GPU
@@ -92,28 +100,28 @@ struct NcclParam {
 
 // New APIs
 template<typename T>
-void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream);
+void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream);
 
 template<typename T>
 void ftNcclAllGather(
-    const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, cudaStream_t stream);
+    const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, deviceStream_t stream);
 
 template<typename T>
-void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream);
+void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream);
 
 template<typename T>
-void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
 
 template<typename T>
-void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream);
+void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream);
 
 // nccl stream synchronize, abort nccl comms and throw errors when nccl async errors detected
-void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para_, cudaStream_t stream);
+void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para_, deviceStream_t stream);
 
 void ftNcclGroupStart();
 void ftNcclGroupEnd();
 void ftNcclGetUniqueId(NcclUid& uid);
-void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, const NcclUid uid);
+void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, const commUniqueId uid);
 void ftNcclParamDestroy(NcclParam& param);
 
 int ftNcclNextGroupId();