From bf96f71ebbee2a2490ad268fe815efbe960aa953 Mon Sep 17 00:00:00 2001 From: SHshenhao Date: Mon, 16 Oct 2023 15:55:40 +0800 Subject: [PATCH] add_dp_rt --- .../runtime/core/DIPIRawGeneratorImpl.cpp | 93 ++ .../runtime/core/DIPURawGeneratorImpl.h | 37 + src/turbomind/runtime/device/basedef.h | 88 ++ src/turbomind/runtime/device/rawdeviceapis.h | 107 +++ src/turbomind/runtime/device/rawdiclapis.h | 84 ++ .../runtime/diopirt/diopirt_impl.cpp | 96 +++ src/turbomind/runtime/diopirt/diopirt_impl.h | 65 ++ src/turbomind/runtime/rthelper.h | 5 + .../vendor/CUDA/cmake/CUDAComputeArch.cmake | 266 ++++++ .../runtime/vendor/CUDA/cmake/FindNCCL.cmake | 49 ++ .../runtime/vendor/CUDA/deviceimpl.cpp | 255 ++++++ .../vendor/CUDA/rawcommuniatorimpl.cpp | 146 ++++ src/turbomind/runtime/vendor/CUDA/vendorapi.h | 28 + src/turbomind/utils/Tensor.cc | 6 +- src/turbomind/utils/allocator.h | 33 +- src/turbomind/utils/cuda_utils.cc | 24 +- src/turbomind/utils/cuda_utils.h | 134 ++- src/turbomind/utils/logger.cc | 7 +- src/turbomind/utils/memory_utils.cu | 804 ++++++++++-------- src/turbomind/utils/memory_utils.h | 84 +- src/turbomind/utils/nccl_utils.cc | 132 ++- src/turbomind/utils/nccl_utils.h | 32 +- 22 files changed, 2047 insertions(+), 528 deletions(-) create mode 100644 src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp create mode 100644 src/turbomind/runtime/core/DIPURawGeneratorImpl.h create mode 100644 src/turbomind/runtime/device/basedef.h create mode 100644 src/turbomind/runtime/device/rawdeviceapis.h create mode 100644 src/turbomind/runtime/device/rawdiclapis.h create mode 100644 src/turbomind/runtime/diopirt/diopirt_impl.cpp create mode 100644 src/turbomind/runtime/diopirt/diopirt_impl.h create mode 100644 src/turbomind/runtime/rthelper.h create mode 100644 src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake create mode 100644 src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake create mode 100644 src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp create mode 100644 src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp create mode 100644 src/turbomind/runtime/vendor/CUDA/vendorapi.h diff --git a/src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp b/src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp new file mode 100644 index 0000000000..f11571b3f4 --- /dev/null +++ b/src/turbomind/runtime/core/DIPIRawGeneratorImpl.cpp @@ -0,0 +1,93 @@ +// Copyright (c) 2023, DeepLink. +#include "DIPURawGeneratorImpl.h" +#include "../device/rawdeviceapis.h> + +namespace dipu { +/** + * DIPURawGeneratorImpl class implementation + */ +DIPURawGeneratorImpl::DIPURawGeneratorImpl() :state_need_reset_(true) { + seed_ = 0; +} + +/** + * Private clone method implementation + * + * See Note [Acquire lock when using random generators] + */ +std::shared_ptr DIPURawGeneratorImpl::clone() const { + auto gen = new DIPURawGeneratorImpl(); + gen->set_current_seed(this->seed_); + auto state = this->state_; + const auto& state_clone = this->clone_state(); + gen->set_state(state_clone); + gen->set_state_flag(this->state_need_reset_); + return std::shared_ptr(&gen); +} + +/** + * Sets the seed to be used by MTGP + * + * See Note [Acquire lock when using random generators] + */ +void DIPURawGeneratorImpl::set_current_seed(uint64_t seed) { + seed_ = seed; + state_need_reset_ = true; +} + +/** + * Gets the current seed of DIPUGeneratorImpl. + */ +uint64_t DIPURawGeneratorImpl::current_seed() const { + return seed_; +} + +/** + * Gets a nondeterministic random number from /dev/urandom or time, + * seeds the CPUGeneratorImpl with it and then returns that number. + * + */ +uint64_t DIPURawGeneratorImpl::seed() { + //TODO:随机生成seed + uint64_t random = 42; + this->set_current_seed(random); + return random; +} + +/** + * get state + * + * See Note [Acquire lock when using random generators] + */ +RandState DIPURawGeneratorImpl::get_state() const { + if (state_need_reset_) { + update_state(); + } + auto state_clone = this->clone_state(); + return state_clone; +} + +RandState DIPURawGeneratorImpl::clone_state(const RandState& state) const { + uint8_t newnum = *(state.state_); + RandState newState{&newnum, state.size_}; + return newState; +} + +/** +* set state +* +* See Note [Acquire lock when using random generators] +*/ +void DIPURawGeneratorImpl::set_state(const RandState& state) { + this->state_ = this->clone_state(state); +} + +/** + * set state flag + * See Note [Acquire lock when using random generators] + */ +void DIPURawGeneratorImpl::set_state_flag(bool flag) { + state_need_reset_ = flag; +} + +} // namespace dipu diff --git a/src/turbomind/runtime/core/DIPURawGeneratorImpl.h b/src/turbomind/runtime/core/DIPURawGeneratorImpl.h new file mode 100644 index 0000000000..bc09c32f58 --- /dev/null +++ b/src/turbomind/runtime/core/DIPURawGeneratorImpl.h @@ -0,0 +1,37 @@ +// Copyright (c) 2023, DeepLink. +#pragma once + +#include +#include + +#include "basedef.h" + +namespace dipu { +struct RandState { + std::share_ptr state_ = nullptr; + size_t size_ = 0; +}; + +class DIPURawGeneratorImpl { +public: + // Constructors + explicit DIPURawGeneratorImpl(); + ~DIPURawGeneratorImpl() = default; + + std::shared_ptr clone() const; + void set_current_seed(uint64_t seed); + uint64_t current_seed() const; + uint64_t seed(); + RandState get_state() const; + RandState clone_state(const RandState& state) const; + virtual void set_state(const RandState& state) {}; + + void set_state_flag(bool flag); + virtual void update_state() const {}; + + uint64_t seed_; + mutable RandState state_; + mutable bool state_need_reset_; +} + +} // namespace dipu diff --git a/src/turbomind/runtime/device/basedef.h b/src/turbomind/runtime/device/basedef.h new file mode 100644 index 0000000000..5bb6ad90de --- /dev/null +++ b/src/turbomind/runtime/device/basedef.h @@ -0,0 +1,88 @@ +// Copyright (c) 2023, DeepLink. +#pragma once + +// todo: move out deice dir to diopi +namespace dipu { + +#define DIPU_API __attribute__ ((visibility ("default"))) + +#define DIPU_WEAK __attribute__((weak)) + +// "default", "hidden", "protected" or "internal +#define DIPU_HIDDEN __attribute__ ((visibility ("hidden"))) + +typedef int32_t enum_t; + +#define DIPU_STRING(x) #x +#define DIPU_CODELOC __FILE__ " (" DIPU_STRING(__LINE__) ")" + + +#define DIPU_LOGE(fmt, ...) \ + printf( \ + "[ERROR]%s,%s:%u:" #fmt "\n", \ + __FUNCTION__, \ + __FILE__, \ + __LINE__, \ + ##__VA_ARGS__) + +#define DIPU_LOGW(fmt, ...) \ + printf( \ + "[WARN]%s,%s:%u:" #fmt "\n", \ + __FUNCTION__, \ + __FILE__, \ + __LINE__, \ + ##__VA_ARGS__) + + +namespace devapis { + +enum class VendorDeviceType : enum_t { + MLU, //camb + NPU, //ascend + CUDA, //cuda + GCU, //gcu + SUPA, //Biren + DROPLET, //droplet +}; + +enum class EventStatus: enum_t { + PENDING, + RUNNING, + DEFERRED, + READY +}; + +enum class OpStatus: enum_t { + SUCCESS, + ERR_UNKNOWN, + ERR_NOMEM, +}; + +enum class MemCPKind: enum_t { + D2H, + H2D, + D2D, +}; + +typedef enum { + /*! The operation was successful. */ + DICL_SUCCESS = 0x0, + + /*! undefined error */ + DICL_ERR_UNDEF = 0x01000, + +} diclResult_t; + +struct DIPUDeviceProperties { + std::string name; + size_t totalGlobalMem = 0; + int32_t major = 0; + int32_t minor = 0; + int32_t multiProcessorCount = 0; +}; + +using deviceId_t = int64_t; + + +} // end namespace devapis +} // end namespace dipu \ No newline at end of file diff --git a/src/turbomind/runtime/device/rawdeviceapis.h b/src/turbomind/runtime/device/rawdeviceapis.h new file mode 100644 index 0000000000..9d85ff8f0a --- /dev/null +++ b/src/turbomind/runtime/device/rawdeviceapis.h @@ -0,0 +1,107 @@ +// Copyright (c) 2023, DeepLink. +#pragma once + +#include + +#include "./vendor/vendorapi.h" +#include "./basedef.h" + +namespace dipu { + +extern devapis::VendorDeviceType VENDOR_TYPE; +namespace devapis { + +DIPU_API void initializeVendor(); + +DIPU_API void finalizeVendor(); + +DIPU_API deviceId_t current_device(); + +DIPU_API DIPUDeviceProperties getDeviceProperties(int32_t device_index); + +// set current device given device according to id +DIPU_API void setDevice(deviceId_t devId); + +DIPU_API void resetDevice(deviceId_t devId = 0); + +DIPU_API void syncDevice(); + +// check last launch succ or not, throw if fail +DIPU_API void checkLastError(); + +DIPU_API int getDeviceCount(); + +DIPU_API void getDriverVersion(int* version); + +DIPU_API void getRuntimeVersion(int* version); + +DIPU_API void createStream(deviceStream_t* stream, bool prior=false); + +DIPU_API void destroyStream(deviceStream_t stream); +DIPU_API void destroyStream(deviceStream_t stream, deviceId_t devId); + +DIPU_API void releaseStream(); + +DIPU_API void syncStream(deviceStream_t stream); + +DIPU_API bool streamNotNull(deviceStream_t stream); + +DIPU_API void streamWaitEvent(deviceStream_t stream, deviceEvent_t event); + +// same as query last event status in stream.(every op has a event) +DIPU_API bool isStreamEmpty(deviceStream_t stream); + +// ===================== +// device event related +// ===================== + +DIPU_API void createEvent(deviceEvent_t* event); + +DIPU_API void destroyEvent(deviceEvent_t event); + +DIPU_API void waitEvent(deviceEvent_t event); + +DIPU_API void recordEvent(deviceEvent_t event, deviceStream_t stream); + +DIPU_API void eventElapsedTime(float *time, deviceEvent_t start, deviceEvent_t end); + +DIPU_API EventStatus getEventStatus(deviceEvent_t event); + +// ===================== +// mem related +// ===================== +DIPU_API void mallocHost(void** p, size_t nbytes); + +DIPU_API void freeHost(void* p); + +DIPU_API OpStatus mallocDevice(void** p, size_t nbytes, bool throwExcepion= true); + +DIPU_API void freeDevice(void* p); + +DIPU_API bool isPinnedPtr(const void *p); + +// (asynchronous) set val +DIPU_API void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size); + +// (synchronous) copy from device to a device +DIPU_API void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src); + +// (synchronous) copy from host to a device +DIPU_API void memCopyH2D(size_t nbytes, /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src); + +// (synchronous) copy from a device to host +DIPU_API void memCopyD2H(size_t nbytes, /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src); + +// (asynchronous) copy from device to a device +DIPU_API void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, + deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src); + +// (asynchronous) copy from host to a device +DIPU_API void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, + /*deviceId_t dstDevId,*/ void* dst, /*Host srcDev,*/ const void* src); + +// (asynchronous) copy from a device to host +DIPU_API void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, + /*Host dstDev,*/ void* dst, /*deviceId_t srcDevId,*/ const void* src); +} // end namespace devapis +} // end namespace dipu \ No newline at end of file diff --git a/src/turbomind/runtime/device/rawdiclapis.h b/src/turbomind/runtime/device/rawdiclapis.h new file mode 100644 index 0000000000..d697f2f7aa --- /dev/null +++ b/src/turbomind/runtime/device/rawdiclapis.h @@ -0,0 +1,84 @@ +#pragma once + + +#include "vendor/vendorapi.h“ +#include "deviceapis.h" +#include "../diopirt/diopirt_impl.h" + +namespace dipu { + +// need add return status. +namespace devapis { + typedef enum + { + MIN, + MAX, + SUM, + PRODUCT, + AVG, + } DiclReduceOp; + + template + static DiclDataType getDiclDataType() { + DiclDataType dType; + if (std::is_same::value) { + dType = DiclDataType::TYPE_FP32; + } + else if (std::is_same::value) { + dType = DiclDataType::TYPE_INT32; + } + else if (std::is_same::value) { + dType = DiclDataType::TYPE_BOOL; + } + else { + printf("[ERROR] DICL only support float, int, and bool. \n"); + exit(-1); + } + return dType; + }; + + extern const int DICL_UNIQUE_ID_BYTES_SIZE; + + // todo:: dipu only export devproxy but not devapis (which move o diopi) + DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm); + + DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId); + + DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, int rank, int localDeviceId = -1); + + // DIPU_API void diclCommInitAll(diclComm_t* comms, int ndev, const int* devlist); + + DIPU_API diclResult_t diclCommDestroy(diclComm_t comm); + + // DIPU_API diclResult_t diclCommFinalize(diclComm_t comm); + + // DIPU_API diclResult_t diclCommAbort(diclComm_t comm); + + DIPU_API diclResult_t diclRawAllReduce(const void *sendbuff, void *recvbuff, size_t count, DiclDataType datatype, + const DiclReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawBroadcast(const void *sendbuff, void* recvbuff, size_t count, DiclDataType datatype, + int root, diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawAllGather(const void *sendBuf, void *recvBuf, size_t count, DiclDataType datatype, + diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawReduce(const void* sendbuff, void* recvbuff, size_t count, DiclDataType datatype, + const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, DiclDataType dataType, + const ReduceOp& op, diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawSend(void* sendbuff, size_t count, DiclDataType datatype, int peer, + diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawRecv(void* recvbuff, size_t count, DiclDataType datatype, int peer, + diclComm_t comm, deviceStream_t stream); + + DIPU_API diclResult_t diclRawGroupStart(); + + DIPU_API diclResult_t diclRawGroupEnd(); + +} // namespace devapis + +} // namespace dipu \ No newline at end of file diff --git a/src/turbomind/runtime/diopirt/diopirt_impl.cpp b/src/turbomind/runtime/diopirt/diopirt_impl.cpp new file mode 100644 index 0000000000..19126e45ef --- /dev/null +++ b/src/turbomind/runtime/diopirt/diopirt_impl.cpp @@ -0,0 +1,96 @@ +#include + +#include "diopirt_impl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dipu { + +namespace diopi_helper { + +::diopiTensorHandle_t toDiopiTensorHandle(turbomind::Tensor& tensor) { + return tensor.data == nullptr ? nullptr : reinterpret_cast<::diopiTensorHandle_t>(&tensor); +} + +::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor& tensor) { + return tensor.data == nullptr ? nullptr : reinterpret_cast<::diopiConstTensorHandle_t>(&tensor); +} + +::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor* tensor) { + return tensor == nullptr ? nullptr : toDiopiTensorHandle(*tensor); +} + +// ::diopiGeneratorHandle_t toDiopiGeneratorHandle(::Generator& generator) { +// if (!generator.has_value()) return nullptr; +// return toDiopiGeneratorHandle(generator.value()); +// } + +} // namespace diopi_helper + +} // namespace dipu + + +bool diopiTensor::resetShape(const diopiSize_t* size) { + int64_t numel = 1; + for (int64_t i = 0; i < size->len; ++i) { + numel *= size->data[i]; + } + if (numel != numel_) return false; + + shape_.resize(size->len); + for (int64_t i = size->len - 1; i >= 0; --i) { + shape_[i] = size->data[i]; + } + return true; +} + +DIOPI_RT_API diopiError_t diopiGetTensorData(diopiTensorHandle_t th, void** pptr) { + *pptr = th->getPtr(); + return diopiSuccess; +} + +DIOPI_RT_API diopiError_t diopiGetTensorDataConst(diopiConstTensorHandle_t th, const void** pptr) { + *pptr = th->getPtr(); + return diopiSuccess; +} + +DIOPI_RT_API diopiError_t diopiGetTensorShape(diopiConstTensorHandle_t th, diopiSize_t* size) { + diopiSize_t thSize = {th->getPtr(), std::static_cast(size())}; + *size = thSize; + return diopiSuccess; +} + +DIOPI_RT_API diopiError_t diopiGetTensorStride(diopiConstTensorHandle_t th, diopiSize_t* stride) { + diopiSize_t thStride = {th->getPtr(), std::static_cast(size())}; + *size = thStride; + return diopiSuccess; +} + +DIOPI_RT_API diopiError_t diopiGetTensorDtype(diopiConstTensorHandle_t th, diopiDtype_t* dtype) { + *dtype = toDiopiDataType[th->dtype()]; + return diopiSuccess; +} + +// DIOPI_RT_API diopiError_t diopiGetTensorDevice(diopiConstTensorHandle_t th, diopiDevice_t* device) { +// *device = th->device(); +// return diopiSuccess; +// } + +DIOPI_RT_API diopiError_t diopiGetTensorNumel(diopiConstTensorHandle_t th, int64_t* numel) { + diopiSize_t thNumel = {th->getPtr(), std::static_cast(size())}; + *numel = thNumel; + return diopiSuccess; +} + +DIOPI_RT_API diopiError_t diopiGetTensorElemSize(diopiConstTensorHandle_t th, int64_t* elemSize) { + *elemSize = std::static_cast(th->sizeBytes()); + return diopiSuccess; +} \ No newline at end of file diff --git a/src/turbomind/runtime/diopirt/diopirt_impl.h b/src/turbomind/runtime/diopirt/diopirt_impl.h new file mode 100644 index 0000000000..7754020aa9 --- /dev/null +++ b/src/turbomind/runtime/diopirt/diopirt_impl.h @@ -0,0 +1,65 @@ +// Copyright (c) 2023, DeepLink. +#pragma once + +#include +// #include + +#include "../rthelper.h" + +#include "../utils/tensor.h" + +using deviceStream_t = dipu::deviceStream_t; + +extern "C" { +struct diopiContext { + deviceStream_t stream; + // 1. use arrays to hold tensor that avoid tensor deleting when leaving scope + // 2. The address of each array must be fixed, so use list instead of vector + std::list arrays; + + explicit diopiContext(const deviceStream_t& s) : stream(s) {} +}; + +} // extern "C" + +namespace dipu { + +namespace diopi_helper { + +::diopiTensorHandle_t toDiopiTensorHandle(turbomind::Tensor& tensor); +::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor& tensor); +::diopiConstTensorHandle_t toDiopiTensorHandle(const turbomind::Tensor* tensor); + +::diopiGeneratorHandle_t toDiopiGeneratorHandle(::Generator& generator); + +static std::map toDiopiDataType = { + {turbomind::DataType::TYPE_INVALID, diopiDtype_t::diopi_dtype_unsupported}, + {turbomind::DataType::TYPE_BOOL, diopiDtype_t::diopi_dtype_bool}, + {turbomind::DataType::TYPE_INT8, diopiDtype_t::diopi_dtype_int8}, + {turbomind::DataType::TYPE_INT32, diopiDtype_t::diopi_dtype_int32}, + {turbomind::DataType::TYPE_INT64, diopiDtype_t::diopi_dtype_int64}, + {turbomind::DataType::TYPE_UINT8, diopiDtype_t::diopi_dtype_uint8}, + {turbomind::DataType::TYPE_UINT32, diopiDtype_t::diopi_dtype_uint32}, + {turbomind::DataType::TYPE_UINT64, diopiDtype_t::diopi_dtype_uint64}, + {turbomind::DataType::TYPE_FP16, diopiDtype_t::diopi_dtype_float16}, + {turbomind::DataType::TYPE_FP32, diopiDtype_t::diopi_dtype_float32}, + {turbomind::DataType::TYPE_FP64, diopiDtype_t::diopi_dtype_float64} +}; + +static std::map toTmDataType = { + {diopiDtype_t::diopi_dtype_unsupported, turbomind::DataType::TYPE_INVALID}, + {diopiDtype_t::diopi_dtype_bool, turbomind::DataType::TYPE_BOOL}, + {diopiDtype_t::diopi_dtype_int8, turbomind::DataType::TYPE_INT8}, + {diopiDtype_t::diopi_dtype_int32, turbomind::DataType::TYPE_INT32}, + {diopiDtype_t::diopi_dtype_int64, turbomind::DataType::TYPE_INT64}, + {diopiDtype_t::diopi_dtype_uint8, turbomind::DataType::TYPE_UINT8}, + {diopiDtype_t::diopi_dtype_uint32, turbomind::DataType::TYPE_UINT32}, + {diopiDtype_t::diopi_dtype_uint64, turbomind::DataType::TYPE_UINT64}, + {diopiDtype_t::diopi_dtype_float16, turbomind::DataType::TYPE_FP16}, + {diopiDtype_t::diopi_dtype_float32, turbomind::DataType::TYPE_FP32}, + {diopiDtype_t::diopi_dtype_float64, turbomind::DataType::TYPE_FP64} +}; + +} // namespace diopi_helper + +} // namespace dipu \ No newline at end of file diff --git a/src/turbomind/runtime/rthelper.h b/src/turbomind/runtime/rthelper.h new file mode 100644 index 0000000000..94bd84eef4 --- /dev/null +++ b/src/turbomind/runtime/rthelper.h @@ -0,0 +1,5 @@ +// Copyright (c) 2023, DeepLink. +#include +#include +#include +#include \ No newline at end of file diff --git a/src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake b/src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake new file mode 100644 index 0000000000..df1b7d667f --- /dev/null +++ b/src/turbomind/runtime/vendor/CUDA/cmake/CUDAComputeArch.cmake @@ -0,0 +1,266 @@ +# Synopsis: +# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures]) +# -- Selects GPU arch flags for nvcc based on target_CUDA_architectures +# target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...) +# - "Auto" detects local machine GPU compute arch at runtime. +# - "Common" and "All" cover common and entire subsets of architectures +# ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX +# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal +# NUM: Any number. Only those pairs are currently accepted by NVCC though: +# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 +# Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable} +# Additionally, sets ${out_variable}_readable to the resulting numeric list +# Example: +# CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell) +# LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS}) +# +# More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA +# + +# This list will be used for CUDA_ARCH_NAME = All option +set(CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell") + +# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default) +set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0") + +if (NOT (CUDA_VERSION VERSION_LESS "7.0")) + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2") + if (NOT (CUDA_VERSION VERSION_LESS "8.0")) + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1") + if (NOT (CUDA_VERSION VERSION_LESS "9.0")) + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0") + if (NOT (CUDA_VERSION VERSION_LESS "10.0")) + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5") + if (NOT (CUDA_VERSION VERSION_LESS "11.0")) + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere") + # https://forums.developer.nvidia.com/t/nvcc-fatal-unsupported-gpu-architecture-compute-86/161424 + if(NOT (CUDA_VERSION VERSION_LESS "11.2")) + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0" "8.6+PTX") + else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX") + endif() + else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5+PTX") + endif() + else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0+PTX") + endif() + else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.1+PTX") + endif() + else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX") + endif() +else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.0+PTX") +endif() + + +################################################################################################ +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# CUDA_DETECT_INSTALLED_GPUS(OUT_VARIABLE) +# +function(CUDA_DETECT_INSTALLED_GPUS OUT_VARIABLE) + if(NOT CUDA_GPU_DETECT_OUTPUT) + set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${cufile} "" + "#include \n" + "int main()\n" + "{\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device)\n" + " {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + + execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}" + "-ccbin" ${CMAKE_CXX_COMPILER} + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(nvcc_res EQUAL 0) + # only keep the last line of nvcc_out + STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + list(GET nvcc_out -1 nvcc_out) + string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") + set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE) + endif() + endif() + + if(NOT CUDA_GPU_DETECT_OUTPUT) + message(STATUS "Automatic GPU detection failed. Building for common architectures.") + set(${OUT_VARIABLE} ${CUDA_COMMON_GPU_ARCHITECTURES} PARENT_SCOPE) + else() + list(SORT CUDA_GPU_DETECT_OUTPUT) + set(CUDA_GPU_DETECT_OUTPUT ${CUDA_GPU_DETECT_OUTPUT}+PTX) + set(${OUT_VARIABLE} ${CUDA_GPU_DETECT_OUTPUT} PARENT_SCOPE) + endif() +endfunction() + +function(FILTER_NOHALF_ARCH OUT_VARIABLE) + set(ARCH_LIST_IN "${ARGN}") + set(ARCH_VALID_HALF) + set(HALF_LOWEST 6.0) + + foreach(arch_num ${ARCH_LIST_IN}) + if ((arch_num GREATER HALF_LOWEST) OR (arch_num EQUAL HALF_LOWEST)) + list(APPEND ARCH_VALID_HALF ${arch_num}) + else() + message(STATUS "FILTER_NOHALF_ARCH: remove CUDA arch:${arch_num}!") + endif() + endforeach() + set(${OUT_VARIABLE} ${ARCH_VALID_HALF} PARENT_SCOPE) + +endfunction() + +################################################################################################ +# Function for selecting GPU arch flags for nvcc based on CUDA architectures from parameter list +# Usage: +# CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable +# option flags: ONE OF "All/Common/Auto" and HalfFilter or not, HalfFilter means filter-out +# arch which not support Half-precision +# Direct: direct list of CUDA compute archs, mutually exclusive with options +# example1: CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto) +# example2: CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto HalfFilter) +# example3: CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Direct 6.0 6.1 7.0+PTX) +function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) + + set(CUDA_ARCH_LIST) + set(options All Auto Common HalfFilter) + set(oneValueArgs) + set(multiValueArgs Direct) + cmake_parse_arguments(P "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(P_All) + set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES}) + elseif(P_Common) + set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES}) + elseif(P_Auto) + CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST) + message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}") + set(CUDA_ARCH_LIST ${CUDA_ARCH_LIST}) + elseif(P_Direct) + set(CUDA_ARCH_LIST ${P_Direct}) + endif() + + if(NOT CUDA_ARCH_LIST) + CUDA_DETECT_INSTALLED_GPUS(CUDA_ARCH_LIST) + message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}") + endif() + + set(cuda_arch_bin) + set(cuda_arch_ptx) + # Now process the list and look for names + string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}") + list(REMOVE_DUPLICATES CUDA_ARCH_LIST) + foreach(arch_name ${CUDA_ARCH_LIST}) + set(arch_bin) + set(add_ptx FALSE) + # Check to see if we are compiling PTX + if(arch_name MATCHES "(.*)\\+PTX$") + set(add_ptx TRUE) + set(arch_name ${CMAKE_MATCH_1}) + endif() + if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") + set(arch_bin ${CMAKE_MATCH_1}) + set(arch_ptx ${arch_bin}) + else() + # Look for it in our list of known architectures + if(${arch_name} STREQUAL "Kepler+Tegra") + set(arch_bin 3.2) + elseif(${arch_name} STREQUAL "Kepler+Tesla") + set(arch_bin 3.7) + elseif(${arch_name} STREQUAL "Kepler") + set(arch_bin 3.0 3.5) + set(arch_ptx 3.5) + elseif(${arch_name} STREQUAL "Maxwell+Tegra") + set(arch_bin 5.3) + elseif(${arch_name} STREQUAL "Maxwell") + set(arch_bin 5.0 5.2) + set(arch_ptx 5.2) + elseif(${arch_name} STREQUAL "Pascal") + set(arch_bin 6.0 6.1) + set(arch_ptx 6.1) + elseif(${arch_name} STREQUAL "Volta") + set(arch_bin 7.0) + set(arch_ptx 7.0) + elseif(${arch_name} STREQUAL "Turing") + set(arch_bin 7.5) + set(arch_ptx 7.5) + elseif(${arch_name} STREQUAL "Ampere") + set(arch_bin 8.0) + set(arch_ptx 8.0) + else() + message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") + endif() + endif() + if(NOT arch_bin) + message(SEND_ERROR "arch_bin wasn't set for some reason") + endif() + list(APPEND cuda_arch_bin ${arch_bin}) + if(add_ptx) + if (NOT arch_ptx) + set(arch_ptx ${arch_bin}) + endif() + list(APPEND cuda_arch_ptx ${arch_ptx}) + endif() + endforeach() + + if(P_HalfFilter) + FILTER_NOHALF_ARCH(cuda_arch_bin ${cuda_arch_bin}) + FILTER_NOHALF_ARCH(cuda_arch_ptx ${cuda_arch_ptx}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + + if(cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_bin) + endif() + if(cuda_arch_ptx) + list(REMOVE_DUPLICATES cuda_arch_ptx) + endif() + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified ARCH for the concrete CODE + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() diff --git a/src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake b/src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake new file mode 100644 index 0000000000..aa8028098c --- /dev/null +++ b/src/turbomind/runtime/vendor/CUDA/cmake/FindNCCL.cmake @@ -0,0 +1,49 @@ +# CMake script to locate NCCL + +include(FindPackageHandleStandardArgs) + +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_ROOT}/include + $ENV{NCCL_ROOT}/include + /usr/include + /usr/local/include + ) + +find_library(NCCL_LIBRARIES + NAMES nccl + HINTS ${NCCL_ROOT}/lib + $ENV{NCCL_ROOT}/lib + /usr/lib + /usr/local/lib + ) + +if (NCCL_INCLUDE_DIR) + file(READ ${NCCL_INCLUDE_DIR}/nccl.h NCCL_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NCCL_MAJOR * +([0-9]+)" + NCCL_VERSION_MAJOR "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_MAJOR * +([0-9]+)" "\\1" + NCCL_VERSION_MAJOR "${NCCL_VERSION_MAJOR}") + string(REGEX MATCH "define NCCL_MINOR * +([0-9]+)" + NCCL_VERSION_MINOR "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_MINOR * +([0-9]+)" "\\1" + NCCL_VERSION_MINOR "${NCCL_VERSION_MINOR}") + string(REGEX MATCH "define NCCL_PATCH * +([0-9]+)" + NCCL_VERSION_PATCH "${NCCL_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NCCL_PATCH * +([0-9]+)" "\\1" + NCCL_VERSION_PATCH "${NCCL_VERSION_PATCH}") +endif (NCCL_INCLUDE_DIR) + +if(NOT NCCL_VERSION_MAJOR) + set(NCCL_FOUND FALSE) + set(NCCL_VERSION "???") +else() + set(NCCL_FOUND TRUE) + set(NCCL_VERSION_STRING "${NCCL_VERSION_MAJOR}.${NCCL_VERSION_MINOR}.${NCCL_VERSION_PATCH}") +endif() + +find_package_handle_standard_args(NCCL DEFAULT_MSG + NCCL_INCLUDE_DIR + NCCL_LIBRARIES) + +mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES) diff --git a/src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp b/src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp new file mode 100644 index 0000000000..6567d8c87a --- /dev/null +++ b/src/turbomind/runtime/vendor/CUDA/deviceimpl.cpp @@ -0,0 +1,255 @@ +// Copyright (c) 2023, DeepLink. +#include +#include "../../device/rawdeviceapis.h" + +namespace dipu { +DIPU_API devapis::VendorDeviceType VENDOR_TYPE = devapis::VendorDeviceType::CUDA; + +// extern int patchCachingAllocator(); + +namespace devapis { + +using cuda_deviceId = int; +// ===================== +// Device class related +// ===================== + +void initializeVendor() { + // patchCachingAllocator(); +} + +void finalizeVendor() { + +} + + +deviceId_t current_device() { + cuda_deviceId devId_; + DIPU_CALLCUDA(::cudaGetDevice(&devId_)) + return static_cast(devId_); +} + +DIPUDeviceProperties getDeviceProperties(int32_t device_index) { + ::cudaDeviceProp device_prop; + DIPU_CALLCUDA(cudaGetDeviceProperties(&device_prop, device_index)) + + DIPUDeviceProperties prop; + prop.name = device_prop.name; + prop.totalGlobalMem = device_prop.totalGlobalMem; + prop.major = device_prop.major; + prop.minor = device_prop.minor; + prop.multiProcessorCount = device_prop.multiProcessorCount; + return prop; +} + +// in cuda_runtime_api.h +// set current device given device according to id +void setDevice(deviceId_t devId) { + cuda_deviceId devId_ = static_cast(devId); + DIPU_CALLCUDA(::cudaSetDevice(devId_)) +} + +void resetDevice(deviceId_t devId) { + DIPU_CALLCUDA(::cudaDeviceReset()) +} + +void syncDevice() { + DIPU_CALLCUDA(::cudaDeviceSynchronize()) +} + +// check last launch succ or not, throw if fail +void checkLastError() { + DIPU_CALLCUDA(::cudaGetLastError()) +} + +int getDeviceCount() { + int num = -1; + DIPU_CALLCUDA(::cudaGetDeviceCount(reinterpret_cast(&num))) + return num; +} + +void getDriverVersion(int* version) { + DIPU_CALLCUDA(::cudaDriverGetVersion(version)) +} + +void getRuntimeVersion(int* version) { + DIPU_CALLCUDA(::cudaRuntimeGetVersion(version)) +} + +// ===================== +// device stream related +// ===================== +void createStream(deviceStream_t* stream, bool prior) { + if (prior) { + DIPU_CALLCUDA(::cudaStreamCreateWithPriority(stream, cudaStreamDefault, -1)) + } else { + DIPU_CALLCUDA(::cudaStreamCreate(stream)) + } +} + +void destroyStream(deviceStream_t stream) { + DIPU_CALLCUDA(::cudaStreamDestroy(stream)) +} + +void destroyStream(deviceStream_t stream, deviceId_t devId) { + setDevice(devId); + destroyStream(stream); +} + +void releaseStream() { + return; +} + +bool streamNotNull(deviceStream_t stream) { + return (stream != nullptr && stream != cudaStreamLegacy && stream != cudaStreamPerThread); +} + +void syncStream(deviceStream_t stream) { + DIPU_CALLCUDA(::cudaStreamSynchronize(stream)); +} + +void streamWaitEvent(deviceStream_t stream, deviceEvent_t event) { + DIPU_CALLCUDA(::cudaStreamWaitEvent(stream, event, 0)) +} + +bool isStreamEmpty(deviceStream_t stream) { + auto err = cudaStreamQuery(stream); + if (err == ::cudaSuccess) { + return true; + } + return false; +} + + +// ===================== +// device event related +// ===================== + +void createEvent(deviceEvent_t* event) { + DIPU_CALLCUDA(::cudaEventCreateWithFlags(event, cudaEventDisableTiming)) +} + +void destroyEvent(deviceEvent_t event) { + DIPU_CALLCUDA(::cudaEventDestroy(event)) +} + +void waitEvent(deviceEvent_t event) { + DIPU_CALLCUDA(::cudaEventSynchronize(event)) +} + +void recordEvent(deviceEvent_t event, deviceStream_t stream) { + DIPU_CALLCUDA(::cudaEventRecord(event, stream)) +} + +void eventElapsedTime(float* time, deviceEvent_t start, deviceEvent_t end) { + DIPU_CALLCUDA(cudaEventElapsedTime(time, start, end)) +} + +EventStatus getEventStatus(deviceEvent_t event) { + ::cudaError_t ret = ::cudaEventQuery(event); + if (ret == ::cudaSuccess) { + return devapis::EventStatus::READY; + } else if (ret == ::cudaErrorNotReady) { + ::cudaGetLastError(); /* reset internal error state*/ + return devapis::EventStatus::PENDING; + } else { + TORCH_CHECK(false, "unexpected event status in getEventStatus, ret = ", ret); + } +} + +// ===================== +// mem related +// ===================== +void mallocHost(void** p, size_t nbytes) { + DIPU_CALLCUDA(::cudaMallocHost(p, nbytes)) +} + +void freeHost(void* p) { + DIPU_CALLCUDA(::cudaFreeHost(p)) +} + +OpStatus mallocDevice(void **p, size_t nbytes, bool throwExcepion) { + ::cudaError_t r = ::cudaMalloc(p, nbytes); + if (r != ::cudaSuccess) { + if(throwExcepion) { + ::cudaGetLastError(); /* reset internal error state*/ + TORCH_CHECK(false, "alloc failed in mallocDevice, ret = ", r, " size= ", nbytes); + } + else if(r == ::cudaErrorMemoryAllocation) { + return OpStatus::ERR_NOMEM; + } + else { + return OpStatus::ERR_UNKNOWN; + } + } + return OpStatus::SUCCESS; +} + +void freeDevice(void* p) { + DIPU_CALLCUDA(::cudaFree(p)) +} + +bool isPinnedPtr(const void *p) { + ::cudaPointerAttributes attr; + DIPU_CALLCUDA(::cudaPointerGetAttributes(&attr, p)) + return attr.type == cudaMemoryTypeHost; +} + +void memSetAsync(const deviceStream_t stream, void* ptr, int val, size_t size) { + DIPU_CALLCUDA(::cudaMemsetAsync(ptr, val, size, stream)) +} + +void memCopyD2D(size_t nbytes, deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { + if (dstDevId == srcDevId) { + DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToDevice)) + } else { + DIPU_CALLCUDA(::cudaMemcpyPeer(dst, dstDevId, src, srcDevId, nbytes)) + } +} + +// (synchronous) copy from host to a CUDA device +void memCopyH2D(size_t nbytes, void* dst, const void* src) { + DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyHostToDevice)) +} + +// (synchronous) copy from a CUDA device to host +void memCopyD2H(size_t nbytes, void* dst, const void* src) { + DIPU_CALLCUDA(::cudaMemcpy(dst, src, nbytes, ::cudaMemcpyDeviceToHost)) +} + +// (asynchronous) copy from device to a device +void memCopyD2DAsync(const deviceStream_t stream, size_t nbytes, + deviceId_t dstDevId, void* dst, deviceId_t srcDevId, const void* src) { + if (dstDevId == srcDevId) { + DIPU_CALLCUDA(::cudaMemcpyAsync( + dst, src, nbytes, cudaMemcpyDeviceToDevice, stream)) + } else { + DIPU_CALLCUDA(::cudaMemcpyPeerAsync( + dst, dstDevId, src, srcDevId, nbytes, stream)) + } +} + +// (asynchronous) copy from host to a device +void memCopyH2DAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { + DIPU_CALLCUDA(::cudaMemcpyAsync( + dst, src, nbytes, cudaMemcpyHostToDevice, stream)) +} + +// (asynchronous) copy from a device to host +void memCopyD2HAsync(const deviceStream_t stream, size_t nbytes, void* dst, const void* src) { + DIPU_CALLCUDA(::cudaMemcpyAsync( + dst, src, nbytes, cudaMemcpyDeviceToHost, stream)); +} + +} // end namespace devapis + +} // namespace parrots + + + + + + + + + diff --git a/src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp b/src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp new file mode 100644 index 0000000000..50a7b31fdd --- /dev/null +++ b/src/turbomind/runtime/vendor/CUDA/rawcommuniatorimpl.cpp @@ -0,0 +1,146 @@ +#include +#include "../../device/rawdiclapis.h" + +namespace dipu { + +namespace devapis { + // NCCL op mapping + static std::map ncclOp = { + {DiclReduceOp::MIN, ncclMin}, + {DiclReduceOp::MAX, ncclMax}, + {DiclReduceOp::SUM, ncclSum}, + {DiclReduceOp::PRODUCT, ncclProd}, + #ifdef NCCL_HAS_AVG + {DiclReduceOp::AVG, ncclAvg}, + #endif + }; + + static ncclRedOp_t getNcclOp(DiclReduceOp op) { + auto it = ncclOp.find(op); + if (it != myMap.end()) { + int value = it->second; + } else { + std::cout << "Key '" << key << "' not found." << std::endl; + exit(); + } + return 0; + } + + static std::map ncclDataType = { + // {DiclDataType::TYPE_INVALID, }, + {DiclDataType::TYPE_BOOL, ncclUint8}, + {DiclDataType::TYPE_UINT8, ncclUint8}, + // {DiclDataType::TYPE_UINT16, }, + // {DiclDataType::TYPE_UINT32, }, + // {DiclDataType::TYPE_UINT64, }, + {DiclDataType::TYPE_INT8, ncclInt8}, + // {DiclDataType::TYPE_INT16, }, + {DiclDataType::TYPE_INT32, ncclInt32}, + {DiclDataType::TYPE_INT64, ncclInt64}, + {DiclDataType::TYPE_FP16, ncclHalf}, + {DiclDataType::TYPE_FP32, ncclFloat}, + {DiclDataType::TYPE_FP64, ncclDouble}, + // {DiclDataType::TYPE_BYTES, }, + #if HAS_NCCL_BF16_DATATYPE + {DiclDataType::TYPE_BF16, ncclBfloat16}, + #endif + }; + +// Macro to print and abort on a non-successful NCCL return value. +#define NCCL_THROW(cmd) \ + do { \ + ncclResult_t result = cmd; \ + if (result != ncclSuccess) { \ + std::string err = ncclGetErrorString(result); \ + fprintf( \ + stderr, \ + "NCCL error in: %s:%d, %s\n", \ + __FILE__, \ + __LINE__, \ + err.c_str()); \ + TORCH_CHECK(false, err); \ + } \ + } while (0) + + + const int DICL_UNIQUE_ID_BYTES_SIZE = NCCL_UNIQUE_ID_BYTES; + + DIPU_API diclResult_t diclGetCommAsyncError(diclComm_t comm) { + ncclResult_t ncclAsyncErr_; + NCCL_THROW(ncclCommGetAsyncError(comm, &ncclAsyncErr_)); + if (ncclAsyncErr_ != ncclSuccess) { + return DICL_SUCCESS; + } else { + return DICL_ERR_UNDEF; + } + } + + DIPU_API diclResult_t diclGetUniqueId(commUniqueId* uniqueId) { + NCCL_THROW(ncclGetUniqueId(uniqueId)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclCommInitRank(diclComm_t* comm, int nranks, commUniqueId uniqueId, + int rank, int localDeviceId) { + NCCL_THROW(ncclCommInitRank(comm, nranks, uniqueId, rank)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclCommDestroy(ncclComm_t comm) { + NCCL_THROW(ncclCommDestroy(comm)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawAllReduce(const void *sendbuff, void *recvbuff, size_t count, dipuDataType datatype, + const ReduceOp& reduceOp, diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclAllReduce(sendbuff, recvbuff, count, ncclDataType[datatype], ncclOp[reduceOp], comm, stream)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawBroadcast(const void *sendbuff, void* recvbuff, size_t count, dipuDataType datatype, + int root, diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclBroadcast(sendbuff, recvbuff, count, ncclDataType[datatype], root, comm, stream)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawAllGather(const void *sendBuf, void *recvBuf, size_t count, dipuDataType datatype, + diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclAllGather(sendBuf, recvBuf, count, ncclDataType[datatype], comm, stream)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawReduce(const void* sendbuff, void* recvbuff, size_t count, dipuDataType datatype, + const ReduceOp& reduceOp, int root, diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclReduce(sendbuff, recvbuff, count, ncclDataType[datatype], ncclOp[reduceOp], root, comm, stream)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, dipuDataType dataType, + const ReduceOp& op, diclComm_t comm, deviceStream_t stream) { + throw std::runtime_error("mlu Not implement diclReduceScatter"); + } + + DIPU_API diclResult_t diclRawSend(void* sendbuff, size_t count, dipuDataType datatype, int peer, + diclComm_t comm, deviceStream_t stream){ + NCCL_THROW(ncclSend(sendbuff, count, ncclDataType[datatype], peer, comm, stream)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawRecv(void* recvbuff, size_t count, dipuDataType datatype, int peer, + diclComm_t comm, deviceStream_t stream) { + NCCL_THROW(ncclRecv(recvbuff, count, ncclDataType[datatype], peer, comm, stream)); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawGroupStart() { + NCCL_THROW(ncclGroupStart()); + return DICL_SUCCESS; + } + + DIPU_API diclResult_t diclRawGroupEnd() { + NCCL_THROW(ncclGroupEnd()); + return DICL_SUCCESS; + } + +} // end namespace devapis +} // end namespace dipu diff --git a/src/turbomind/runtime/vendor/CUDA/vendorapi.h b/src/turbomind/runtime/vendor/CUDA/vendorapi.h new file mode 100644 index 0000000000..87a214847d --- /dev/null +++ b/src/turbomind/runtime/vendor/CUDA/vendorapi.h @@ -0,0 +1,28 @@ +// Copyright (c) 2023, DeepLink. + +#pragma once +#include +#include +#include + +namespace dipu { + +#define DIPU_CALLCUDA(Expr) \ +{ \ + cudaError_t ret = Expr; \ + TORCH_CHECK(ret == ::cudaSuccess, "call cuda error, expr = ", #Expr, ", ret = ", ret); \ +} + +using deviceStream_t = cudaStream_t; +#define deviceDefaultStreamLiteral cudaStreamLegacy +using deviceEvent_t = cudaEvent_t; + +using diclComm_t = ncclComm_t; +using commUniqueId = ncclUniqueId; + +} + + + + + diff --git a/src/turbomind/utils/Tensor.cc b/src/turbomind/utils/Tensor.cc index 1ae72bc5e4..38298a987b 100644 --- a/src/turbomind/utils/Tensor.cc +++ b/src/turbomind/utils/Tensor.cc @@ -153,9 +153,9 @@ Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where) size_t n_elems = fread(data_cpu, Tensor::getTypeSize(type), size, f_ptr); FT_CHECK_WITH_INFO(n_elems == size, "reading tensor failed"); if (where == MEMORY_GPU) { - cudaMalloc(&data, size * Tensor::getTypeSize(type)); - cudaMemcpy(data, data_cpu, size * Tensor::getTypeSize(type), cudaMemcpyHostToDevice); - free(data_cpu); + dipu::devapis::mallocDevice(&data, size * Tensor::getTypeSize(type)); + dipu::devapis::memCopyH2D(size * Tensor::getTypeSize(type), data, data_cpu); + dipu::devapis::freeDevice(data_cpu); } fclose(f_ptr); diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index 1ba191d211..b8dc6e319f 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -21,7 +21,9 @@ #include "cuda_utils.h" #include "src/turbomind/macro.h" +#ifndef DIOPI_ENABLE #include +#endif // DIOPI_ENABLE #include #include @@ -70,8 +72,8 @@ class IAllocator { virtual void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) = 0; virtual void free(void** ptr, bool is_host = false) const = 0; - virtual void setStream(cudaStream_t stream) = 0; - virtual cudaStream_t returnStream() = 0; + virtual void setStream(deviceStream_t stream) = 0; + virtual deviceStream_t returnStream() = 0; virtual void memSet(void* ptr, const int val, const size_t size) = 0; template @@ -132,7 +134,7 @@ class Allocator: public IAllocator { }; const int device_id_; - cudaStream_t stream_ = 0; // initialize as default stream + deviceStream_t stream_ = 0; // initialize as default stream std::unordered_map>* pointer_mapping_; bool isExist(void* address) const @@ -201,12 +203,12 @@ class Allocator: public IAllocator { delete pointer_mapping_; } - void setStream(cudaStream_t stream) + void setStream(deviceStream_t stream) { stream_ = stream; } - cudaStream_t returnStream() + deviceStream_t returnStream() { return stream_; }; @@ -222,17 +224,13 @@ class Allocator: public IAllocator { check_cuda_error(getSetDevice(device_id_, &o_device)); if (is_host) { - check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32)); + check_cuda_error(dipu::devproxy::mallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32)); } else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32)); -#else - check_cuda_error(cudaMallocAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, stream_)); -#endif + check_cuda_error(dipu::devproxy::mallocDevice(&ptr, (size_t)(ceil(size / 32.)) * 32)); } if (is_set_zero) { - check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_)); + check_cuda_error(dipu::devproxy::memSetAsync(stream_, ptr, 0, (size_t)(ceil(size / 32.)) * 32)); } check_cuda_error(getSetDevice(o_device)); TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size); @@ -253,15 +251,10 @@ class Allocator: public IAllocator { TM_LOG_DEBUG("Free buffer %p", address); check_cuda_error(getSetDevice(device_id_, &o_device)); if (is_host) { - check_cuda_error(cudaFreeHost(*ptr)); + check_cuda_error(dipu::devproxy::freeHost(*ptr)); } else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaFree(*ptr)); -#else - check_cuda_error(cudaFreeAsync(*ptr, stream_)); - cudaStreamSynchronize(stream_); -#endif + check_cuda_error(dipu::devproxy::freeDevice(*ptr)); } check_cuda_error(getSetDevice(o_device)); pointer_mapping_->erase(address); @@ -276,7 +269,7 @@ class Allocator: public IAllocator { void memSet(void* ptr, const int val, const size_t size) { - check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_)); + check_cuda_error(dipu::devproxy::memSetAsync(stream_, ptr, val, size)); } }; diff --git a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc index 45fa06a6d5..442abb7dfb 100644 --- a/src/turbomind/utils/cuda_utils.cc +++ b/src/turbomind/utils/cuda_utils.cc @@ -21,7 +21,7 @@ namespace turbomind { /* **************************** debug tools ********************************* */ - +#ifndef DIOPI_ENABLE template void print_to_file(const T* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode) { @@ -331,40 +331,31 @@ template void check_abs_mean_val(const half* result, const int size); #ifdef ENABLE_BF16 template void check_abs_mean_val(const __nv_bfloat16* result, const int size); #endif +#endif // DIOPI_ENABLE /* ***************************** common utils ****************************** */ -cudaError_t getSetDevice(int i_device, int* o_device) +void getSetDevice(int i_device, int* o_device) { int current_dev_id = 0; cudaError_t err = cudaSuccess; if (o_device != NULL) { - err = cudaGetDevice(¤t_dev_id); - if (err != cudaSuccess) { - return err; - } + current_dev_id = dipu::devapis::current_device(); if (current_dev_id == i_device) { *o_device = i_device; } else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return err; - } + err = dipu::devapis::setDevice(i_device); *o_device = current_dev_id; } } else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return err; - } + dipu::devapis::setDevice(i_device); } - - return cudaSuccess; } +#ifndef DIOPI_ENABLE FtCudaDataType getModelFileType(std::string ini_file, std::string section_name) { FtCudaDataType model_file_type; @@ -391,6 +382,7 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name) } return model_file_type; } +#endif // DIOPI_ENABLE /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index be0b85d69a..1950a5840a 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -116,6 +116,7 @@ static const char* _cudaGetErrorEnum(cublasStatus_t error) return ""; } +#ifndef DIOPI_ENABLE template void check(T result, char const* const func, const char* const file, int const line) { @@ -126,7 +127,19 @@ void check(T result, char const* const func, const char* const file, int const l } #define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) -#define check_cuda_error_2(val, file, line) check((val), #val, file, line) +#else +#define check_cuda_error(val) \ +{ \ + try { \ + (val); \ + dipu::devapis::checkLastError(); \ + }catch(const std::exception& ex) { \ + throw std::runtime_error(std::string("[TM][ERROR] DEVICE runtime error: ") + (ex.what()) + " " \ + + __FILE__ + ":" + std::to_string(__LINE__) + " \n"); \ + } \ +} \ +#endif // DIOPI_ENABLE +// #define check_cuda_error_2(val, file, line) check((val), #val, file, line) inline void syncAndCheck(const char* const file, int const line) { @@ -135,28 +148,47 @@ inline void syncAndCheck(const char* const file, int const line) if (level_name != nullptr) { static std::string level = std::string(level_name); if (level == "DEBUG") { - cudaDeviceSynchronize(); + dipu::devapis::syncDevice(); +#ifndef DIOPI_ENABLE cudaError_t result = cudaGetLastError(); if (result) { throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + file + ":" + std::to_string(line) + " \n"); } +#else + try { + dipu::devapis::checkLastError(); + }catch(const std::exception& ex) { + throw std::runtime_error(std::string("[TM][ERROR] DEVICE runtime error: ") + (ex.what()) + + " " + file + ":" + std::to_string(line) + " \n"); + } +#endif // DIOPI_ENABLE TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); } } #ifndef NDEBUG - cudaDeviceSynchronize(); + dipu::devapis::syncDevice(); +#ifndef DIOPI_ENABLE cudaError_t result = cudaGetLastError(); if (result) { throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + file + ":" + std::to_string(line) + " \n"); } +#else + try { + dipu::devapis::checkLastError(); + }catch(const std::exception& ex) { + throw std::runtime_error(std::string("[TM][ERROR] DEVICE runtime error: ") + (ex.what()) + " " + + file + ":" + std::to_string(line) + " \n") + } +#endif // DIOPI_ENABLE #endif } #define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__) +#ifndef DIOPI_ENABLE #define checkCUDNN(expression) \ { \ cudnnStatus_t status = (expression); \ @@ -166,6 +198,7 @@ inline void syncAndCheck(const char* const file, int const line) std::exit(EXIT_FAILURE); \ } \ } +#endif // DIOPI_ENABLE template void print_to_file(const T* result, @@ -235,50 +268,53 @@ inline void myAssert(bool result, const char* const file, int const line, std::s #endif /*************Time Handling**************/ -class CudaTimer { -private: - cudaEvent_t event_start_; - cudaEvent_t event_stop_; - cudaStream_t stream_; - -public: - explicit CudaTimer(cudaStream_t stream = 0) - { - stream_ = stream; - } - void start() - { - check_cuda_error(cudaEventCreate(&event_start_)); - check_cuda_error(cudaEventCreate(&event_stop_)); - check_cuda_error(cudaEventRecord(event_start_, stream_)); - } - float stop() - { - float time; - check_cuda_error(cudaEventRecord(event_stop_, stream_)); - check_cuda_error(cudaEventSynchronize(event_stop_)); - check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_)); - check_cuda_error(cudaEventDestroy(event_start_)); - check_cuda_error(cudaEventDestroy(event_stop_)); - return time; - } - ~CudaTimer() {} -}; +// class CudaTimer { +// private: +// cudaEvent_t event_start_; +// cudaEvent_t event_stop_; +// cudaStream_t stream_; + +// public: +// explicit CudaTimer(cudaStream_t stream = 0) +// { +// stream_ = stream; +// } +// void start() +// { +// check_cuda_error(cudaEventCreate(&event_start_)); +// check_cuda_error(cudaEventCreate(&event_stop_)); +// check_cuda_error(cudaEventRecord(event_start_, stream_)); +// } +// float stop() +// { +// float time; +// check_cuda_error(cudaEventRecord(event_stop_, stream_)); +// check_cuda_error(cudaEventSynchronize(event_stop_)); +// check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_)); +// check_cuda_error(cudaEventDestroy(event_start_)); +// check_cuda_error(cudaEventDestroy(event_stop_)); +// return time; +// } +// ~CudaTimer() {} +// }; /* ***************************** common utils ****************************** */ inline void print_mem_usage(std::string time = "after allocation") { +#ifndef DIOPI_ENABLE size_t free_bytes, total_bytes; check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes)); float free = static_cast(free_bytes) / 1024.0 / 1024.0 / 1024.0; float total = static_cast(total_bytes) / 1024.0 / 1024.0 / 1024.0; float used = total - free; printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used); +#endif // DIOPI_ENABLE } inline int getSMVersion() { +#ifndef DIOPI_ENABLE int device{-1}; check_cuda_error(cudaGetDevice(&device)); int sm_major = 0; @@ -286,25 +322,28 @@ inline int getSMVersion() check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); return sm_major * 10 + sm_minor; +#endif // DIOPI_ENABLE } inline int getMaxSharedMemoryPerBlock() { +#ifndef DIOPI_ENABLE int device{-1}; check_cuda_error(cudaGetDevice(&device)); int max_shared_memory_size = 0; check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, cudaDevAttrMaxSharedMemoryPerBlock, device)); return max_shared_memory_size; +#endif // DIOPI_ENABLE } -inline std::string getDeviceName() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - cudaDeviceProp props; - check_cuda_error(cudaGetDeviceProperties(&props, device)); - return std::string(props.name); -} +// inline std::string getDeviceName() +// { +// int device{-1}; +// check_cuda_error(cudaGetDevice(&device)); +// cudaDeviceProp props; +// check_cuda_error(cudaGetDeviceProperties(&props, device)); +// return std::string(props.name); +// } inline int div_up(int a, int n) { @@ -315,18 +354,16 @@ cudaError_t getSetDevice(int i_device, int* o_device = NULL); inline int getDevice() { - int current_dev_id = 0; - check_cuda_error(cudaGetDevice(¤t_dev_id)); - return current_dev_id; + return dipu::devapis::current_device(); } inline int getDeviceCount() { - int count = 0; - check_cuda_error(cudaGetDeviceCount(&count)); + int count = dipu::devapis::getDeviceCount(); return count; } +// suggest: move to CublasWrapper template CublasDataType getCublasDataType() { @@ -347,6 +384,7 @@ CublasDataType getCublasDataType() } } +// suggest: move to CublasWrapper template cudaDataType_t getCudaDataType() { @@ -389,7 +427,9 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name); // clang-format off template struct packed_type; template <> struct packed_type { using type = float; }; // we don't need to pack float by default +#ifndef DIOPI_ENABLE template <> struct packed_type { using type = half2; }; +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template<> @@ -400,15 +440,18 @@ struct packed_type<__nv_bfloat16> { template struct num_elems; template <> struct num_elems { static constexpr int value = 1; }; +#ifndef DIOPI_ENABLE template <> struct num_elems { static constexpr int value = 2; }; template <> struct num_elems { static constexpr int value = 4; }; template <> struct num_elems { static constexpr int value = 1; }; template <> struct num_elems { static constexpr int value = 2; }; +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; #endif +#ifndef DIOPI_ENABLE template struct packed_as; template struct packed_as { using type = T; }; template<> struct packed_as { using type = half2; }; @@ -482,6 +525,7 @@ void compareTwoTensor( delete[] h_pred; delete[] h_ref; } +#endif // DIOPI_ENABLE /* ************************** end of common utils ************************** */ } // namespace turbomind diff --git a/src/turbomind/utils/logger.cc b/src/turbomind/utils/logger.cc index 9788a8fad7..22a01af35a 100644 --- a/src/turbomind/utils/logger.cc +++ b/src/turbomind/utils/logger.cc @@ -15,8 +15,11 @@ */ #include "src/turbomind/utils/logger.h" +#ifndef DIOPI_ENABLE #include - +#else +#include "../runtime/device/rawdeviceapis.h" +#endif // DIOPI_ENABLE namespace turbomind { Logger::Logger() @@ -26,7 +29,7 @@ Logger::Logger() (is_first_rank_only_char != nullptr && std::string(is_first_rank_only_char) == "ON") ? true : false; int device_id; - cudaGetDevice(&device_id); + device_id = dipu::current_device(); char* level_name = std::getenv("TM_LOG_LEVEL"); if (level_name != nullptr) { diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index 93547f364f..99a476fce2 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -16,10 +16,8 @@ #include "src/turbomind/macro.h" #include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/memory_utils.h" -#include #include #include @@ -29,18 +27,22 @@ template void deviceMalloc(T** ptr, size_t size, bool is_random_initialize) { FT_CHECK_WITH_INFO(size >= ((size_t)0), "Ask deviceMalloc size " + std::to_string(size) + "< 0 is invalid."); - check_cuda_error(cudaMalloc((void**)(ptr), sizeof(T) * size)); + check_cuda_error(dipu::devapis::mallocDevice((void**)(ptr), sizeof(T) * size)); +#ifndef DIOPI_ENABLE if (is_random_initialize) { cudaRandomUniform(*ptr, size); } +#endif // DIOPI_ENABLE } template void deviceMalloc(float** ptr, size_t size, bool is_random_initialize); +#ifndef DIOPI_ENABLE template void deviceMalloc(half** ptr, size_t size, bool is_random_initialize); #ifdef ENABLE_BF16 template void deviceMalloc(__nv_bfloat16** ptr, size_t size, bool is_random_initialize); #endif template void deviceMalloc(uint16_t** ptr, size_t size, bool is_random_initialize); +#endif // DIOPI_ENABLE template void deviceMalloc(int** ptr, size_t size, bool is_random_initialize); template void deviceMalloc(bool** ptr, size_t size, bool is_random_initialize); template void deviceMalloc(char** ptr, size_t size, bool is_random_initialize); @@ -52,11 +54,16 @@ template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, bool is_random_init template void deviceMemSetZero(T* ptr, size_t size) { - check_cuda_error(cudaMemset(static_cast(ptr), 0, sizeof(T) * size)); + T* arr = new T[size]; + std::fill(arr, arr + size, 0); + check_cuda_error(dipu::devapis::memCopyH2D(sizeof(T) * size, ptr, arr)); + delete[] arr; } template void deviceMemSetZero(float* ptr, size_t size); +#ifndef DIOPI_ENABLE template void deviceMemSetZero(half* ptr, size_t size); +#endif // DIOPI_ENABLE template void deviceMemSetZero(int* ptr, size_t size); template void deviceMemSetZero(uint32_t* ptr, size_t size); template void deviceMemSetZero(bool* ptr, size_t size); @@ -71,13 +78,15 @@ template void deviceFree(T*& ptr) { if (ptr != NULL) { - check_cuda_error(cudaFree(ptr)); + check_cuda_error(dipu::devapis::freeDevice(ptr)); ptr = NULL; } } template void deviceFree(float*& ptr); +#ifndef DIOPI_ENABLE template void deviceFree(half*& ptr); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void deviceFree(__nv_bfloat16*& ptr); #endif @@ -91,16 +100,18 @@ template void deviceFree(__nv_fp8_e4m3*& ptr); #endif template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) +void deviceFill(T* devptr, size_t size, T value, dipu::devapis::deviceStream_t stream) { T* arr = new T[size]; std::fill(arr, arr + size, value); - check_cuda_error(cudaMemcpyAsync(devptr, arr, sizeof(T) * size, cudaMemcpyHostToDevice, stream)); + check_cuda_error(dipu::devapis::memCopyH2DAsync(sizeof(T) * size, devptr, arr)); delete[] arr; } -template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); +template void deviceFill(float* devptr, size_t size, float value, dipu::devapis::deviceStream_t stream); +#ifndef DIOPI_ENABLE template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream); #endif @@ -110,11 +121,13 @@ template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t str template void cudaD2Hcpy(T* tgt, const T* src, const size_t size) { - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost)); + check_cuda_error(devapis::memCopyD2H(sizeof(T) * size, tgt, src); } template void cudaD2Hcpy(float* tgt, const float* src, size_t size); +#ifndef DIOPI_ENABLE template void cudaD2Hcpy(half* tgt, const half* src, size_t size); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void cudaD2Hcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); #endif @@ -133,11 +146,13 @@ void cudaH2Dcpy(T* tgt, const T* src, const size_t size) if (tgt == nullptr || src == nullptr) { TM_LOG_ERROR("cudaH2Dcpy: dst=%p src=%p, size=%d", tgt, src, (int)(sizeof(T) * size)); } - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice)); + check_cuda_error(dipu::devapis::memCopyH2D(sizeof(T) * size, tgt, src)); } template void cudaH2Dcpy(float* tgt, const float* src, size_t size); +#ifndef DIOPI_ENABLE template void cudaH2Dcpy(half* tgt, const half* src, size_t size); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void cudaH2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); #endif @@ -153,11 +168,13 @@ template void cudaH2Dcpy(int8_t* tgt, const int8_t* src, size_t size); template void cudaD2Dcpy(T* tgt, const T* src, const size_t size) { - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice)); + check_cuda_error(dipu::devapis::memCopyD2D(sizeof(T) * size, devapis::current_device(), tgt, devapis::current_device(), src)); } template void cudaD2Dcpy(float* tgt, const float* src, size_t size); +#ifndef DIOPI_ENABLE template void cudaD2Dcpy(half* tgt, const half* src, size_t size); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void cudaD2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); #endif @@ -169,6 +186,7 @@ template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t si #endif template void cudaD2Dcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); +#ifndef DIOPI_ENABLE template __global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) { @@ -200,18 +218,77 @@ template void invokeCudaCast(__nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream); #endif +#endif // DIOPI_ENABLE + +template +void cudaH2DCpy(T* tgt, const T* src, const size_t size, dipu::devapis::deviceStream_t stream) +{ + if (stream != NULL) { + check_cuda_error(dipu::devapis::memCopyH2DAsync(stream, sizeof(T) * size, tgt, src)); + } + else { + check_cuda_error(dipu::devapis::memCopyH2D(sizeof(T) * size, tgt, src)); + } +} + +template +void cudaD2DCpy(T* tgt, const T* src, const size_t size, dipu::devapis::deviceStream_t stream) +{ + if (stream != NULL) { + check_cuda_error(dipu::devapis::memCopyD2DAsync(stream, sizeof(T) * size, dipu::devapis::current_device(), tgt, dipu::devapis::current_device(), src)); + } + else { + check_cuda_error(dipu::devapis::memCopyD2D(sizeof(T) * size, dipu::devapis::current_device(), tgt, dipu::devapis::current_device(), src)); + } +} template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream) +void cudaD2HCpy(T* tgt, const T* src, const size_t size, dipu::devapis::deviceStream_t stream) { if (stream != NULL) { - check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream)); + check_cuda_error(dipu::devapis::memCopyD2HAsync(stream, sizeof(T) * size, tgt, src)); } else { - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault)); + check_cuda_error(dipu::devapis::memCopyD2H(sizeof(T) * size, tgt, src)); } } +template void cudaH2DCpy(float* tgt, const float* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(float* tgt, const float* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(float* tgt, const float* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(int* tgt, const int* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(int* tgt, const int* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(int* tgt, const int* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(bool* tgt, const bool* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(bool* tgt, const bool* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(bool* tgt, const bool* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(int8_t* tgt, const int8_t* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(int8_t* tgt, const int8_t* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(int8_t* tgt, const int8_t* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(uint* tgt, const uint* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(uint* tgt, const uint* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(uint* tgt, const uint* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(unsigned long long* tgt, const unsigned long long* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(unsigned long long* tgt, const unsigned long long* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(unsigned long long* tgt, const unsigned long long* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(char* tgt, const char* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(char* tgt, const char* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(char* tgt, const char* src, const size_t size, dipu::devapis::deviceStream_t stream); + +template void cudaH2DCpy(float const** tgt, float const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(float const** tgt, float const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(float const** tgt, float const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(bool const** tgt, bool const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(bool const** tgt, bool const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(bool const** tgt, bool const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(int8_t const** tgt, int8_t const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(int8_t const** tgt, int8_t const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(int8_t const** tgt, int8_t const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaH2DCpy(unsigned long long const** tgt, unsigned long long const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2DCpy(unsigned long long const** tgt, unsigned long long const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); +template void cudaD2HCpy(unsigned long long const** tgt, unsigned long long const* const* src, const size_t size, dipu::devapis::deviceStream_t stream); + +#ifndef DIOPI_ENABLE template void cudaAutoCpy(float* tgt, const float* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(half* tgt, const half* src, size_t size, cudaStream_t stream); #ifdef ENABLE_BF16 @@ -234,7 +311,9 @@ template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream); template void cudaAutoCpy(unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream); +#endif // DIOPI_ENABLE +#ifndef DIOPI_ENABLE template __global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, const int seq_offset) { @@ -298,6 +377,7 @@ template void cudaRandomUniform(char* buffer, const size_t size); #ifdef ENABLE_FP8 template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size); #endif +#endif // DIOPI_ENABLE // loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or // the product of the elements in shape is 0, this function will return an empty vector. @@ -459,15 +539,17 @@ int loadWeightFromBinFunc(T* ptr, return 0; } - if (std::is_same::value == true) { + if (std::is_same::value == true) {// now only cudaH2Dcpy(ptr, (T*)host_array.data(), host_array.size()); } else { +#ifndef DIOPI_ENABLE T_IN* ptr_2 = nullptr; deviceMalloc(&ptr_2, host_array.size(), false); cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); deviceFree(ptr_2); +#endif // DIOPI_ENABLE } return 0; } @@ -476,6 +558,7 @@ template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename, std::vector slices); +#ifndef DIOPI_ENABLE template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename, @@ -488,6 +571,7 @@ template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename, std::vector slices); +#endif // DIOPI_ENABLE template int loadWeightFromBinFunc(int8_t* ptr, std::vector shape, std::string filename, @@ -536,6 +620,7 @@ int loadWeightFromBin(T* ptr, case FtCudaDataType::FP32: loadWeightFromBinFunc(ptr, shape, filename, slices); break; +#ifndef DIOPI_ENABLE case FtCudaDataType::FP16: loadWeightFromBinFunc(ptr, shape, filename, slices); break; @@ -552,6 +637,7 @@ int loadWeightFromBin(T* ptr, loadWeightFromBinFunc(ptr, shape, filename, slices); break; #endif +#endif // DIOPI_ENABLE default: TM_LOG_ERROR("Does not support FtCudaDataType=%d", model_file_type); FT_CHECK(false); @@ -559,6 +645,7 @@ int loadWeightFromBin(T* ptr, return 0; } +#ifndef DIOPI_ENABLE template<> int loadWeightFromBin(int* ptr, std::vector shape, @@ -569,12 +656,14 @@ int loadWeightFromBin(int* ptr, loadWeightFromBinFunc(ptr, shape, filename, slices); return 0; } +#endif // DIOPI_ENABLE template int loadWeightFromBin(float* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type, std::vector slices); +#ifndef DIOPI_ENABLE template int loadWeightFromBin(half* ptr, std::vector shape, std::string filename, @@ -599,12 +688,14 @@ template int loadWeightFromBin(__nv_fp8_e4m3* ptr, FtCudaDataType model_file_type, std::vector slices); #endif +#endif // DIOPI_ENABLE template int loadWeightFromBin(int* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type, std::vector slices); +#ifndef DIOPI_ENABLE template __global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size) { @@ -637,324 +728,325 @@ template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, const int* src, const template void invokeCudaD2DcpyConvert(float* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t stream); template void invokeCudaD2DcpyConvert(int* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t stream); #endif // ENABLE_BF16 - -template -__global__ void -cudaD2DScaleCpyConvert(T_OUT* dst, const T_IN* src, const float* scale, bool invert_scale, const size_t size) -{ - const float scale_value = invert_scale ? 1.0f / scale[0] : scale[0]; - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = cuda_cast(cuda_cast(src[tid]) * scale_value); - } -} - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream) -{ - cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size); -} +#endif // DIOPI_ENABLE + +// template +// __global__ void +// cudaD2DScaleCpyConvert(T_OUT* dst, const T_IN* src, const float* scale, bool invert_scale, const size_t size) +// { +// const float scale_value = invert_scale ? 1.0f / scale[0] : scale[0]; +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { +// dst[tid] = cuda_cast(cuda_cast(src[tid]) * scale_value); +// } +// } + +// template +// void invokeCudaD2DScaleCpyConvert( +// T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream) +// { +// cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size); +// } // clang-format off -template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 -#ifdef ENABLE_FP8 -template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_FP8 +// template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// #ifdef ENABLE_BF16 +// template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// #endif // ENABLE_BF16 +// #ifdef ENABLE_FP8 +// template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); +// #endif // ENABLE_FP8 // clang-format on -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -template -void saveToBinary(const T* ptr, const size_t size, std::string filename) -{ - - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::vector float_ptr(size); - for (size_t i = 0; i < size; i++) { - float_ptr[i] = (float)h_ptr[i]; - } - - std::ofstream out(filename, std::ios::out | std::ios::binary); - FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - - out.write((char*)float_ptr.data(), size * sizeof(float)); -} - -template void saveToBinary(const float* ptr, const size_t size, std::string filename); -template void saveToBinary(const half* ptr, const size_t size, std::string filename); -#ifdef ENABLE_BF16 -template void saveToBinary(const __nv_bfloat16* ptr, const size_t size, std::string filename); -#endif // ENABLE_BF16 - -template<> -void saveToBinary(const int* ptr, const size_t size, std::string filename) -{ - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::ofstream out(filename, std::ios::out | std::ios::binary); - FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - out.write((char*)h_ptr.data(), size * sizeof(int)); -} - -template -__global__ void fakeCast(T_IN* input_ptr, const size_t size) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { - T_fake_type tmp_val = (T_fake_type)((float)input_ptr[i]); - input_ptr[i] = (T_IN)((float)tmp_val); - } -} - -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream) -{ - dim3 block(256); - dim3 grid((size + 255) / 256); - fakeCast<<>>(input_ptr, size); -} - -#ifdef ENABLE_FP8 -__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (float)(src[tid]); - } -} - -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (half)((float)(src[tid])); - } -} - -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -#endif // ENABLE_FP8 - -template -__global__ void transpose(T_OUT* dst, T_IN* src, const int dim0, const int dim1) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) { - const int src_col_id = tid % dim1; - const int src_row_id = tid / dim1; - dst[src_col_id * dim0 + src_row_id] = (T_OUT)(src[tid]); - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1) -{ - // copy data to workspace, and then transpose from workspace to data - cudaD2Dcpy(workspace, data, dim0 * dim1); - transpose<<<256, 256>>>(data, workspace, dim0, dim1); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose(__nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose(float* data, float* workspace, const int dim0, const int dim1); - -template -__global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2, const int dim3) -{ - // src permutation: [0, 1, 2, 3] - // dst permutation: [0, 2, 1, 3] - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3; - tid += blockDim.x * gridDim.x) { - int tmp_idx = tid; - const int dim_3_idx = tmp_idx % dim3; - tmp_idx = (tmp_idx - dim_3_idx) / dim3; - const int dim_2_idx = tmp_idx % dim2; - tmp_idx = (tmp_idx - dim_2_idx) / dim2; - const int dim_1_idx = tmp_idx % dim1; - tmp_idx = (tmp_idx - dim_1_idx) / dim1; - const int dim_0_idx = tmp_idx % dim0; - dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid]; - } -} - -template -void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3) -{ - // copy data to workspace, and then transpose from workspace to data - // Note that this kernel is used for pre-processing and not very efficient. - cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3); - transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose0213( - __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2, const int dim3); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose0213( - __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2, const int dim3); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose0213( - float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3); - -template -__global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2) -{ - // src permutation: [0, 1, 2] - // dst permutation: [1, 0, 2] - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x) { - int tmp_idx = tid; - const int dim_2_idx = tmp_idx % dim2; - tmp_idx = (tmp_idx - dim_2_idx) / dim2; - const int dim_1_idx = tmp_idx % dim1; - tmp_idx = (tmp_idx - dim_1_idx) / dim1; - const int dim_0_idx = tmp_idx % dim0; - dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid]; - } -} - -template -void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2) -{ - // copy data to workspace, and then transpose from workspace to data - // Note that this kernel is used for pre-processing and not very efficient. - cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2); - transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose102( - __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose102( - __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose102(float* data, float* workspace, const int dim0, const int dim1, const int dim2); - -template -void __global__ multiplyScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { - tensor[index] = (T)(((float)tensor[index]) * scale); - } -} - -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - multiplyScale<<>>(tensor, scale, size); -} - -template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif - -template -void __global__ divideScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { - tensor[index] = (T)(((float)tensor[index]) / scale); - } -} - -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - divideScale<<>>(tensor, scale, size); -} - -template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_BF16 -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void -invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -#endif -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -template void -invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -#endif +// void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream) +// { +// invokeCudaD2DcpyConvert(dst, src, size, stream); +// } + +// void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream) +// { +// invokeCudaD2DcpyConvert(dst, src, size, stream); +// } + +// template +// void saveToBinary(const T* ptr, const size_t size, std::string filename) +// { + +// std::vector h_ptr(size); +// cudaD2Hcpy(h_ptr.data(), ptr, size); +// std::vector float_ptr(size); +// for (size_t i = 0; i < size; i++) { +// float_ptr[i] = (float)h_ptr[i]; +// } + +// std::ofstream out(filename, std::ios::out | std::ios::binary); +// FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); + +// out.write((char*)float_ptr.data(), size * sizeof(float)); +// } + +// template void saveToBinary(const float* ptr, const size_t size, std::string filename); +// template void saveToBinary(const half* ptr, const size_t size, std::string filename); +// #ifdef ENABLE_BF16 +// template void saveToBinary(const __nv_bfloat16* ptr, const size_t size, std::string filename); +// #endif // ENABLE_BF16 + +// template<> +// void saveToBinary(const int* ptr, const size_t size, std::string filename) +// { +// std::vector h_ptr(size); +// cudaD2Hcpy(h_ptr.data(), ptr, size); +// std::ofstream out(filename, std::ios::out | std::ios::binary); +// FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); +// out.write((char*)h_ptr.data(), size * sizeof(int)); +// } + +// template +// __global__ void fakeCast(T_IN* input_ptr, const size_t size) +// { +// for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { +// T_fake_type tmp_val = (T_fake_type)((float)input_ptr[i]); +// input_ptr[i] = (T_IN)((float)tmp_val); +// } +// } + +// template +// void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream) +// { +// dim3 block(256); +// dim3 grid((size + 255) / 256); +// fakeCast<<>>(input_ptr, size); +// } + +// #ifdef ENABLE_FP8 +// __global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size) +// { +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { +// dst[tid] = (float)(src[tid]); +// } +// } + +// void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) +// { +// cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size); +// } + +// __global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size) +// { +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { +// dst[tid] = (half)((float)(src[tid])); +// } +// } + +// void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) +// { +// cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size); +// } + +// __global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size) +// { +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { +// dst[tid] = (__nv_fp8_e4m3)src[tid]; +// } +// } + +// void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream) +// { +// cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); +// } + +// __global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size) +// { +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { +// dst[tid] = (__nv_fp8_e4m3)src[tid]; +// } +// } + +// void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream) +// { +// cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size); +// } + +// __global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size) +// { +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { +// dst[tid] = (__nv_fp8_e4m3)src[tid]; +// } +// } + +// void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream) +// { +// cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); +// } + +// #endif // ENABLE_FP8 + +// template +// __global__ void transpose(T_OUT* dst, T_IN* src, const int dim0, const int dim1) +// { +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) { +// const int src_col_id = tid % dim1; +// const int src_row_id = tid / dim1; +// dst[src_col_id * dim0 + src_row_id] = (T_OUT)(src[tid]); +// } +// } + +// template +// void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1) +// { +// // copy data to workspace, and then transpose from workspace to data +// cudaD2Dcpy(workspace, data, dim0 * dim1); +// transpose<<<256, 256>>>(data, workspace, dim0, dim1); +// } + +// #ifdef ENABLE_FP8 +// template void invokeInPlaceTranspose(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1); +// #endif // ENABLE_FP8 +// #ifdef ENABLE_BF16 +// template void invokeInPlaceTranspose(__nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1); +// #endif // ENABLE_BF16 +// template void invokeInPlaceTranspose(float* data, float* workspace, const int dim0, const int dim1); + +// template +// __global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2, const int dim3) +// { +// // src permutation: [0, 1, 2, 3] +// // dst permutation: [0, 2, 1, 3] +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3; +// tid += blockDim.x * gridDim.x) { +// int tmp_idx = tid; +// const int dim_3_idx = tmp_idx % dim3; +// tmp_idx = (tmp_idx - dim_3_idx) / dim3; +// const int dim_2_idx = tmp_idx % dim2; +// tmp_idx = (tmp_idx - dim_2_idx) / dim2; +// const int dim_1_idx = tmp_idx % dim1; +// tmp_idx = (tmp_idx - dim_1_idx) / dim1; +// const int dim_0_idx = tmp_idx % dim0; +// dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid]; +// } +// } + +// template +// void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3) +// { +// // copy data to workspace, and then transpose from workspace to data +// // Note that this kernel is used for pre-processing and not very efficient. +// cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3); +// transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3); +// } + +// #ifdef ENABLE_FP8 +// template void invokeInPlaceTranspose0213( +// __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2, const int dim3); +// #endif // ENABLE_FP8 +// #ifdef ENABLE_BF16 +// template void invokeInPlaceTranspose0213( +// __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2, const int dim3); +// #endif // ENABLE_BF16 +// template void invokeInPlaceTranspose0213( +// float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3); + +// template +// __global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2) +// { +// // src permutation: [0, 1, 2] +// // dst permutation: [1, 0, 2] +// for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2; tid += blockDim.x * gridDim.x) { +// int tmp_idx = tid; +// const int dim_2_idx = tmp_idx % dim2; +// tmp_idx = (tmp_idx - dim_2_idx) / dim2; +// const int dim_1_idx = tmp_idx % dim1; +// tmp_idx = (tmp_idx - dim_1_idx) / dim1; +// const int dim_0_idx = tmp_idx % dim0; +// dst[dim_1_idx * dim0 * dim2 + dim_0_idx * dim2 + dim_2_idx] = src[tid]; +// } +// } + +// template +// void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2) +// { +// // copy data to workspace, and then transpose from workspace to data +// // Note that this kernel is used for pre-processing and not very efficient. +// cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2); +// transpose102<<<256, 256>>>(data, workspace, dim0, dim1, dim2); +// } + +// #ifdef ENABLE_FP8 +// template void invokeInPlaceTranspose102( +// __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2); +// #endif // ENABLE_FP8 +// #ifdef ENABLE_BF16 +// template void invokeInPlaceTranspose102( +// __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2); +// #endif // ENABLE_BF16 +// template void invokeInPlaceTranspose102(float* data, float* workspace, const int dim0, const int dim1, const int dim2); + +// template +// void __global__ multiplyScale(T* tensor, float scale, const size_t size) +// { +// for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { +// tensor[index] = (T)(((float)tensor[index]) * scale); +// } +// } + +// template +// void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream) +// { +// int block = 256; +// int grid = (size + 255) / 256; +// multiplyScale<<>>(tensor, scale, size); +// } + +// template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream); +// template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream); +// #ifdef ENABLE_BF16 +// template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); +// #endif +// #ifdef ENABLE_FP8 +// template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); +// #endif + +// template +// void __global__ divideScale(T* tensor, float scale, const size_t size) +// { +// for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { +// tensor[index] = (T)(((float)tensor[index]) / scale); +// } +// } + +// template +// void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream) +// { +// int block = 256; +// int grid = (size + 255) / 256; +// divideScale<<>>(tensor, scale, size); +// } + +// template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream); +// template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream); +// #ifdef ENABLE_BF16 +// template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); +// #endif +// #ifdef ENABLE_FP8 +// template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); +// #endif +// #ifdef ENABLE_BF16 +// template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); +// template void +// invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); +// template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); +// #endif +// template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); +// template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); +// #ifdef ENABLE_FP8 +// template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); +// template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); +// template void +// invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); +// #endif size_t cuda_datatype_size(FtCudaDataType dt) { @@ -969,32 +1061,32 @@ size_t cuda_datatype_size(FtCudaDataType dt) return sizes.at(dt); } -template -__global__ void check_range(T* buffer, size_t size, T min, T max, bool* d_within_range) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { - const T val = buffer[i]; - if (val < min || val > max) { - *d_within_range = false; - } - } -} - -template -bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream) -{ - cudaMemsetAsync(d_within_range, true, sizeof(bool), stream); - - dim3 block(256); - dim3 grid((size + 255) / 256); - check_range<<>>(buffer, size, min, max, d_within_range); - - bool result; - cudaD2Hcpy(&result, d_within_range, 1); - return result; -} - -template bool -invokeCheckRange(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream); +// template +// __global__ void check_range(T* buffer, size_t size, T min, T max, bool* d_within_range) +// { +// for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { +// const T val = buffer[i]; +// if (val < min || val > max) { +// *d_within_range = false; +// } +// } +// } + +// template +// bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream) +// { +// cudaMemsetAsync(d_within_range, true, sizeof(bool), stream); + +// dim3 block(256); +// dim3 grid((size + 255) / 256); +// check_range<<>>(buffer, size, min, max, d_within_range); + +// bool result; +// cudaD2Hcpy(&result, d_within_range, 1); +// return result; +// } + +// template bool +// invokeCheckRange(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index e51c903905..24cdae18aa 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -17,8 +17,6 @@ #pragma once #include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { @@ -71,8 +69,10 @@ std::vector loadArrayFromBin(std::vector shape, // std::string filename, // FtCudaDataType model_file_type = FtCudaDataType::FP32); +#ifndef DIOPI_ENABLE void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream); void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream); +#endif // DIOPI_ENABLE #ifdef ENABLE_FP8 void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream); void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream); @@ -84,51 +84,55 @@ void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const si void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream); #endif // ENABLE_BF16 +#ifndef DIOPI_ENABLE template void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream); +#endif // DIOPI_ENABLE + +// template +// __inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims]) +// { +// size_t flat_idx = 0; +// for (size_t i = 0; i < n_dims; i++) { +// flat_idx += idx[i]; +// if (i + 1 < n_dims) +// flat_idx *= dims[i + 1]; +// } +// return flat_idx; +// } + +// template +// __inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims]) +// { +// for (int i = n_dims - 1; i >= 0; i--) { +// idx[i] = flat_idx % dims[i]; +// flat_idx /= dims[i]; +// } +// } -template -__inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims]) -{ - size_t flat_idx = 0; - for (size_t i = 0; i < n_dims; i++) { - flat_idx += idx[i]; - if (i + 1 < n_dims) - flat_idx *= dims[i + 1]; - } - return flat_idx; -} - -template -__inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims]) -{ - for (int i = n_dims - 1; i >= 0; i--) { - idx[i] = flat_idx % dims[i]; - flat_idx /= dims[i]; - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1); +// template +// void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1); -template -void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3); +// template +// void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3); -template -void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2); +// template +// void invokeInPlaceTranspose102(T* data, T* workspace, const int dim0, const int dim1, const int dim2); -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream); +// template +// void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream); -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream); +// template +// void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream); +#ifndef DIOPI_ENABLE template void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream = 0); +#endif // DIOPI_ENABLE -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0); +// template +// void invokeCudaD2DScaleCpyConvert( +// T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0); inline bool checkIfFileExist(const std::string& file_path) { @@ -143,12 +147,12 @@ inline bool checkIfFileExist(const std::string& file_path) template void saveToBinary(const T* ptr, const size_t size, std::string filename); -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream); +// template +// void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream); size_t cuda_datatype_size(FtCudaDataType dt); -template -bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream); +// template +// bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/utils/nccl_utils.cc b/src/turbomind/utils/nccl_utils.cc index bd669ac227..1e0b8b57f8 100644 --- a/src/turbomind/utils/nccl_utils.cc +++ b/src/turbomind/utils/nccl_utils.cc @@ -21,6 +21,7 @@ namespace turbomind { #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE template ncclDataType_t getNcclDataType() { @@ -52,129 +53,172 @@ ncclDataType_t getNcclDataType() } return nccl_data_type; } +#endif // DIOPI_ENABLE #endif template -void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream) +void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE ncclDataType_t nccl_data_type = getNcclDataType(); NCCLCHECK(ncclGroupStart()); NCCLCHECK(ncclAllReduce( (const void*)send_buf, (void*)recv_buf, data_size, nccl_data_type, ncclSum, nccl_param.nccl_comm_, stream)); NCCLCHECK(ncclGroupEnd()); +#else + dipu::devapis::diclRawGroupStart(); + dipu::devapis::diclRawAllReduce( + (const void*)send_buf, (void*)recv_buf, data_size, dipu::devapis::getDiclDataType(), dipu::devapis::DiclReduceOp::SUM, nccl_param.nccl_comm_, stream + ) + dipu::devapis::diclRawGroupEnd(); +#endif // DIOPI_ENABLE #endif + TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template void ftNcclAllGather( - const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, cudaStream_t stream) + const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, deviceStream_t stream) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE ncclDataType_t nccl_data_type = getNcclDataType(); NCCLCHECK(ncclGroupStart()); NCCLCHECK( ncclAllGather(send_buf + rank * data_size, recv_buf, data_size, nccl_data_type, nccl_param.nccl_comm_, stream)); NCCLCHECK(ncclGroupEnd()); +#else + dipu::devapis::diclRawGroupStart(); + dipu::devapis::diclRawAllGather( + (const void*)send_buf, (void*)recv_buf, data_size, dipu::devapis::getDiclDataType(), dipu::devapis::DiclReduceOp::SUM, nccl_param.nccl_comm_, stream + ) + dipu::devapis::diclRawGroupEnd(); +#endif // DIOPI_ENABLE #endif TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template -void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream) +void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE ncclDataType_t nccl_data_type = getNcclDataType(); NCCLCHECK(ncclSend(send_buf, data_size, nccl_data_type, peer, nccl_param.nccl_comm_, stream)); +#else + dipu::devapis::diclRawSend(send_buf, data_size, dipu::devapis::getDiclDataType(), peer, nccl_param.nccl_comm_, stream); +#endif // DIOPI_ENABLE #endif TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template void -ftNcclSend(const float* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclSend(const float* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#ifndef DIOPI_ENABLE template void -ftNcclSend(const half* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclSend(const half* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void ftNcclSend( - const __nv_bfloat16* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); + const __nv_bfloat16* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); #endif template void -ftNcclSend(const int* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclSend(const int* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); template void -ftNcclSend(const bool* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclSend(const bool* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#ifndef DIOPI_ENABLE template void -ftNcclSend(const char* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclSend(const char* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE template -void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream) +void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE ncclDataType_t nccl_data_type = getNcclDataType(); NCCLCHECK(ncclRecv(recv_buf, data_size, nccl_data_type, peer, nccl_param.nccl_comm_, stream)); +#else + dipu::devapis::diclRawRecv(recv_buf, data_size, dipu::devapis::getDiclDataType(), peer, nccl_param.nccl_comm_, stream); +#endif // DIOPI_ENABLE #endif TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template void -ftNcclRecv(float* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclRecv(float* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#ifndef DIOPI_ENABLE template void -ftNcclRecv(half* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclRecv(half* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void -ftNcclRecv(__nv_bfloat16* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclRecv(__nv_bfloat16* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); #endif -template void ftNcclRecv(int* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +template void ftNcclRecv(int* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); template void -ftNcclRecv(bool* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclRecv(bool* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#ifndef DIOPI_ENABLE template void -ftNcclRecv(char* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +ftNcclRecv(char* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE template -void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream) +void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE ncclDataType_t nccl_data_type = getNcclDataType(); NCCLCHECK(ncclBcast(buff, data_size, nccl_data_type, root, nccl_param.nccl_comm_, stream)); +#else + dipu::devapis::getDiclDataType(), root, nccl_param.nccl_comm_, stream); +#endif // DIOPI_ENABLE #endif TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } +#ifndef DIOPI_ENABLE template void -ftNcclBroadCast(char* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +ftNcclBroadCast(char* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE template void -ftNcclBroadCast(bool* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +ftNcclBroadCast(bool* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); template void -ftNcclBroadCast(int* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +ftNcclBroadCast(int* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); template void -ftNcclBroadCast(float* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +ftNcclBroadCast(float* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); +#ifndef DIOPI_ENABLE template void -ftNcclBroadCast(half* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +ftNcclBroadCast(half* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void -ftNcclBroadCast(__nv_bfloat16* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +ftNcclBroadCast(__nv_bfloat16* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); #endif template void ftNcclAllReduceSum( - const float* send_buf, float* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream); - + const float* send_buf, float* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream); +#ifndef DIOPI_ENABLE template void ftNcclAllReduceSum( - const half* send_buf, half* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream); - + const half* send_buf, half* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream); +#endif // DIOPI_ENABLE template void ftNcclAllReduceSum( - const int32_t* send_buf, int32_t* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream); + const int32_t* send_buf, int32_t* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream); #ifdef ENABLE_BF16 template void ftNcclAllReduceSum(const __nv_bfloat16* send_buf, __nv_bfloat16* recv_buf, const int data_size, NcclParam nccl_param, - cudaStream_t stream); + deviceStream_t stream); #endif template void ftNcclAllGather(const float* send_buf, @@ -182,35 +226,43 @@ template void ftNcclAllGather(const float* send_buf, const int data_size, const int rank, NcclParam nccl_param, - cudaStream_t stream); - + deviceStream_t stream); +#ifndef DIOPI_ENABLE template void ftNcclAllGather(const half* send_buf, half* recv_buf, const int data_size, const int rank, NcclParam nccl_param, - cudaStream_t stream); - + deviceStream_t stream); +#endif // DIOPI_ENABLE #ifdef ENABLE_BF16 template void ftNcclAllGather(const __nv_bfloat16* send_buf, __nv_bfloat16* recv_buf, const int data_size, const int rank, NcclParam nccl_param, - cudaStream_t stream); + deviceStream_t stream); #endif void ftNcclGroupStart() { #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE NCCLCHECK(ncclGroupStart()); +#else + dipu::devapis::diclRawGroupStart() +#endif // DIOPI_ENABLE #endif } void ftNcclGroupEnd() { #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE NCCLCHECK(ncclGroupEnd()); +#else + dipu::devapis::diclRawGroupEnd() +#endif // DIOPI_ENABLE #endif } @@ -277,7 +329,11 @@ void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para, cud void ftNcclGetUniqueId(NcclUid& uid) { #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE NCCLCHECK(ncclGetUniqueId(&uid.nccl_uid_)); +#else + dipu::devapis::diclGetUniqueId(&uid.nccl_uid_); +#endif // DIOPI_ENABLE #endif } @@ -293,7 +349,11 @@ void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, param.rank_ = rank; param.world_size_ = world_size; param.nccl_uid_ = uid.nccl_uid_; +#ifndef DIOPI_ENABLE NCCLCHECK(ncclCommInitRank(¶m.nccl_comm_, param.world_size_, param.nccl_uid_, param.rank_)); +#else + dipu::devapis::diclCommInitRank(¶m.nccl_comm_, param.world_size_, param.nccl_uid_, param.rank_); +#endif // DIOPI_ENABLE #endif TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } @@ -302,7 +362,11 @@ void ftNcclParamDestroy(NcclParam& param) { #ifdef BUILD_MULTI_GPU if (param.nccl_comm_ != nullptr) { +#ifndef DIOPI_ENABLE ncclCommDestroy(param.nccl_comm_); +#else + dipu::devapis::diclCommDestroy(param.nccl_comm_); +#endif } #endif } diff --git a/src/turbomind/utils/nccl_utils.h b/src/turbomind/utils/nccl_utils.h index 9827297c5a..6d796212db 100644 --- a/src/turbomind/utils/nccl_utils.h +++ b/src/turbomind/utils/nccl_utils.h @@ -19,20 +19,27 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" -#include #ifdef BUILD_MULTI_GPU +#ifndef DIOPI_ENABLE #include +#else +#include "rawdiclapis.h" +#endif // DIOPI_ENABLE #endif #include #include +#ifndef DIOPI_ENABLE +#include #if defined(NCCL_VERSION_CODE) && (NCCL_VERSION_CODE >= 21003) #define ENABLE_BF16_NCCL #endif +#endif // DIOPI_ENABLE namespace turbomind { #ifdef BUILD_MULTI_GPU -#define NCCLCHECK(cmd) \ +#define NCCLCHECK(cmd) +#ifndef DIOPI_ENABLE \ do { \ ncclResult_t r = cmd; \ if (r != ncclSuccess) { \ @@ -41,6 +48,7 @@ namespace turbomind { exit(EXIT_FAILURE); \ } \ } while (0) +#endif // DIOPI_ENABLE #else #define NCCLCHECK(cmd) printf("[WARNING} No NCCL"); #endif @@ -50,7 +58,7 @@ struct NcclUid { NcclUid(){}; NcclUid(NcclUid const& uid){}; #else - ncclUniqueId nccl_uid_; + dipu::commUniqueId nccl_uid_; NcclUid(){}; NcclUid(NcclUid const& uid): nccl_uid_(uid.nccl_uid_){}; #endif @@ -61,8 +69,8 @@ struct NcclParam { int world_size_{1}; int group_id_{0}; #ifdef BUILD_MULTI_GPU - ncclUniqueId nccl_uid_{}; - ncclComm_t nccl_comm_ = nullptr; + dipu::commUniqueId nccl_uid_{}; + dipu::diclComm_t nccl_comm_ = nullptr; #endif #ifdef BUILD_MULTI_GPU @@ -92,28 +100,28 @@ struct NcclParam { // New APIs template -void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream); +void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, deviceStream_t stream); template void ftNcclAllGather( - const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, cudaStream_t stream); + const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, deviceStream_t stream); template -void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream); +void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, deviceStream_t stream); template -void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); template -void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream); +void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, deviceStream_t stream); // nccl stream synchronize, abort nccl comms and throw errors when nccl async errors detected -void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para_, cudaStream_t stream); +void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para_, deviceStream_t stream); void ftNcclGroupStart(); void ftNcclGroupEnd(); void ftNcclGetUniqueId(NcclUid& uid); -void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, const NcclUid uid); +void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, const commUniqueId uid); void ftNcclParamDestroy(NcclParam& param); int ftNcclNextGroupId();