Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmake/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,37 @@ if(WITH_NV_JETSON)
set(paddle_known_gpu_archs10 "53 62 72")
set(paddle_known_gpu_archs11 "53 62 72 87")
set(paddle_known_gpu_archs12 "53 62 72 87 90 100")
set(paddle_known_gpu_archs13 "87 90 100")
elseif(NEW_RELEASE_ALL)
message("Using New Release Strategy - All Arches Package")
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Package")
add_definitions(-DNEW_RELEASE_PYPI)
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
set(paddle_known_gpu_archs10 "")
set(paddle_known_gpu_archs11 "61 70 75 80")
set(paddle_known_gpu_archs12 "61 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
elseif(NEW_RELEASE_JIT)
message("Using New Release Strategy - JIT Package")
add_definitions(-DNEW_RELEASE_JIT)
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
set(paddle_known_gpu_archs10 "50 60 70 75")
set(paddle_known_gpu_archs11 "50 60 70 75 80")
set(paddle_known_gpu_archs12 "50 60 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
else()
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90 100")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90 100")
set(paddle_known_gpu_archs13 "75 80 86 90 100")
endif()

######################################################################################
Expand Down Expand Up @@ -289,6 +294,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 14.0) # CUDA 13.0+
set(paddle_known_gpu_archs ${paddle_known_gpu_archs13})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()

# Fix ARM NEON conflict with CUDA on aarch64 platforms.
Expand Down
3 changes: 2 additions & 1 deletion cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,8 @@ if(WITH_GPU
AND NOT WITH_ARM
AND NOT WIN32
AND NOT APPLE)
if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3
AND ${CMAKE_CUDA_COMPILER_VERSION} LESS_EQUAL 12.9)
foreach(arch ${NVCC_ARCH_BIN})
if(${arch} GREATER_EQUAL 90)
set(WITH_FLASHATTN_V3 ON)
Expand Down
30 changes: 23 additions & 7 deletions paddle/phi/core/kernel_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,26 @@ struct KernelRegistrar {
::phi::KernelArgsParseFunctor< \
decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse

// PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(fn<Args...>)
//
// Forces the function-template specialization named by the argument to be
// instantiated in the current translation unit. Both expansions below already
// end in `;`, so call sites must not append another one.
//
// nvcc 13.x crashes in cudafe++ on the explicit instantiation form
// `template decltype(fn<T, Ctx>) fn<T, Ctx>;`. Keep macro registration intact
// by replacing it with a `used` anchor that still forces the specialization
// to be emitted without hitting the buggy syntax.
//
// The workaround is selected only when compiling with nvcc (__CUDACC__) of
// major version 13 or newer and not targeting Windows.
// NOTE(review): Windows is presumably excluded because `__attribute__((used))`
// is GNU syntax that an MSVC host compiler does not accept -- confirm.
#if defined(__CUDACC__) && !defined(_WIN32) && \
defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 13)
// Workaround path: define a static const pointer initialized with the address
// of the specialization. Taking the address requires the specialization to be
// instantiated, and `used` keeps the otherwise-unreferenced pointer from being
// dropped. `id` is pasted onto the variable name via PD_CONCATENATE.
#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(id, ...) \
static auto* const PD_CONCATENATE(__pd_kernel_instantiation_anchor_, id) \
__attribute__((used)) = &__VA_ARGS__;
// Public entry point for the workaround path.
// NOTE(review): PD_ID is defined outside this excerpt; it is presumably a
// per-expansion unique token so that several anchors can coexist in one
// translation unit -- confirm against its definition.
#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(PD_ID, __VA_ARGS__)
#else
// Default path (every other compiler/version): the original explicit
// instantiation, with the function type recovered through decltype.
#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
template decltype(__VA_ARGS__) __VA_ARGS__;
#endif

// The macro for instantiating function kernel
#define FUNCTION_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, context) \
template decltype(meta_kernel_fn<cpp_dtype, context>) \
meta_kernel_fn<cpp_dtype, context>;
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION( \
meta_kernel_fn<cpp_dtype, context>)

/** PD_REGISTER_KERNEL
*
Expand Down Expand Up @@ -1368,7 +1384,7 @@ struct KernelRegistrar {
#if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
#define PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \
kernel_name, backend, layout, kernel_fn) \
template decltype(kernel_fn) kernel_fn; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn) \
static void \
__FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
const ::phi::KernelKey& kernel_key UNUSED, \
Expand All @@ -1391,7 +1407,7 @@ struct KernelRegistrar {
#ifndef _WIN32
#define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \
reg_type, kernel_name, backend, layout, kernel_fn) \
template decltype(kernel_fn) kernel_fn; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn) \
static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \
static const ::phi::KernelRegistrar \
Expand Down Expand Up @@ -1440,8 +1456,8 @@ struct KernelRegistrar {
#if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
#define PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE( \
kernel_name, layout, meta_kernel_fn) \
template decltype(meta_kernel_fn<::phi::CustomContext>) \
meta_kernel_fn<::phi::CustomContext>; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION( \
meta_kernel_fn<::phi::CustomContext>) \
static void \
__FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
const ::phi::KernelKey kernel_key UNUSED, \
Expand Down Expand Up @@ -1535,7 +1551,7 @@ struct KernelRegistrar {
#ifndef _WIN32
#define ___PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE( \
reg_type, kernel_name, backend, layout, kernel_fn, args_def_fn) \
template decltype(kernel_fn) kernel_fn; \
PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn) \
static const ::phi::KernelRegistrar \
__reg_phi_kernel_##kernel_name##_##backend##_##layout( \
reg_type, \
Expand Down
9 changes: 7 additions & 2 deletions paddle/phi/kernels/gpu/arange_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,13 @@ void ArangeKernel(const Context& dev_ctx,
<<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}

template decltype(ArangeNullaryKernel<int64_t, GPUContext>) ArangeNullaryKernel;
template decltype(ArangeNullaryKernel<int, GPUContext>) ArangeNullaryKernel;
template void ArangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
const int64_t,
const int64_t,
const int64_t,
DenseTensor*);
template void ArangeNullaryKernel<int, GPUContext>(
const GPUContext&, const int, const int, const int, DenseTensor*);
} // namespace phi

PD_REGISTER_KERNEL(arange_tensor,
Expand Down
9 changes: 7 additions & 2 deletions paddle/phi/kernels/gpu/range_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,13 @@ void RangeKernel(const Context& dev_ctx,
<<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
}

template decltype(RangeNullaryKernel<int64_t, GPUContext>) RangeNullaryKernel;
template decltype(RangeNullaryKernel<int, GPUContext>) RangeNullaryKernel;
template void RangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
const int64_t,
const int64_t,
const int64_t,
DenseTensor*);
template void RangeNullaryKernel<int, GPUContext>(
const GPUContext&, const int, const int, const int, DenseTensor*);
} // namespace phi

PD_REGISTER_KERNEL(range_tensor,
Expand Down
70 changes: 50 additions & 20 deletions python/setup.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ def get_paddle_extra_install_requirements():
#(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
paddle_cuda_requires = []
paddle_tensorrt_requires = []
cuda_major_version = None
if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON':
if platform.system() == 'Linux':
PADDLE_CUDA_INSTALL_REQUIREMENTS = {
Expand Down Expand Up @@ -721,6 +722,23 @@ def get_paddle_extra_install_requirements():
"nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
"cuda-python==13.0.3; platform_system == 'Linux'"
),
"13.2": (
"nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
"nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
"nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
"nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
"nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
"nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
"nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
"nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
"cuda-python==13.2.0; platform_system == 'Linux'"
),
}
if '@WITH_CINN@' == 'ON':
PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
Expand All @@ -741,6 +759,9 @@ def get_paddle_extra_install_requirements():
PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
" | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
)
PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
" | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
)
elif platform.system() == 'Windows':
PADDLE_CUDA_INSTALL_REQUIREMENTS = {
"11.8": (
Expand Down Expand Up @@ -811,37 +832,46 @@ def get_paddle_extra_install_requirements():

if '@WITH_PIP_TENSORRT@' == 'ON':
version_str = get_tensorrt_version()
version_default = int(version_str.split(".")[0])
if platform.system() =='Linux' or (platform.system()=='Windows' and version_default>=10):

version_default = int(version_str.split(".")[0]) if version_str else None
if platform.system() == 'Linux' and cuda_major_version == '13.2':
if not version_str and platform.machine() == 'aarch64':
return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
"tensorrt-cu13==10.16.1.11",
]
elif platform.system() =='Linux' or (platform.system()=='Windows' and version_default is not None and version_default>=10):
PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
"tensorrt==8.5.3.1",
"tensorrt==8.6.0",
"tensorrt==8.6.1.post1",
"tensorrt==10.3.0",
]
else:
return paddle_cuda_requires, []

if not version_str:
return paddle_cuda_requires,[]
if not version_str:
return paddle_cuda_requires,[]

version_main = ".".join(version_str.split(".")[:3])
version_main = ".".join(version_str.split(".")[:3])

matched_package = None
for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
matched_package = None
for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])

if version_main == paddle_tensorrt_main:
matched_package = paddle_tensorrt_requires
break
if version_main == paddle_tensorrt_main:
matched_package = paddle_tensorrt_requires
break

if matched_package:
paddle_tensorrt_requires = [matched_package]
else:
print(
f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
)
return paddle_cuda_requires, []
if matched_package:
paddle_tensorrt_requires = [matched_package]
else:
print(
"No exact match found for TensorRT Version: "
f"{version_str}. We currently support TensorRT versions "
"8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
)
return paddle_cuda_requires, []

return paddle_cuda_requires,paddle_tensorrt_requires

Expand Down
Loading
Loading