PaddlePaddle · sneaxiy · May 27, 2026 · May 26, 2026
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
@@ -11,32 +11,37 @@ if(WITH_NV_JETSON)
   set(paddle_known_gpu_archs10 "53 62 72")
   set(paddle_known_gpu_archs11 "53 62 72 87")
   set(paddle_known_gpu_archs12 "53 62 72 87 90 100")
+  set(paddle_known_gpu_archs13 "87 90 100")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Package")
   add_definitions(-DNEW_RELEASE_ALL)
   set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
   set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Package")
   add_definitions(-DNEW_RELEASE_PYPI)
   set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
   set(paddle_known_gpu_archs10 "")
   set(paddle_known_gpu_archs11 "61 70 75 80")
   set(paddle_known_gpu_archs12 "61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 elseif(NEW_RELEASE_JIT)
   message("Using New Release Strategy - JIT Package")
   add_definitions(-DNEW_RELEASE_JIT)
   set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90 100")
   set(paddle_known_gpu_archs10 "50 60 70 75")
   set(paddle_known_gpu_archs11 "50 60 70 75 80")
   set(paddle_known_gpu_archs12 "50 60 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 else()
   set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90 100")
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
   set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90 100")
+  set(paddle_known_gpu_archs13 "75 80 86 90 100")
 endif()
 
 ######################################################################################
@@ -289,6 +294,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 14.0) # CUDA 13.0+
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs13})
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
 # Fix ARM NEON conflict with CUDA on aarch64 platforms.

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
@@ -604,7 +604,8 @@ if(WITH_GPU
    AND NOT WITH_ARM
    AND NOT WIN32
    AND NOT APPLE)
-  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3)
+  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.3
+     AND ${CMAKE_CUDA_COMPILER_VERSION} LESS_EQUAL 12.9)
     foreach(arch ${NVCC_ARCH_BIN})
       if(${arch} GREATER_EQUAL 90)
         set(WITH_FLASHATTN_V3 ON)

@@ -197,10 +197,26 @@ struct KernelRegistrar {
   ::phi::KernelArgsParseFunctor<                              \
       decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse
 
+// nvcc 13.x crashes in cudafe++ on the explicit instantiation form
+// `template decltype(fn<T, Ctx>) fn<T, Ctx>;`. On non-Windows nvcc >= 13, keep
+// macro registration intact by replacing it with a `used` anchor that takes
+// the function's address, forcing the specialization to be emitted anyway.
+#if defined(__CUDACC__) && !defined(_WIN32) && \
+    defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 13)
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(id, ...)          \
+  static auto* const PD_CONCATENATE(__pd_kernel_instantiation_anchor_, id) \
+      __attribute__((used)) = &__VA_ARGS__;
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION_IMPL(PD_ID, __VA_ARGS__)
+#else
+#define PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(...) \
+  template decltype(__VA_ARGS__) __VA_ARGS__;
+#endif
+
 // The macro for instantiating function kernel
 #define FUNCTION_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, context) \
-  template decltype(meta_kernel_fn<cpp_dtype, context>)                   \
-      meta_kernel_fn<cpp_dtype, context>;
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(                            \
+      meta_kernel_fn<cpp_dtype, context>)
 
 /** PD_REGISTER_KERNEL
  *
@@ -1368,7 +1384,7 @@ struct KernelRegistrar {
 #if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
 #define PD_REGISTER_KERNEL_FOR_ALL_DTYPE(                                \
     kernel_name, backend, layout, kernel_fn)                             \
-  template decltype(kernel_fn) kernel_fn;                                \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                 \
   static void                                                            \
       __FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
           const ::phi::KernelKey& kernel_key UNUSED,                     \
@@ -1391,7 +1407,7 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE(                                    \
     reg_type, kernel_name, backend, layout, kernel_fn)                         \
-  template decltype(kernel_fn) kernel_fn;                                      \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                       \
   static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout(    \
       const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel);              \
   static const ::phi::KernelRegistrar                                          \
@@ -1440,8 +1456,8 @@ struct KernelRegistrar {
 #if (defined(PADDLE_WITH_CUSTOM_DEVICE) && defined(PADDLE_WITH_CUDA))
 #define PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(                        \
     kernel_name, layout, meta_kernel_fn)                                 \
-  template decltype(meta_kernel_fn<::phi::CustomContext>)                \
-      meta_kernel_fn<::phi::CustomContext>;                              \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(                           \
+      meta_kernel_fn<::phi::CustomContext>)                              \
   static void                                                            \
       __FAKE_PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \
           const ::phi::KernelKey kernel_key UNUSED,                      \
@@ -1535,7 +1551,7 @@ struct KernelRegistrar {
 #ifndef _WIN32
 #define ___PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(                           \
     reg_type, kernel_name, backend, layout, kernel_fn, args_def_fn)            \
-  template decltype(kernel_fn) kernel_fn;                                      \
+  PD_DECLTYPE_FUNCTION_TEMPLATE_INSTANTIATION(kernel_fn)                       \
   static const ::phi::KernelRegistrar                                          \
       __reg_phi_kernel_##kernel_name##_##backend##_##layout(                   \
           reg_type,                                                            \

diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -135,8 +135,13 @@ void ArangeKernel(const Context& dev_ctx,
       <<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
 }
 
-template decltype(ArangeNullaryKernel<int64_t, GPUContext>) ArangeNullaryKernel;
-template decltype(ArangeNullaryKernel<int, GPUContext>) ArangeNullaryKernel;
+template void ArangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
+                                                       const int64_t,
+                                                       const int64_t,
+                                                       const int64_t,
+                                                       DenseTensor*);
+template void ArangeNullaryKernel<int, GPUContext>(
+    const GPUContext&, const int, const int, const int, DenseTensor*);
 }  // namespace phi
 
 PD_REGISTER_KERNEL(arange_tensor,

diff --git a/paddle/phi/kernels/gpu/range_kernel.cu b/paddle/phi/kernels/gpu/range_kernel.cu
@@ -135,8 +135,13 @@ void RangeKernel(const Context& dev_ctx,
       <<<grid, block, 0, stream>>>(start_value, step_value, size, out_data);
 }
 
-template decltype(RangeNullaryKernel<int64_t, GPUContext>) RangeNullaryKernel;
-template decltype(RangeNullaryKernel<int, GPUContext>) RangeNullaryKernel;
+template void RangeNullaryKernel<int64_t, GPUContext>(const GPUContext&,
+                                                      const int64_t,
+                                                      const int64_t,
+                                                      const int64_t,
+                                                      DenseTensor*);
+template void RangeNullaryKernel<int, GPUContext>(
+    const GPUContext&, const int, const int, const int, DenseTensor*);
 }  // namespace phi
 
 PD_REGISTER_KERNEL(range_tensor,

diff --git a/python/setup.py.in b/python/setup.py.in
@@ -611,6 +611,7 @@ def get_paddle_extra_install_requirements():
     #(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas. Additionally, it now supports the installation of TensorRT, further enhancing its functionality. This integration simplifies the process as the operation of 'pip install paddle' is no longer dependent on the separate installation of cuda, cudnn, or TensorRT.
     paddle_cuda_requires = []
     paddle_tensorrt_requires = []
+    cuda_major_version = None
     if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON':
         if platform.system() == 'Linux':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
@@ -721,6 +722,23 @@ def get_paddle_extra_install_requirements():
                 "nvidia-cufile==1.15.1.6; platform_system == 'Linux' | "
                 "cuda-python==13.0.3; platform_system == 'Linux'"
             ),
+            "13.2": (
+                "nvidia-cuda-nvrtc==13.2.78; platform_system == 'Linux' | "
+                "nvidia-cuda-runtime==13.2.75; platform_system == 'Linux' | "
+                "nvidia-cuda-cupti==13.2.75; platform_system == 'Linux' | "
+                "nvidia-cudnn-cu13==9.21.0.82; platform_system == 'Linux' | "
+                "nvidia-cublas==13.4.0.1; platform_system == 'Linux' | "
+                "nvidia-cufft==12.2.0.46; platform_system == 'Linux' | "
+                "nvidia-curand==10.4.2.55; platform_system == 'Linux' | "
+                "nvidia-cusolver==12.2.0.1; platform_system == 'Linux' | "
+                "nvidia-cusparse==12.7.10.1; platform_system == 'Linux' | "
+                "nvidia-cusparselt-cu13==0.9.0; platform_system == 'Linux' | "
+                "nvidia-nccl-cu13==2.29.7; platform_system == 'Linux' | "
+                "nvidia-nvtx==13.2.75; platform_system == 'Linux' | "
+                "nvidia-nvjitlink==13.2.78; platform_system == 'Linux' | "
+                "nvidia-cufile==1.17.1.22; platform_system == 'Linux' | "
+                "cuda-python==13.2.0; platform_system == 'Linux'"
+            ),
         }
         if '@WITH_CINN@' == 'ON':
             PADDLE_CUDA_INSTALL_REQUIREMENTS["12.3"] += (
@@ -741,6 +759,9 @@ def get_paddle_extra_install_requirements():
             PADDLE_CUDA_INSTALL_REQUIREMENTS["13.0"] += (
                     " | nvidia-cuda-cccl==13.0.85;platform_system == 'Linux' "
             )
+            PADDLE_CUDA_INSTALL_REQUIREMENTS["13.2"] += (
+                    " | nvidia-cuda-cccl==13.2.75;platform_system == 'Linux' "
+            )
         elif platform.system() == 'Windows':
             PADDLE_CUDA_INSTALL_REQUIREMENTS = {
                 "11.8": (
@@ -811,37 +832,46 @@ def get_paddle_extra_install_requirements():
 
     if '@WITH_PIP_TENSORRT@' == 'ON':
         version_str = get_tensorrt_version()
-        version_default = int(version_str.split(".")[0])
-        if platform.system() =='Linux' or (platform.system()=='Windows' and version_default>=10):
-
+        version_default = int(version_str.split(".")[0]) if version_str else None
+        if platform.system() == 'Linux' and cuda_major_version == '13.2':
+            if not version_str and platform.machine() == 'aarch64':
+                return paddle_cuda_requires, ["tensorrt-cu13==10.16.1.11"]
+            PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
+                "tensorrt-cu13==10.16.1.11",
+            ]
+        elif platform.system() =='Linux' or (platform.system()=='Windows' and version_default is not None and version_default>=10):
             PADDLE_TENSORRT_INSTALL_REQUIREMENTS = [
                 "tensorrt==8.5.3.1",
                 "tensorrt==8.6.0",
                 "tensorrt==8.6.1.post1",
                 "tensorrt==10.3.0",
             ]
+        else:
+            return paddle_cuda_requires, []
 
-            if not version_str:
-                return paddle_cuda_requires,[]
+        if not version_str:
+            return paddle_cuda_requires,[]
 
-            version_main = ".".join(version_str.split(".")[:3])
+        version_main = ".".join(version_str.split(".")[:3])
 
-            matched_package = None
-            for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
-                paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
-                paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
+        matched_package = None
+        for paddle_tensorrt_requires in PADDLE_TENSORRT_INSTALL_REQUIREMENTS:
+            paddle_tensorrt_version = paddle_tensorrt_requires.split("==")[1]
+            paddle_tensorrt_main = ".".join(paddle_tensorrt_version.split(".")[:3])
 
-                if version_main == paddle_tensorrt_main:
-                    matched_package = paddle_tensorrt_requires
-                    break
+            if version_main == paddle_tensorrt_main:
+                matched_package = paddle_tensorrt_requires
+                break
 
-            if matched_package:
-                paddle_tensorrt_requires = [matched_package]
-            else:
-                print(
-                    f"No exact match found for TensorRT Version: {version_str}. We currently support TensorRT versions 8.5.3.1, 8.6.0, and 8.6.1."
-                )
-                return paddle_cuda_requires, []
+        if matched_package:
+            paddle_tensorrt_requires = [matched_package]
+        else:
+            print(
+                "No exact match found for TensorRT Version: "
+                f"{version_str}. We currently support TensorRT versions "
+                "8.5.3.1, 8.6.0, 8.6.1.post1, 10.3.0, and 10.16.1.11."
+            )
+            return paddle_cuda_requires, []
 
     return paddle_cuda_requires,paddle_tensorrt_requires