diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 51e8dad4ce..bec49b29f2 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -102,18 +102,32 @@ struct tile_x_sizes { }; static int get_mmq_x_max_host(const int cc) { - return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 : - GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? + // RDNA3_5 (GFX1151): mmq_x_max=48 for optimal VGPR/performance balance + if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return 48; + } + if (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) { + return 128; + } + if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) { #ifdef GGML_CUDA_FORCE_MMQ - 128 : 64; + return 128; #else - MMQ_DP4A_MAX_BATCH_SIZE : 64; + return MMQ_DP4A_MAX_BATCH_SIZE; #endif // GGML_CUDA_FORCE_MMQ + } + return 64; } static constexpr __device__ int get_mmq_x_max_device() { -#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) + return 128; +#elif defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) + return 48; +#else return 128; +#endif // defined(RDNA3_5) #else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) #if defined(GGML_USE_HIP) @@ -135,8 +149,16 @@ static constexpr __device__ int get_mmq_x_max_device() { } static int get_mmq_y_host(const int cc) { - return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : - ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64); + if (GGML_CUDA_CC_IS_AMD(cc)) { + if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return 64; + } + return 128; + } + if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) { + return 128; + } + return 64; } static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) { @@ -149,7 +171,7 @@ static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIP) -#if defined(RDNA1) +#if defined(RDNA1) || defined(RDNA3_5) return 64; #else return 128; @@ -284,6 +306,9 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/) #if defined(GGML_USE_HIP) static int mmq_get_nwarps_host(const int cc, const int warp_size) { + if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return 4; + } return amd_mfma_available(cc) ? 8 : 256/warp_size; } #else @@ -294,7 +319,11 @@ static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) { static constexpr __device__ int mmq_get_nwarps_device() { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) + return 4; +#else return 8; +#endif // defined(RDNA3_5) #else return 256/ggml_cuda_get_physical_warp_size(); #endif // AMD_MFMA_AVAILABLE