From e0e3c3fc672370c6bf2969093843f15c5e5b998c Mon Sep 17 00:00:00 2001 From: Sunil Pedapudi Date: Thu, 2 Apr 2026 16:43:55 -0700 Subject: [PATCH 1/3] gfx1151 nwarps, tile sizing to curb VGPR pressure --- ggml/src/ggml-cuda/mmq.cuh | 47 +++++++++++++++++++++++++++++++------- ggml/src/ggml-cuda/mmvq.cu | 10 ++++++-- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 51e8dad4ce..85b4c4e903 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -102,18 +102,32 @@ struct tile_x_sizes { }; static int get_mmq_x_max_host(const int cc) { - return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 : - GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? + // RDNA3_5 (GFX1151): mmq_x_max=48 for optimal VGPR/performance balance + if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return 48; + } + if (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) { + return 128; + } + if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) { #ifdef GGML_CUDA_FORCE_MMQ - 128 : 64; + return 128; #else - MMQ_DP4A_MAX_BATCH_SIZE : 64; + return MMQ_DP4A_MAX_BATCH_SIZE; #endif // GGML_CUDA_FORCE_MMQ + } + return 64; } static constexpr __device__ int get_mmq_x_max_device() { -#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) + return 128; +#elif defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) + return 48; +#else return 128; +#endif #else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) #if defined(GGML_USE_HIP) @@ -135,8 +149,16 @@ static constexpr __device__ int get_mmq_x_max_device() { } static int get_mmq_y_host(const int cc) { - return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : - ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64); + if (GGML_CUDA_CC_IS_AMD(cc)) { + if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return 64; + } + return 128; + } + if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) { + return 128; + } + return 64; } static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) { @@ -149,7 +171,9 @@ static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIP) -#if defined(RDNA1) +#if defined(RDNA3_5) + return 64; +#elif defined(RDNA1) return 64; #else return 128; @@ -284,6 +308,9 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/) #if defined(GGML_USE_HIP) static int mmq_get_nwarps_host(const int cc, const int warp_size) { + if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return 4; + } return amd_mfma_available(cc) ? 8 : 256/warp_size; } #else @@ -294,7 +321,11 @@ static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) { static constexpr __device__ int mmq_get_nwarps_device() { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) + return 4; +#else return 8; +#endif #else return 256/ggml_cuda_get_physical_warp_size(); #endif // AMD_MFMA_AVAILABLE diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 07b10167bc..3f58cebade 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -64,6 +64,7 @@ enum mmvq_parameter_table_id { MMVQ_PARAMETERS_GCN, MMVQ_PARAMETERS_RDNA2, MMVQ_PARAMETERS_RDNA3_0, + MMVQ_PARAMETERS_RDNA3_5, MMVQ_PARAMETERS_RDNA4 }; @@ -72,7 +73,9 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { return MMVQ_PARAMETERS_RDNA4; #elif defined(RDNA3_0) return MMVQ_PARAMETERS_RDNA3_0; -#elif defined(RDNA2) || defined(RDNA3_5) +#elif defined(RDNA3_5) + return MMVQ_PARAMETERS_RDNA3_5; +#elif defined(RDNA2) return MMVQ_PARAMETERS_RDNA2; #elif defined(GCN) || defined(CDNA) return MMVQ_PARAMETERS_GCN; @@ -88,7 +91,10 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { if (GGML_CUDA_CC_IS_RDNA3_0(cc)) { return MMVQ_PARAMETERS_RDNA3_0; } - if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) { + if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { + return MMVQ_PARAMETERS_RDNA3_5; + } + if (GGML_CUDA_CC_IS_RDNA2(cc)) { return MMVQ_PARAMETERS_RDNA2; } if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) { From 668d4bd660f89e2a21da0c116cdb43755391a63a Mon Sep 17 00:00:00 2001 From: Sunil Pedapudi Date: Mon, 6 Apr 2026 18:47:16 -0700 Subject: [PATCH 2/3] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/mmq.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 85b4c4e903..bec49b29f2 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -127,7 +127,7 @@ static constexpr __device__ int get_mmq_x_max_device() { return 48; #else return 128; -#endif +#endif // defined(RDNA3_5) #else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) #if defined(GGML_USE_HIP) @@ -149,7 +149,7 @@ static constexpr __device__ int get_mmq_x_max_device() { } static int get_mmq_y_host(const int cc) { - if (GGML_CUDA_CC_IS_AMD(cc)) { + if (GGML_CUDA_CC_IS_AMD(cc)) { if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) { return 64; } @@ -171,9 +171,7 @@ static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIP) -#if defined(RDNA3_5) - return 64; -#elif defined(RDNA1) +#if defined(RDNA1) || defined(RDNA3_5) return 64; #else return 128; @@ -325,7 +323,7 @@ static constexpr __device__ int mmq_get_nwarps_device() { return 4; #else return 8; -#endif +#endif // defined(RDNA3_5) #else return 256/ggml_cuda_get_physical_warp_size(); #endif // AMD_MFMA_AVAILABLE From 7957de9dcf3b2c2d6590dccea1e548a28f81ac29 Mon Sep 17 00:00:00 2001 From: Sunil Pedapudi Date: Mon, 6 Apr 2026 18:48:53 -0700 Subject: [PATCH 3/3] revert changes to mmvq.cu --- ggml/src/ggml-cuda/mmvq.cu | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 3f58cebade..07b10167bc 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -64,7 +64,6 @@ enum mmvq_parameter_table_id { MMVQ_PARAMETERS_GCN, MMVQ_PARAMETERS_RDNA2, MMVQ_PARAMETERS_RDNA3_0, - MMVQ_PARAMETERS_RDNA3_5, MMVQ_PARAMETERS_RDNA4 }; @@ -73,9 +72,7 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { return MMVQ_PARAMETERS_RDNA4; #elif defined(RDNA3_0) return MMVQ_PARAMETERS_RDNA3_0; -#elif defined(RDNA3_5) - return MMVQ_PARAMETERS_RDNA3_5; -#elif defined(RDNA2) +#elif defined(RDNA2) || defined(RDNA3_5) return MMVQ_PARAMETERS_RDNA2; #elif defined(GCN) || defined(CDNA) return MMVQ_PARAMETERS_GCN; @@ -91,10 +88,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { if (GGML_CUDA_CC_IS_RDNA3_0(cc)) { return MMVQ_PARAMETERS_RDNA3_0; } - if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { - return MMVQ_PARAMETERS_RDNA3_5; - } - if (GGML_CUDA_CC_IS_RDNA2(cc)) { + if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) { return MMVQ_PARAMETERS_RDNA2; } if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {