Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 37 additions & 8 deletions ggml/src/ggml-cuda/mmq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -102,18 +102,32 @@ struct tile_x_sizes {
};

static int get_mmq_x_max_host(const int cc) {
return (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) ? 128 :
GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ?
// RDNA3_5 (GFX1151): mmq_x_max=48 for optimal VGPR/performance balance
if (GGML_CUDA_CC_IS_RDNA3_5(cc)) {
return 48;
}
if (amd_mfma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc)) {
return 128;
}
if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) {
#ifdef GGML_CUDA_FORCE_MMQ
128 : 64;
return 128;
#else
MMQ_DP4A_MAX_BATCH_SIZE : 64;
return MMQ_DP4A_MAX_BATCH_SIZE;
#endif // GGML_CUDA_FORCE_MMQ
}
return 64;
}

static constexpr __device__ int get_mmq_x_max_device() {
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
return 128;
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3_5)
return 48;
#else
return 128;
#endif // defined(RDNA3_5)
#else // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)

#if defined(GGML_USE_HIP)
Expand All @@ -135,8 +149,16 @@ static constexpr __device__ int get_mmq_x_max_device() {
}

static int get_mmq_y_host(const int cc) {
return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
if (GGML_CUDA_CC_IS_AMD(cc)) {
if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) {
return 64;
}
return 128;
}
if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) {
return 128;
}
return 64;
}

static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
Expand All @@ -149,7 +171,7 @@ static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type

static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIP)
#if defined(RDNA1)
#if defined(RDNA1) || defined(RDNA3_5)
return 64;
#else
return 128;
Expand Down Expand Up @@ -284,6 +306,9 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)

#if defined(GGML_USE_HIP)
static int mmq_get_nwarps_host(const int cc, const int warp_size) {
if (GGML_CUDA_CC_IS_RDNA3_5(cc)) {
return 4;
}
return amd_mfma_available(cc) ? 8 : 256/warp_size;
}
#else
Expand All @@ -294,7 +319,11 @@ static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {

static constexpr __device__ int mmq_get_nwarps_device() {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA3_5)
return 4;
#else
return 8;
#endif // defined(RDNA3_5)
#else
return 256/ggml_cuda_get_physical_warp_size();
#endif // AMD_MFMA_AVAILABLE
Expand Down
Loading