domvox · TheTom · May 11, 2026
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
@@ -464,6 +464,19 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
     const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
 
+#ifdef GGML_USE_HIP
+    // HIP/ROCm: the TILE/MMA/WMMA FA paths allocate unbounded f16 temp buffers
+    // for quantized KV types (K_f16, V_f16 in launch_fattn). The pool retains
+    // peak allocation size, so the temp buffer VRAM exceeds KV compression savings.
+    // This causes quantized KV to OOM before f16 on the same context length.
+    // Force VEC path which does inline dequant with zero temp buffer overhead.
+    // Trade-off: prefill is slower (sequential query processing).
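+    // Limitation: applies only when can_use_vector_kernel holds (head_dim <= 256 and a multiple of 64, K->ne[1] a multiple of FATTN_KQ_STRIDE); otherwise falls through to the regular kernel selection below.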
+    if ((ggml_is_quantized(K->type) || ggml_is_quantized(V->type)) && can_use_vector_kernel) {
+        return BEST_FATTN_KERNEL_VEC;
+    }
+#endif // GGML_USE_HIP
+
     // If Turing tensor cores are available, use them:
     if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
         if (can_use_vector_kernel) {