From a67328cd907f98dc0367b493305328273fe92863 Mon Sep 17 00:00:00 2001
From: Mike Wilkins <mwilkins@cornelisnetworks.com>
Date: Fri, 12 Jun 2026 18:06:01 -0400
Subject: [PATCH] Add ROCm/HIP GPU backend

Add an AMD ROCm/HIP reduction backend to CAIL alongside the existing CUDA
path, so GPU-aware MPI_Allreduce interposition works on AMD GPUs.

- configure: --with-rocm / --with-rocm-arch; m4/ax_check_rocm.m4 probes
  hipcc, headers, and runtime; CUDA and ROCm backends are mutually exclusive
  and selected at configure time.
- src/gpu/rocm: cail_rocm_mem.c (device-pointer detection, hipMalloc/
  hipMemcpy with a post-copy hipStreamSynchronize(0) release fence so the
  staged buffer is visible to a subsequent GPU-aware MPI / NIC / GDRCopy
  read; stream-scoped so unrelated device work is not serialized) and
  cail_rocm_reduce.hip (elementwise SUM/PROD/MIN/MAX kernels for all
  supported datatypes).
- tests: single-source GPU tests via tests/test_gpu_compat.h, compiled for
  CUDA or HIP from one source. The GPU correctness test sweeps a small set
  of element counts {1, 2, 3, base, base+1, base+3} to cover edge and
  non-power-of-two remainder paths in addition to the requested size.
- hip_lt.sh: libtool shim so .hip sources link cleanly under the autotools
  build.

Verified on AMD MI210 (gfx90a), ROCm 6.4.2: correctness 160/160 per count
across np=2/3/4 and all algorithms (recursive_doubling, ring, rabenseifner),
single-node and 2-node, plus OSU osu_allreduce interpose (CPU and ROCm
device-to-device). GPU-aware allreduce shows a significant speedup over
native for messages above the small-message cutoff, with native faster for
small messages.
---
 Makefile.am                             |   2 +-
 README.md                               |  20 +-
 configure.ac                            |  65 ++++-
 hip_lt.sh                               |  57 ++++
 m4/ax_check_cuda.m4                     |  19 +-
 m4/ax_check_rocm.m4                     |  91 +++++++
 src/Makefile.am                         |   4 +
 src/gpu/Makefile.am                     |   4 +
 src/gpu/cail_gpu.h                      |   4 +-
 src/gpu/rocm/Makefile.am                |  21 +-
 src/gpu/rocm/cail_rocm_mem.c            |  43 +++
 src/gpu/rocm/cail_rocm_reduce.hip       | 339 ++++++++++++++++++++++++
 src/gpu/rocm/cail_rocm_reduce_stub.c    |  25 --
 tests/Makefile.am                       |  26 +-
 tests/bench_allreduce_gpu.cu            |  25 +-
 tests/test_allreduce_correctness_gpu.cu | 242 ++++++++++-------
 tests/test_gpu_compat.h                 |  51 ++++
 17 files changed, 876 insertions(+), 162 deletions(-)
 create mode 100755 hip_lt.sh
 create mode 100644 m4/ax_check_rocm.m4
 create mode 100644 src/gpu/rocm/cail_rocm_mem.c
 create mode 100644 src/gpu/rocm/cail_rocm_reduce.hip
 delete mode 100644 src/gpu/rocm/cail_rocm_reduce_stub.c
 create mode 100644 tests/test_gpu_compat.h

diff --git a/Makefile.am b/Makefile.am
index b261bf3..57fe041 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,5 +7,5 @@ ACLOCAL_AMFLAGS = -I m4
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = cail.pc
 
-EXTRA_DIST = autogen.sh cail.pc.in cuda_lt.sh
+EXTRA_DIST = autogen.sh cail.pc.in cuda_lt.sh hip_lt.sh
 DISTCLEANFILES = cail.pc
diff --git a/README.md b/README.md
index 0738747..8efac87 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 CAIL is a drop-in GPU-aware `MPI_Allreduce` optimization library. It uses the
 MPI profiling interface (PMPI) to transparently intercept `MPI_Allreduce` calls
-and route them through optimized algorithms with native CUDA reduction kernels.
+and route them through optimized algorithms with native CUDA/HIP reduction kernels.
 No application changes are required, just `LD_PRELOAD` the library.
 
 ## Quick Start
@@ -10,7 +10,7 @@ No application changes are required, just `LD_PRELOAD` the library.
 ### Prerequisites
 
 - MPI implementation (OpenMPI, MPICH, Intel MPI, etc.)
-- CUDA Toolkit (nvcc, cudart)
+- CUDA Toolkit (nvcc, cudart) or ROCm Toolkit (hipcc, amdhip64)
 - Autotools (autoconf >= 2.69, automake, libtool)
 
 ### Build
@@ -48,6 +48,8 @@ Look for `[cail] initialized:` and `[cail] algorithm=` on stderr to confirm CAIL
 |---------------------------------|------------------------------------------------------|----------|
 | `--with-cuda=PATH`              | Path to CUDA toolkit installation                    | auto     |
 | `--with-cuda-arch=SM`           | NVCC architecture flag (e.g. `sm_70`, `sm_90`)       | `sm_70`  |
+| `--with-rocm=PATH`              | Path to ROCm toolkit installation                    | auto     |
+| `--with-rocm-arch=GFX`          | HIP offload architecture (e.g. `gfx90a`); empty lets hipcc autodetect | empty |
 | `--with-mpi=PATH`               | Path to MPI installation                             | auto     |
 | `--enable-host-path`            | Build without GPU support (host-only, uses `MPI_Reduce_local`) | no |
 | `--enable-debug`                | Debug build with `-g -O0`                            | no       |
@@ -55,7 +57,13 @@ Look for `[cail] initialized:` and `[cail] algorithm=` on stderr to confirm CAIL
 | `--enable-ring`                 | Enable ring algorithm                                | yes      |
 | `--enable-rabenseifner`         | Enable Rabenseifner algorithm                        | yes      |
 
-### Host-Only Build (No CUDA)
+### ROCm Build
+
+```sh
+./configure --with-rocm=/opt/rocm --with-rocm-arch=gfx90a
+```
+
+### Host-Only Build (No CUDA/ROCm)
 
 ```sh
 ./configure --enable-host-path
@@ -68,7 +76,7 @@ with `malloc`/`free`. Useful for development or CPU-only clusters.
 
 CAIL intercepts `MPI_Allreduce` when all of these are true:
 
-- Buffer resides on a CUDA device (or built with `--enable-host-path`)
+- Buffer resides on a CUDA or ROCm device (or built with `--enable-host-path`)
 - Datatype is one of the 20 supported MPI types (see below)
 - Operation is SUM, PROD, MAX, or MIN
 - Communicator is an intracommunicator
@@ -211,9 +219,9 @@ CAIL_ALGO=ring CAIL_DEBUG=1 mpirun -np 4 -x LD_PRELOAD=... ./my_app
 | `test_allreduce_basic`        | CPU     | Float SUM across 7 message sizes, internal count sweep |
 | `test_allreduce_correctness`  | CPU     | All 20 datatypes × 4 ops × {normal, MPI_IN_PLACE}. Requires `-c count`. |
 | `test_allreduce_edge`         | CPU     | Edge cases: count=0, count=1, large counts, single process |
-| `test_allreduce_correctness_gpu` | GPU  | All 20 datatypes × 4 ops × {normal, MPI_IN_PLACE}. Requires `-c count`. CUDA build only. |
+| `test_allreduce_correctness_gpu` | GPU  | All 20 datatypes × 4 ops × {normal, MPI_IN_PLACE}. Requires `-c count`. CUDA or ROCm build. |
 | `bench_allreduce`             | CPU     | Performance benchmark across message sizes    |
-| `bench_allreduce_gpu`         | GPU     | GPU performance benchmark. CUDA build only.   |
+| `bench_allreduce_gpu`         | GPU     | GPU performance benchmark. CUDA or ROCm build. |
 
 ### Test CLI
 
diff --git a/configure.ac b/configure.ac
index c8e0b6d..0042d9f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -99,6 +99,36 @@ AS_IF([test "x$enable_rabenseifner" = "xyes"], [
     [Define to 1 to enable Rabenseifner algorithm])
 ])
 
+dnl ---------------------------------------------------------------------------
+dnl GPU backend selection options
+dnl ---------------------------------------------------------------------------
+AC_ARG_WITH([cuda],
+  [AS_HELP_STRING([--with-cuda=PATH],
+    [Path to CUDA toolkit @<:@default=auto@:>@])],
+  [with_cuda=$withval],
+  [with_cuda=auto])
+
+AC_ARG_WITH([rocm],
+  [AS_HELP_STRING([--with-rocm=PATH],
+    [Path to ROCm toolkit @<:@default=auto@:>@])],
+  [with_rocm=$withval],
+  [with_rocm=auto])
+
+user_with_cuda=$with_cuda
+user_with_rocm=$with_rocm
+
+AS_IF([test "x$with_rocm" != "xauto" && test "x$with_rocm" != "xno" && test "x$with_cuda" = "xauto"], [
+  with_cuda=no
+])
+AS_IF([test "x$with_cuda" != "xauto" && test "x$with_cuda" != "xno" && test "x$with_rocm" = "xauto"], [
+  with_rocm=no
+])
+
+AS_IF([test "x$enable_host_path" = "xyes"], [
+  with_cuda=no
+  with_rocm=no
+])
+
 dnl ---------------------------------------------------------------------------
 dnl --with-cuda-arch : NVCC architecture flag
 dnl ---------------------------------------------------------------------------
@@ -111,10 +141,41 @@ AC_ARG_WITH([cuda-arch],
 AC_SUBST([CUDA_ARCH], [$cuda_arch])
 
 dnl ---------------------------------------------------------------------------
-dnl CUDA (macro provided by m4/ax_check_cuda.m4)
-dnl Must come after --with-cuda-arch so $cuda_arch is set for NVCCFLAGS
+dnl --with-rocm-arch : HIP architecture flag
+dnl ---------------------------------------------------------------------------
+AC_ARG_WITH([rocm-arch],
+  [AS_HELP_STRING([--with-rocm-arch=GFX],
+    [Set ROCm offload architecture for hipcc (e.g. gfx90a); default empty uses hipcc autodetect])],
+  [rocm_arch=$withval],
+  [rocm_arch=])
+
+AC_SUBST([ROCM_ARCH], [$rocm_arch])
+
+dnl ---------------------------------------------------------------------------
+dnl CUDA / ROCm detection
 dnl ---------------------------------------------------------------------------
 AX_CHECK_CUDA
+AX_CHECK_ROCM
+
+dnl ---------------------------------------------------------------------------
+dnl Final backend selection validation
+dnl ---------------------------------------------------------------------------
+AS_IF([test "x$enable_host_path" = "xyes"], [
+  AS_IF([test "x$user_with_cuda" != "xauto" && test "x$user_with_cuda" != "xno"], [
+    AC_MSG_ERROR([host-path excludes GPU backends])
+  ])
+  AS_IF([test "x$user_with_rocm" != "xauto" && test "x$user_with_rocm" != "xno"], [
+    AC_MSG_ERROR([host-path excludes GPU backends])
+  ])
+], [
+  AS_IF([test "x$have_cuda" = "xyes" && test "x$have_rocm" = "xyes"], [
+    AC_MSG_ERROR([choose one GPU backend: CUDA or ROCm])
+  ], [
+    AS_IF([test "x$have_cuda" = "xno" && test "x$have_rocm" = "xno"], [
+      AC_MSG_ERROR([no GPU backend: pass --with-cuda=PATH, --with-rocm=PATH, or --enable-host-path])
+    ])
+  ])
+])
 
 dnl ---------------------------------------------------------------------------
 dnl Output
diff --git a/hip_lt.sh b/hip_lt.sh
new file mode 100755
index 0000000..715f5d2
--- /dev/null
+++ b/hip_lt.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (c) 2026 Cornelis Networks. All rights reserved.
+
+# hip_lt.sh — Wrapper to compile .hip files into libtool .lo objects.
+# Implements the same UCC-pattern rationale as cuda_lt.sh: emit PIC/non-PIC
+# objects plus .lo metadata so libtool keeps HIP objects during convenience-lib linking.
+
+set -e
+
+libtool_file=$1
+lo_filepath=$2
+
+# Derive .o path from .lo path
+o_filepath="${lo_filepath%.lo}.o"
+lo_dir=$(dirname "$o_filepath")
+o_filename=$(basename "$o_filepath")
+
+# Libtool convention: PIC objects go in .libs/, non-PIC in current dir
+local_pic_dir=".libs/"
+local_npic_dir=""
+pic_dir="${lo_dir}/${local_pic_dir}"
+npic_dir="${lo_dir}/${local_npic_dir}"
+
+pic_filepath="${pic_dir}${o_filename}"
+npic_filepath="${npic_dir}${o_filename}"
+local_pic_filepath="${local_pic_dir}${o_filename}"
+local_npic_filepath="${local_npic_dir}${o_filename}"
+
+mkdir -p "$pic_dir"
+
+# Build PIC version (for shared library)
+cmd="${@:3} -fPIC -o ${pic_filepath}"
+echo "$cmd"
+$cmd
+
+# Build non-PIC version (for static library)
+cmd="${@:3} -o ${npic_filepath}"
+echo "$cmd"
+$cmd
+
+# Write the .lo metadata file that libtool expects
+libtool_version="$(${libtool_file} --version | head -1 | sed 's/^/#/g')"
+
+cat > "${lo_filepath}" <<LOEOF
+# ${lo_filepath} - a libtool object file
+# Generated by hip_lt.sh for hipcc/libtool integration
+# ${libtool_version}
+
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# Name of the PIC object.
+pic_object='${local_pic_filepath}'
+
+# Name of the non-PIC object.
+non_pic_object='${local_npic_filepath}'
+LOEOF
diff --git a/m4/ax_check_cuda.m4 b/m4/ax_check_cuda.m4
index 6599a7f..ac1a5bf 100644
--- a/m4/ax_check_cuda.m4
+++ b/m4/ax_check_cuda.m4
@@ -3,11 +3,13 @@ dnl AX_CHECK_CUDA — Detect CUDA toolkit, nvcc, and set build variables
 dnl ---------------------------------------------------------------------------
 AC_DEFUN([AX_CHECK_CUDA], [
   dnl --with-cuda=PATH
-  AC_ARG_WITH([cuda],
-    [AS_HELP_STRING([--with-cuda=PATH],
-      [Path to CUDA toolkit @<:@default=auto@:>@])],
-    [with_cuda=$withval],
-    [with_cuda=auto])
+  AS_IF([test "x$with_cuda" = "x"], [
+    AC_ARG_WITH([cuda],
+      [AS_HELP_STRING([--with-cuda=PATH],
+        [Path to CUDA toolkit @<:@default=auto@:>@])],
+      [with_cuda=$withval],
+      [with_cuda=auto])
+  ])
 
   have_cuda=no
 
@@ -52,13 +54,6 @@ AC_DEFUN([AX_CHECK_CUDA], [
     ])
   ])
 
-  dnl If CUDA required (not host-path and not --without-cuda), fail
-  AS_IF([test "x$have_cuda" = "xno" && test "x$with_cuda" != "xno" && test "x$enable_host_path" != "xyes"], [
-    AS_IF([test "x$with_cuda" != "xauto"], [
-      AC_MSG_ERROR([CUDA requested but not found. Use --without-cuda or --enable-host-path for CPU-only build.])
-    ])
-  ])
-
   AC_SUBST([CUDA_HOME])
   AC_SUBST([NVCC])
   AC_SUBST([CUDA_CFLAGS])
diff --git a/m4/ax_check_rocm.m4 b/m4/ax_check_rocm.m4
new file mode 100644
index 0000000..d00a4a3
--- /dev/null
+++ b/m4/ax_check_rocm.m4
@@ -0,0 +1,91 @@
+dnl ---------------------------------------------------------------------------
+dnl AX_CHECK_ROCM — Detect ROCm toolkit, hipcc, and set build variables
+dnl ---------------------------------------------------------------------------
+AC_DEFUN([AX_CHECK_ROCM], [
+  dnl --with-rocm=PATH
+  AS_IF([test "x$with_rocm" = "x"], [
+    AC_ARG_WITH([rocm],
+      [AS_HELP_STRING([--with-rocm=PATH],
+        [Path to ROCm toolkit @<:@default=auto@:>@])],
+      [with_rocm=$withval],
+      [with_rocm=auto])
+  ])
+
+  have_rocm=no
+  ROCM_HOME=
+  HIPCC=
+  ROCM_CFLAGS=
+  ROCM_LIBS=
+  HIPCCFLAGS=
+
+  dnl Skip ROCm if --without-rocm
+  AS_IF([test "x$with_rocm" != "xno"], [
+    dnl Find hipcc
+    AS_IF([test "x$with_rocm" != "xauto"], [
+      HIPCC="$with_rocm/bin/hipcc"
+      ROCM_HOME="$with_rocm"
+    ], [
+      AC_PATH_PROG([HIPCC], [hipcc], [])
+      AS_IF([test "x$HIPCC" != "x"], [
+        dnl Derive ROCM_HOME from hipcc location
+        rocm_bin_dir=`AS_DIRNAME([$HIPCC])`
+        ROCM_HOME=`AS_DIRNAME([$rocm_bin_dir])`
+      ], [
+        AS_IF([test -x "/opt/rocm/bin/hipcc"], [
+          HIPCC="/opt/rocm/bin/hipcc"
+          ROCM_HOME="/opt/rocm"
+        ])
+      ])
+    ])
+
+    dnl Check hipcc exists
+    AS_IF([test "x$HIPCC" != "x" && test -x "$HIPCC"], [
+      rocm_inc="$ROCM_HOME/include"
+      dnl Prefer lib64 if that is where libamdhip64.so lives (some distros)
+      AS_IF([test -f "$ROCM_HOME/lib64/libamdhip64.so"],
+            [rocm_lib="$ROCM_HOME/lib64"],
+            [rocm_lib="$ROCM_HOME/lib"])
+
+      dnl Validate HIP runtime header and ROCm runtime library
+      AC_CHECK_FILE([$rocm_inc/hip/hip_runtime.h], [
+        AC_CHECK_FILE([$rocm_lib/libamdhip64.so], [
+          have_rocm=yes
+          ROCM_CFLAGS="-I$rocm_inc -D__HIP_PLATFORM_AMD__"
+          ROCM_LIBS="-L$rocm_lib -lamdhip64"
+          HIPCCFLAGS="-fPIC"
+          AS_IF([test "x$rocm_arch" != "x"], [
+            HIPCCFLAGS="$HIPCCFLAGS --offload-arch=$rocm_arch"
+          ])
+          AC_DEFINE([HAVE_ROCM], [1], [Define to 1 if ROCm is available])
+          AC_MSG_NOTICE([ROCm found: $ROCM_HOME])
+        ], [
+          AS_IF([test "x$with_rocm" != "xauto"], [
+            AC_MSG_ERROR([libamdhip64.so not found in $rocm_lib])
+          ], [
+            AC_MSG_NOTICE([libamdhip64.so not found in $rocm_lib — ROCm disabled])
+          ])
+        ])
+      ], [
+        AS_IF([test "x$with_rocm" != "xauto"], [
+          AC_MSG_ERROR([hip_runtime.h not found in $rocm_inc/hip])
+        ], [
+          AC_MSG_NOTICE([hip_runtime.h not found in $rocm_inc/hip — ROCm disabled])
+        ])
+      ])
+    ], [
+      AS_IF([test "x$with_rocm" != "xauto"], [
+        AC_MSG_ERROR([hipcc not found at $HIPCC])
+      ], [
+        AC_MSG_NOTICE([hipcc not found — ROCm disabled])
+      ])
+    ])
+  ])
+
+  AC_SUBST([ROCM_HOME])
+  AC_SUBST([HIPCC])
+  AC_SUBST([ROCM_CFLAGS])
+  AC_SUBST([ROCM_LIBS])
+  AC_SUBST([HIPCCFLAGS])
+
+  AM_CONDITIONAL([HAVE_ROCM], [test "x$have_rocm" = "xyes"])
+])
diff --git a/src/Makefile.am b/src/Makefile.am
index a0a2756..12673a2 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -15,6 +15,10 @@ if HAVE_CUDA
 libcail_la_LDFLAGS += $(CUDA_LIBS) -lstdc++
 endif
 
+if HAVE_ROCM
+libcail_la_LDFLAGS += $(ROCM_LIBS) -lstdc++
+endif
+
 include_HEADERS = core/cail.h
 
 AM_CFLAGS = $(MPI_CFLAGS)
diff --git a/src/gpu/Makefile.am b/src/gpu/Makefile.am
index cb474fe..0f41df7 100644
--- a/src/gpu/Makefile.am
+++ b/src/gpu/Makefile.am
@@ -13,6 +13,10 @@ libcail_gpu_la_LIBADD += host/libcail_host.la
 else
 if HAVE_CUDA
 libcail_gpu_la_LIBADD += cuda/libcail_cuda.la
+else
+if HAVE_ROCM
+libcail_gpu_la_LIBADD += rocm/libcail_rocm.la
+endif
 endif
 endif
 
diff --git a/src/gpu/cail_gpu.h b/src/gpu/cail_gpu.h
index 9cda23c..d3b7633 100644
--- a/src/gpu/cail_gpu.h
+++ b/src/gpu/cail_gpu.h
@@ -3,12 +3,12 @@
 /* cail_gpu.h — Backend-agnostic GPU interface for cail
  * Implemented by: CUDA (cail_cuda_reduce.cu + cail_cuda_mem.c)
  *                 Host-path (cail_host_reduce.c)
- *                 ROCm stub (cail_rocm_reduce_stub.c)
+ *                 ROCm (cail_rocm_reduce.hip + cail_rocm_mem.c)
  */
 #ifndef CAIL_GPU_H
 #define CAIL_GPU_H
 
-#include "cail_types.h"
+#include "../core/cail_types.h"
 #include <stddef.h>
 #include <mpi.h>
 
diff --git a/src/gpu/rocm/Makefile.am b/src/gpu/rocm/Makefile.am
index d03b97a..d6b1783 100644
--- a/src/gpu/rocm/Makefile.am
+++ b/src/gpu/rocm/Makefile.am
@@ -1,7 +1,26 @@
 # Copyright (c) 2026 Cornelis Networks. All rights reserved.
 
+if HAVE_ROCM
+
 noinst_LTLIBRARIES = libcail_rocm.la
 
-libcail_rocm_la_SOURCES = cail_rocm_stub.c
+libcail_rocm_la_SOURCES = cail_rocm_mem.c cail_rocm_reduce.hip
+
+SUFFIXES = .hip
+
+.hip.lo:
+	/bin/bash $(top_srcdir)/hip_lt.sh "$(LIBTOOL)" $@ \
+	    $(HIPCC) $(HIPCCFLAGS) $(ROCM_CFLAGS) $(MPI_CFLAGS) \
+	    -I$(top_srcdir)/src/core -I$(top_srcdir)/src/gpu -c $<
+
+AM_CFLAGS = $(ROCM_CFLAGS) $(MPI_CFLAGS) -I$(top_srcdir)/src/core -I$(top_srcdir)/src/gpu
 
+CLEANFILES = cail_rocm_reduce.lo
+
+else
+
+noinst_LTLIBRARIES = libcail_rocm.la
+libcail_rocm_la_SOURCES = cail_rocm_stub.c
 AM_CFLAGS = -I$(top_srcdir)/src/core -I$(top_srcdir)/src/gpu
+
+endif
diff --git a/src/gpu/rocm/cail_rocm_mem.c b/src/gpu/rocm/cail_rocm_mem.c
new file mode 100644
index 0000000..0341891
--- /dev/null
+++ b/src/gpu/rocm/cail_rocm_mem.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */
+
+#include "../cail_gpu.h"
+#include <hip/hip_runtime_api.h>
+#include <stddef.h>
+
+int  cail_rocm_init(void);
+void cail_rocm_finalize(void);
+int  cail_rocm_synchronize(void);
+
+int cail_gpu_is_device_pointer(const void *ptr) {
+    hipPointerAttribute_t attr;
+    hipError_t err = hipPointerGetAttributes(&attr, ptr);
+    if (err != hipSuccess) {
+        hipGetLastError();
+        return 0;
+    }
+    /* ROCm 6.x may report unregistered host pointers as hipSuccess with
+       hipMemoryTypeUnregistered; only device/managed types are GPU-resident. */
+    return (attr.type == hipMemoryTypeDevice || attr.type == hipMemoryTypeManaged);
+}
+
+int cail_gpu_malloc(void **ptr, size_t size) {
+    return (hipMalloc(ptr, size) == hipSuccess) ? 0 : -1;
+}
+
+int cail_gpu_free(void *ptr) {
+    return (hipFree(ptr) == hipSuccess) ? 0 : -1;
+}
+
+int cail_gpu_memcpy(void *dst, const void *src, size_t size) {
+    if (hipMemcpy(dst, src, size, hipMemcpyDefault) != hipSuccess)
+        return -1;
+    /* Send-side release fence: the staged buffer may next be read by an
+       external GPU-aware MPI / NIC / GDRCopy engine, not by HIP stream work.
+       Synchronize the copy's (default) stream before exposing dst to PMPI.
+       Stream-scoped (not device-wide) so unrelated device work is not serialized. */
+    return (hipStreamSynchronize(0) == hipSuccess) ? 0 : -1;
+}
+
+int cail_gpu_synchronize(void) { return cail_rocm_synchronize(); }
+int cail_gpu_init(void)        { return cail_rocm_init(); }
+void cail_gpu_finalize(void)   { cail_rocm_finalize(); }
diff --git a/src/gpu/rocm/cail_rocm_reduce.hip b/src/gpu/rocm/cail_rocm_reduce.hip
new file mode 100644
index 0000000..518ee27
--- /dev/null
+++ b/src/gpu/rocm/cail_rocm_reduce.hip
@@ -0,0 +1,339 @@
+/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */
+
+#include "../cail_gpu.h"
+#include "../../core/cail_types.h"
+#include <hip/hip_runtime.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* Pinned host byte for PCIe read fence in cail_gpu_flush_recv_buf().
+   hipMemcpy D→H with a single byte forces a PCIe read transaction that
+   flushes all prior posted writes (GDRCopy BAR stores, NIC RDMA) to GPU memory. */
+static char  *flush_host_byte = NULL;
+
+#define BLOCK_SIZE 256
+#define ELEMENTS_PER_THREAD 4
+#define MAX_GRID 65535
+
+/* gfx90a wave64 does not affect this grid-stride implementation, so no tuning changes are required. */
+typedef void (*cail_rocm_kernel_fn)(const void*, void*, size_t, hipStream_t);
+
+static hipStream_t cail_stream = 0;
+static cail_rocm_kernel_fn kernel_table[CAIL_NUM_OPS][CAIL_NUM_DTYPES];
+
+template<typename T>
+struct OpSum {
+    __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
+};
+
+template<typename T>
+struct OpProd {
+    __device__ __forceinline__ T operator()(T a, T b) const { return a * b; }
+};
+
+template<typename T>
+struct OpMax {
+    __device__ __forceinline__ T operator()(T a, T b) const { return (a > b) ? a : b; }
+};
+
+template<typename T>
+struct OpMin {
+    __device__ __forceinline__ T operator()(T a, T b) const { return (a < b) ? a : b; }
+};
+
+template<typename T, typename Op>
+__global__ void cail_reduce_kernel(const T *in, T *inout, size_t count) {
+    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = (size_t)blockDim.x * gridDim.x;
+    Op op;
+    for (size_t i = idx; i < count; i += stride) {
+        inout[i] = op(inout[i], in[i]);
+    }
+}
+
+__global__ void cail_reduce_float4_sum(const float *in, float *inout, size_t count) {
+    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = (size_t)blockDim.x * gridDim.x;
+    size_t vec_count = count / 4;
+    const float4 *in4 = (const float4*)in;
+    float4 *inout4 = (float4*)inout;
+    for (size_t i = idx; i < vec_count; i += stride) {
+        float4 a = inout4[i];
+        float4 b = in4[i];
+        inout4[i] = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    size_t rem_start = vec_count * 4;
+    for (size_t i = rem_start + idx; i < count; i += stride) {
+        inout[i] += in[i];
+    }
+}
+
+__global__ void cail_reduce_double2_sum(const double *in, double *inout, size_t count) {
+    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    size_t stride = (size_t)blockDim.x * gridDim.x;
+    size_t vec_count = count / 2;
+    const double2 *in2 = (const double2*)in;
+    double2 *inout2 = (double2*)inout;
+    for (size_t i = idx; i < vec_count; i += stride) {
+        double2 a = inout2[i];
+        double2 b = in2[i];
+        inout2[i] = make_double2(a.x + b.x, a.y + b.y);
+    }
+    size_t rem_start = vec_count * 2;
+    for (size_t i = rem_start + idx; i < count; i += stride) {
+        inout[i] += in[i];
+    }
+}
+
+static inline bool is_aligned(const void *ptr, size_t alignment) {
+    return ((uintptr_t)ptr % alignment) == 0;
+}
+
+static inline int grid_for_scalar(size_t count) {
+    if (count == 0) return 0;
+    size_t blocks = (count + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD);
+    if (blocks > MAX_GRID) blocks = MAX_GRID;
+    return (int)blocks;
+}
+
+static inline int grid_for_vec(size_t vec_count) {
+    if (vec_count == 0) return 0;
+    size_t blocks = (vec_count + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (blocks > MAX_GRID) blocks = MAX_GRID;
+    return (int)blocks;
+}
+
+template<typename T, typename Op>
+static void launch_scalar(const void *in, void *inout, size_t count, hipStream_t s) {
+    int grid = grid_for_scalar(count);
+    if (grid > 0) {
+        cail_reduce_kernel<T, Op><<<grid, BLOCK_SIZE, 0, s>>>((const T*)in, (T*)inout, count);
+    }
+}
+
+static void launch_sum_float(const void *in, void *inout, size_t count, hipStream_t s) {
+    if (count == 0) return;
+    if ((count % 4) == 0 && is_aligned(in, sizeof(float4)) && is_aligned(inout, sizeof(float4))) {
+        int grid = grid_for_vec(count / 4);
+        if (grid > 0) {
+            cail_reduce_float4_sum<<<grid, BLOCK_SIZE, 0, s>>>((const float*)in, (float*)inout, count);
+            return;
+        }
+    }
+    launch_scalar<float, OpSum<float>>(in, inout, count, s);
+}
+
+static void launch_sum_double(const void *in, void *inout, size_t count, hipStream_t s) {
+    if (count == 0) return;
+    if ((count % 2) == 0 && is_aligned(in, sizeof(double2)) && is_aligned(inout, sizeof(double2))) {
+        int grid = grid_for_vec(count / 2);
+        if (grid > 0) {
+            cail_reduce_double2_sum<<<grid, BLOCK_SIZE, 0, s>>>((const double*)in, (double*)inout, count);
+            return;
+        }
+    }
+    launch_scalar<double, OpSum<double>>(in, inout, count, s);
+}
+
+#define DECL_LAUNCH_SUM(type, name) \
+static void launch_sum_##name(const void *in, void *inout, size_t count, hipStream_t s) { \
+    launch_scalar<type, OpSum<type>>(in, inout, count, s); \
+}
+
+#define DECL_LAUNCH_PROD(type, name) \
+static void launch_prod_##name(const void *in, void *inout, size_t count, hipStream_t s) { \
+    launch_scalar<type, OpProd<type>>(in, inout, count, s); \
+}
+
+#define DECL_LAUNCH_MAX(type, name) \
+static void launch_max_##name(const void *in, void *inout, size_t count, hipStream_t s) { \
+    launch_scalar<type, OpMax<type>>(in, inout, count, s); \
+}
+
+#define DECL_LAUNCH_MIN(type, name) \
+static void launch_min_##name(const void *in, void *inout, size_t count, hipStream_t s) { \
+    launch_scalar<type, OpMin<type>>(in, inout, count, s); \
+}
+
+DECL_LAUNCH_SUM(char, char)
+DECL_LAUNCH_SUM(int, int)
+DECL_LAUNCH_SUM(long, long)
+DECL_LAUNCH_SUM(long long, longlong)
+DECL_LAUNCH_SUM(unsigned char, uchar)
+DECL_LAUNCH_SUM(unsigned int, uint)
+DECL_LAUNCH_SUM(unsigned long, ulong)
+DECL_LAUNCH_SUM(unsigned long long, ulonglong)
+DECL_LAUNCH_SUM(short, short)
+DECL_LAUNCH_SUM(unsigned short, ushort)
+
+DECL_LAUNCH_PROD(char, char)
+DECL_LAUNCH_PROD(short, short)
+DECL_LAUNCH_PROD(int, int)
+DECL_LAUNCH_PROD(long, long)
+DECL_LAUNCH_PROD(float, float)
+DECL_LAUNCH_PROD(double, double)
+DECL_LAUNCH_PROD(long long, longlong)
+DECL_LAUNCH_PROD(unsigned char, uchar)
+DECL_LAUNCH_PROD(unsigned short, ushort)
+DECL_LAUNCH_PROD(unsigned int, uint)
+DECL_LAUNCH_PROD(unsigned long, ulong)
+DECL_LAUNCH_PROD(unsigned long long, ulonglong)
+
+DECL_LAUNCH_MAX(char, char)
+DECL_LAUNCH_MAX(short, short)
+DECL_LAUNCH_MAX(int, int)
+DECL_LAUNCH_MAX(long, long)
+DECL_LAUNCH_MAX(float, float)
+DECL_LAUNCH_MAX(double, double)
+DECL_LAUNCH_MAX(long long, longlong)
+DECL_LAUNCH_MAX(unsigned char, uchar)
+DECL_LAUNCH_MAX(unsigned short, ushort)
+DECL_LAUNCH_MAX(unsigned int, uint)
+DECL_LAUNCH_MAX(unsigned long, ulong)
+DECL_LAUNCH_MAX(unsigned long long, ulonglong)
+
+DECL_LAUNCH_MIN(char, char)
+DECL_LAUNCH_MIN(short, short)
+DECL_LAUNCH_MIN(int, int)
+DECL_LAUNCH_MIN(long, long)
+DECL_LAUNCH_MIN(float, float)
+DECL_LAUNCH_MIN(double, double)
+DECL_LAUNCH_MIN(long long, longlong)
+DECL_LAUNCH_MIN(unsigned char, uchar)
+DECL_LAUNCH_MIN(unsigned short, ushort)
+DECL_LAUNCH_MIN(unsigned int, uint)
+DECL_LAUNCH_MIN(unsigned long, ulong)
+DECL_LAUNCH_MIN(unsigned long long, ulonglong)
+
+static void init_kernel_table(void) {
+    for (int op = 0; op < CAIL_NUM_OPS; ++op) {
+        for (int dt = 0; dt < CAIL_NUM_DTYPES; ++dt) {
+            kernel_table[op][dt] = 0;
+        }
+    }
+
+    kernel_table[CAIL_SUM][CAIL_CHAR]      = launch_sum_char;
+    kernel_table[CAIL_SUM][CAIL_INT]       = launch_sum_int;
+    kernel_table[CAIL_SUM][CAIL_LONG]      = launch_sum_long;
+    kernel_table[CAIL_SUM][CAIL_FLOAT]     = launch_sum_float;
+    kernel_table[CAIL_SUM][CAIL_DOUBLE]    = launch_sum_double;
+    kernel_table[CAIL_SUM][CAIL_LONG_LONG] = launch_sum_longlong;
+    kernel_table[CAIL_SUM][CAIL_UCHAR]     = launch_sum_uchar;
+    kernel_table[CAIL_SUM][CAIL_UINT]      = launch_sum_uint;
+    kernel_table[CAIL_SUM][CAIL_ULONG]     = launch_sum_ulong;
+    kernel_table[CAIL_SUM][CAIL_ULONGLONG] = launch_sum_ulonglong;
+    kernel_table[CAIL_SUM][CAIL_SHORT]     = launch_sum_short;
+    kernel_table[CAIL_SUM][CAIL_USHORT]    = launch_sum_ushort;
+
+    kernel_table[CAIL_PROD][CAIL_CHAR]      = launch_prod_char;
+    kernel_table[CAIL_PROD][CAIL_SHORT]     = launch_prod_short;
+    kernel_table[CAIL_PROD][CAIL_INT]       = launch_prod_int;
+    kernel_table[CAIL_PROD][CAIL_LONG]      = launch_prod_long;
+    kernel_table[CAIL_PROD][CAIL_FLOAT]     = launch_prod_float;
+    kernel_table[CAIL_PROD][CAIL_DOUBLE]    = launch_prod_double;
+    kernel_table[CAIL_PROD][CAIL_LONG_LONG] = launch_prod_longlong;
+    kernel_table[CAIL_PROD][CAIL_UCHAR]     = launch_prod_uchar;
+    kernel_table[CAIL_PROD][CAIL_USHORT]    = launch_prod_ushort;
+    kernel_table[CAIL_PROD][CAIL_UINT]      = launch_prod_uint;
+    kernel_table[CAIL_PROD][CAIL_ULONG]     = launch_prod_ulong;
+    kernel_table[CAIL_PROD][CAIL_ULONGLONG] = launch_prod_ulonglong;
+
+    kernel_table[CAIL_MAX][CAIL_CHAR]      = launch_max_char;
+    kernel_table[CAIL_MAX][CAIL_SHORT]     = launch_max_short;
+    kernel_table[CAIL_MAX][CAIL_INT]       = launch_max_int;
+    kernel_table[CAIL_MAX][CAIL_LONG]      = launch_max_long;
+    kernel_table[CAIL_MAX][CAIL_FLOAT]     = launch_max_float;
+    kernel_table[CAIL_MAX][CAIL_DOUBLE]    = launch_max_double;
+    kernel_table[CAIL_MAX][CAIL_LONG_LONG] = launch_max_longlong;
+    kernel_table[CAIL_MAX][CAIL_UCHAR]     = launch_max_uchar;
+    kernel_table[CAIL_MAX][CAIL_USHORT]    = launch_max_ushort;
+    kernel_table[CAIL_MAX][CAIL_UINT]      = launch_max_uint;
+    kernel_table[CAIL_MAX][CAIL_ULONG]     = launch_max_ulong;
+    kernel_table[CAIL_MAX][CAIL_ULONGLONG] = launch_max_ulonglong;
+
+    kernel_table[CAIL_MIN][CAIL_CHAR]      = launch_min_char;
+    kernel_table[CAIL_MIN][CAIL_SHORT]     = launch_min_short;
+    kernel_table[CAIL_MIN][CAIL_INT]       = launch_min_int;
+    kernel_table[CAIL_MIN][CAIL_LONG]      = launch_min_long;
+    kernel_table[CAIL_MIN][CAIL_FLOAT]     = launch_min_float;
+    kernel_table[CAIL_MIN][CAIL_DOUBLE]    = launch_min_double;
+    kernel_table[CAIL_MIN][CAIL_LONG_LONG] = launch_min_longlong;
+    kernel_table[CAIL_MIN][CAIL_UCHAR]     = launch_min_uchar;
+    kernel_table[CAIL_MIN][CAIL_USHORT]    = launch_min_ushort;
+    kernel_table[CAIL_MIN][CAIL_UINT]      = launch_min_uint;
+    kernel_table[CAIL_MIN][CAIL_ULONG]     = launch_min_ulong;
+    kernel_table[CAIL_MIN][CAIL_ULONGLONG] = launch_min_ulonglong;
+}
+
+extern "C" int cail_rocm_init(void) {
+    if (cail_stream != 0) {
+        return 0;
+    }
+    hipError_t err = hipStreamCreate(&cail_stream);
+    if (err != hipSuccess) {
+        cail_stream = 0;
+        return -1;
+    }
+    init_kernel_table();
+
+    /* Allocate a pinned host byte for the PCIe read fence used by
+       cail_gpu_flush_recv_buf(). */
+    if (hipHostMalloc((void**)&flush_host_byte, 1, hipHostMallocDefault) != hipSuccess)
+        flush_host_byte = NULL;
+
+    return 0;
+}
+
+extern "C" void cail_rocm_finalize(void) {
+    if (flush_host_byte) { hipHostFree(flush_host_byte); flush_host_byte = NULL; }
+    if (cail_stream != 0) {
+        hipStreamDestroy(cail_stream);
+        cail_stream = 0;
+    }
+}
+
+extern "C" int cail_rocm_synchronize(void) {
+    if (cail_stream == 0) return -1;
+    return (hipStreamSynchronize(cail_stream) == hipSuccess) ? 0 : -1;
+}
+
+/* PCIe read fence: reads the last byte of the receive buffer into pinned
+   host memory, forcing a D→H transaction that drains any prior posted
+   writes (NIC RDMA, GDRCopy BAR stores) to the same GPU memory region. */
+extern "C" int cail_gpu_flush_recv_buf(const void *recv_buf, size_t recv_bytes) {
+    if (!flush_host_byte || !recv_buf || recv_bytes == 0) return 0;
+    /* A 1-byte D2H read from the receive tail forces PCIe posted writes to drain
+       before this call returns, making subsequent kernel reads observe fresh data. */
+    hipMemcpy(flush_host_byte, (const char*)recv_buf + recv_bytes - 1,
+              1, hipMemcpyDeviceToHost);
+    return 0;
+}
+
+extern "C" int cail_gpu_reduce_local(const void *in, void *inout, size_t count,
+                                        cail_datatype_t dtype, cail_op_t op) {
+    if (op < 0 || op >= CAIL_NUM_OPS || dtype < 0 || dtype >= CAIL_NUM_DTYPES) {
+        fprintf(stderr, "[cail WARN] gpu_reduce_local: dtype=%d op=%d out of range\n",
+                (int)dtype, (int)op);
+        return -1;
+    }
+    cail_rocm_kernel_fn fn = kernel_table[op][dtype];
+    if (fn == 0) {
+        fprintf(stderr, "[cail WARN] gpu_reduce_local: no kernel for dtype=%s op=%s\n",
+                cail_dtype_name(dtype), cail_op_name(op));
+        return -1;
+    }
+    if (cail_stream == 0) {
+        fprintf(stderr, "[cail WARN] gpu_reduce_local: HIP stream not initialized\n");
+        return -1;
+    }
+    fn(in, inout, count, cail_stream);
+    hipError_t err = hipStreamSynchronize(cail_stream);
+    if (err != hipSuccess) {
+        fprintf(stderr, "[cail WARN] gpu_reduce_local: hipStreamSynchronize failed (%s) "
+                "for dtype=%s op=%s count=%zu\n",
+                hipGetErrorString(err), cail_dtype_name(dtype), cail_op_name(op), count);
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/gpu/rocm/cail_rocm_reduce_stub.c b/src/gpu/rocm/cail_rocm_reduce_stub.c
deleted file mode 100644
index fb04b49..0000000
--- a/src/gpu/rocm/cail_rocm_reduce_stub.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */
-
-/* cail_rocm_reduce_stub.c — ROCm/HIP stub implementation.
- * Provides the full cail_gpu.h interface as no-ops / error returns.
- * This file is compiled but NOT linked into libcail today (the rocm/
- * subdirectory builds libcail_rocm.la, but gpu/Makefile.am does not
- * add it to libcail_gpu_la_LIBADD).  It exists as a scaffold for a
- * future ROCm backend: replace these stubs with real HIP calls and
- * wire libcail_rocm.la into the build under a HAVE_ROCM conditional.
- */
-#include "../cail_gpu.h"
-
-int cail_gpu_is_device_pointer(const void *ptr) { (void)ptr; return 0; }
-int cail_gpu_reduce_local(const void *in, void *inout, size_t count,
-                            cail_datatype_t dtype, cail_op_t op)
-{ (void)in; (void)inout; (void)count; (void)dtype; (void)op; return -1; }
-int cail_gpu_malloc(void **ptr, size_t size) { (void)ptr; (void)size; return -1; }
-int cail_gpu_free(void *ptr) { (void)ptr; return -1; }
-int cail_gpu_memcpy(void *dst, const void *src, size_t size)
-{ (void)dst; (void)src; (void)size; return -1; }
-int cail_gpu_synchronize(void) { return -1; }
-int cail_gpu_init(void) { return -1; }
-void cail_gpu_finalize(void) {}
-int cail_gpu_flush_recv_buf(const void *recv_buf, size_t recv_bytes)
-{ (void)recv_buf; (void)recv_bytes; return 0; }
diff --git a/tests/Makefile.am b/tests/Makefile.am
index be54b76..7de437e 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -42,4 +42,28 @@ CLEANFILES = test_allreduce_correctness_gpu.o bench_allreduce_gpu.o
 
 endif
 
-EXTRA_DIST = test_runner.sh test_allreduce_correctness_gpu.cu bench_allreduce_gpu.cu
+if HAVE_ROCM
+
+SUFFIXES = .cu
+
+noinst_PROGRAMS += test_allreduce_correctness_gpu bench_allreduce_gpu
+
+test_allreduce_correctness_gpu_SOURCES =
+test_allreduce_correctness_gpu_LDADD = test_allreduce_correctness_gpu.o $(top_builddir)/src/libcail.la $(MPI_LIBS) $(ROCM_LIBS) -lstdc++ -lm
+test_allreduce_correctness_gpu_DEPENDENCIES = test_allreduce_correctness_gpu.o
+
+bench_allreduce_gpu_SOURCES =
+bench_allreduce_gpu_LDADD = bench_allreduce_gpu.o $(top_builddir)/src/libcail.la $(MPI_LIBS) $(ROCM_LIBS) -lstdc++ -lm
+bench_allreduce_gpu_DEPENDENCIES = bench_allreduce_gpu.o
+
+test_allreduce_correctness_gpu.o: $(srcdir)/test_allreduce_correctness_gpu.cu
+	$(HIPCC) $(HIPCCFLAGS) $(ROCM_CFLAGS) $(MPI_CFLAGS) -x hip -I$(top_srcdir)/src/core -c -o $@ $<
+
+bench_allreduce_gpu.o: $(srcdir)/bench_allreduce_gpu.cu
+	$(HIPCC) $(HIPCCFLAGS) $(ROCM_CFLAGS) $(MPI_CFLAGS) -x hip -I$(top_srcdir)/src/core -c -o $@ $<
+
+CLEANFILES = test_allreduce_correctness_gpu.o bench_allreduce_gpu.o
+
+endif
+
+EXTRA_DIST = test_runner.sh test_allreduce_correctness_gpu.cu bench_allreduce_gpu.cu test_gpu_compat.h
diff --git a/tests/bench_allreduce_gpu.cu b/tests/bench_allreduce_gpu.cu
index 2440bb9..daf44d4 100644
--- a/tests/bench_allreduce_gpu.cu
+++ b/tests/bench_allreduce_gpu.cu
@@ -3,11 +3,11 @@
 /*
  * bench_allreduce_gpu.cu — GPU-buffer benchmark for cail MPI_Allreduce
  *
- * Same as bench_allreduce.c but uses cudaMalloc'd device buffers so
+ * Same as bench_allreduce.c but uses GPU device buffers so
  * cail's GPU path is exercised instead of falling back to PMPI.
  */
 #include <mpi.h>
-#include <cuda_runtime.h>
+#include "test_gpu_compat.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -15,15 +15,6 @@
 #define DEFAULT_WARMUP     10
 #define DEFAULT_ITERATIONS 100
 
-#define CUDA_CHECK(call) do {                                          \
-    cudaError_t _e = (call);                                           \
-    if (_e != cudaSuccess) {                                           \
-        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
-                cudaGetErrorString(_e));                                \
-        MPI_Abort(MPI_COMM_WORLD, 1);                                  \
-    }                                                                  \
-} while (0)
-
 int main(int argc, char **argv)
 {
     MPI_Init(&argc, &argv);
@@ -32,13 +23,13 @@ int main(int argc, char **argv)
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 
     int dev_count = 0;
-    cudaGetDeviceCount(&dev_count);
+    gpuGetDeviceCount(&dev_count);
     if (dev_count == 0) {
-        if (rank == 0) fprintf(stderr, "No CUDA devices found\n");
+        if (rank == 0) fprintf(stderr, "No GPU devices found\n");
         MPI_Finalize();
         return 1;
     }
-    CUDA_CHECK(cudaSetDevice(rank % dev_count));
+    GPU_CHECK(gpuSetDevice(rank % dev_count));
 
     size_t min_bytes  = 4;
     size_t max_bytes  = 16 * 1024 * 1024;
@@ -57,8 +48,8 @@ int main(int argc, char **argv)
     }
 
     float *d_buf;
-    CUDA_CHECK(cudaMalloc(&d_buf, max_bytes));
-    CUDA_CHECK(cudaMemset(d_buf, 0, max_bytes));
+    GPU_CHECK(gpuMalloc(&d_buf, max_bytes));
+    GPU_CHECK(gpuMemset(d_buf, 0, max_bytes));
 
     for (size_t size = min_bytes; size <= max_bytes; size *= 4) {
         int count = (int)(size / sizeof(float));
@@ -98,7 +89,7 @@ int main(int argc, char **argv)
         }
     }
 
-    cudaFree(d_buf);
+    gpuFree(d_buf);
     MPI_Finalize();
     return 0;
 }
diff --git a/tests/test_allreduce_correctness_gpu.cu b/tests/test_allreduce_correctness_gpu.cu
index b8f979c..727324a 100644
--- a/tests/test_allreduce_correctness_gpu.cu
+++ b/tests/test_allreduce_correctness_gpu.cu
@@ -3,31 +3,26 @@
 /*
  * test_allreduce_correctness_gpu.cu — GPU-buffer correctness tests for cail
  *
- * Mirrors test_allreduce_correctness.c but with CUDA device buffers.
- * For a given element count (-c, required) and algorithm (-a, optional),
+ * Mirrors test_allreduce_correctness.c but with CUDA or ROCm device buffers.
+ * For a base element count (-c, required) and algorithm (-a, optional),
  * exercises every supported {datatype x op} combination with both
- * separate send/recv buffers and MPI_IN_PLACE.
+ * separate send/recv buffers and MPI_IN_PLACE across a small count sweep
+ * that covers edge and remainder paths.
  *
- * 20 datatypes x 4 ops x 2 modes = 160 sub-tests per invocation
- * (minus PROD skips for nprocs > 4 to avoid overflow).
+ * 20 datatypes x 4 ops x 2 modes = 160 sub-tests per count
+ * (minus PROD skips for nprocs > 4 to avoid overflow), swept over up to
+ * 6 derived counts: {1, 2, 3, base, base+1, base+3} (deduplicated).
  */
 #include <mpi.h>
-#include <cuda_runtime.h>
+#include "test_gpu_compat.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include <math.h>
 #include <unistd.h>
-
-#define CUDA_CHECK(call) do {                                          \
-    cudaError_t _e = (call);                                           \
-    if (_e != cudaSuccess) {                                           \
-        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
-                cudaGetErrorString(_e));                                \
-        MPI_Abort(MPI_COMM_WORLD, 1);                                  \
-    }                                                                  \
-} while (0)
+#include <limits.h>
+#include <stddef.h>
 
 /* ------------------------------------------------------------------ */
 /* Type / op descriptors (same 20 types as CPU correctness test)      */
@@ -87,11 +82,14 @@ __global__ void fill_elements(char *buf, int count, int elem_size,
     }
 }
 
+/* d_val is a caller-owned, preallocated 16-byte device scratch buffer for the
+ * fill value template. It is allocated once in main() and reused for every
+ * call: per-call gpuMalloc/gpuFree of this tiny buffer hammers the Open MPI
+ * rcache VMA interval tree and can wedge collectives (see the buffer-reuse
+ * note in main()), so it must NOT be allocated inside this hot path. */
 static void gpu_fill_buf(void *d_buf, int count, size_t elem_size,
-                         int is_float, int rank)
+                         int is_float, int rank, char *d_val)
 {
-    /* Build the value on host, then copy the raw bytes to constant memory
-     * via a small device buffer. */
     char val_bytes[16] = {0};  /* large enough for any MPI type */
 
     if (is_float == 1) {
@@ -105,16 +103,12 @@ static void gpu_fill_buf(void *d_buf, int count, size_t elem_size,
         memcpy(val_bytes, &v, elem_size);
     }
 
-    /* Copy value template to device */
-    char *d_val;
-    CUDA_CHECK(cudaMalloc(&d_val, 16));
-    CUDA_CHECK(cudaMemcpy(d_val, val_bytes, 16, cudaMemcpyHostToDevice));
+    GPU_CHECK(gpuMemcpy(d_val, val_bytes, 16, gpuMemcpyHostToDevice));
 
     int grid = (count + 255) / 256;
-    fill_elements<<<grid, 256>>>((char *)d_buf, count, (int)elem_size, d_val);
-    CUDA_CHECK(cudaDeviceSynchronize());
-
-    cudaFree(d_val);
+    GPU_KERNEL_LAUNCH(fill_elements, dim3(grid), dim3(256), 0, 0,
+                      (char *)d_buf, count, (int)elem_size, d_val);
+    GPU_CHECK(gpuDeviceSynchronize());
 }
 
 /* ------------------------------------------------------------------ */
@@ -161,7 +155,7 @@ static int check_buf(const void *h_buf, int count, const type_info_t *t,
 static void usage(const char *prog)
 {
     fprintf(stderr, "Usage: %s -c count [-a algo] [-h]\n", prog);
-    fprintf(stderr, "  -c count   element count (required)\n");
+    fprintf(stderr, "  -c count   base element count for sweep (required)\n");
     fprintf(stderr, "  -a algo    set CAIL_ALGO before MPI_Init (default: auto)\n");
 }
 
@@ -171,14 +165,14 @@ static void usage(const char *prog)
 
 int main(int argc, char **argv)
 {
-    int count = 0;
+    int base_count = 0;
     const char *algo_name = NULL;
     int opt;
 
     while ((opt = getopt(argc, argv, "c:a:h")) != -1) {
         switch (opt) {
         case 'c':
-            count = atoi(optarg);
+            base_count = atoi(optarg);
             break;
         case 'a':
             algo_name = optarg;
@@ -191,7 +185,7 @@ int main(int argc, char **argv)
         }
     }
 
-    if (count <= 0) {
+    if (base_count <= 0) {
         fprintf(stderr, "%s: -c count is required and must be > 0\n", argv[0]);
         usage(argv[0]);
         return 1;
@@ -204,21 +198,21 @@ int main(int argc, char **argv)
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 
     int dev_count = 0;
-    cudaGetDeviceCount(&dev_count);
+    gpuGetDeviceCount(&dev_count);
     if (dev_count == 0) {
-        if (rank == 0) fprintf(stderr, "No CUDA devices found\n");
+        if (rank == 0) fprintf(stderr, "No GPU devices found\n");
         MPI_Finalize();
         return 1;
     }
-    CUDA_CHECK(cudaSetDevice(rank % dev_count));
+    GPU_CHECK(gpuSetDevice(rank % dev_count));
 
     if (!algo_name) algo_name = "auto";
     int is_pof2 = (nprocs > 0) && ((nprocs & (nprocs - 1)) == 0);
 
     if (rank == 0) {
         printf("=== cail GPU allreduce correctness tests ===\n");
-        printf("nprocs=%d  GPUs_per_node=%d  count=%d  algo=%s\n",
-               nprocs, dev_count, count, algo_name);
+        printf("nprocs=%d  GPUs_per_node=%d  base_count=%d  algo=%s\n",
+               nprocs, dev_count, base_count, algo_name);
         fflush(stdout);
     }
 
@@ -259,82 +253,140 @@ int main(int argc, char **argv)
     int g_pass = 0;
     int g_fail = 0;
 
-    /* Pre-allocate GPU and host buffers at the maximum needed size
-     * (count * largest_type_size).  Reusing a single allocation avoids
-     * hammering the Open MPI rcache VMA interval tree with hundreds of
-     * cudaMalloc/cudaFree cycles, which triggers a SEGV in
-     * opal_interval_tree_traverse at certain power-of-2 buffer sizes. */
     size_t max_elem_size = 0;
     for (int ti = 0; ti < ntypes; ti++)
         if (types[ti].size > max_elem_size) max_elem_size = types[ti].size;
-    size_t max_bufsize = (size_t)count * max_elem_size;
+
+    /* Keep the sweep small: edge counts + base + remainder-forcing offsets. */
+    int candidate_counts[6];
+    int ncandidates = 0;
+    candidate_counts[ncandidates++] = 1;
+    candidate_counts[ncandidates++] = 2;
+    candidate_counts[ncandidates++] = 3;
+    candidate_counts[ncandidates++] = base_count;
+    if (base_count < INT_MAX)
+        candidate_counts[ncandidates++] = base_count + 1;
+    if (base_count <= INT_MAX - 3)
+        candidate_counts[ncandidates++] = base_count + 3;
+
+    int counts[6];
+    int ncounts = 0;
+    size_t max_count_allowed = SIZE_MAX / max_elem_size;
+    for (int ci = 0; ci < ncandidates; ci++) {
+        int c = candidate_counts[ci];
+        if (c <= 0) continue;
+        if ((size_t)c > max_count_allowed) continue;
+
+        int dup = 0;
+        for (int ei = 0; ei < ncounts; ei++) {
+            if (counts[ei] == c) {
+                dup = 1;
+                break;
+            }
+        }
+        if (!dup) counts[ncounts++] = c;
+    }
+
+    if (ncounts == 0) {
+        if (rank == 0) fprintf(stderr, "No valid test counts after sweep filtering\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    int max_count = counts[0];
+    for (int ci = 1; ci < ncounts; ci++)
+        if (counts[ci] > max_count) max_count = counts[ci];
+
+    if (rank == 0) {
+        printf("count sweep:");
+        for (int ci = 0; ci < ncounts; ci++)
+            printf(" %d", counts[ci]);
+        printf("\n");
+        fflush(stdout);
+    }
+
+    /* Pre-allocate GPU and host buffers once at the largest sweep count
+     * (max_count * largest_type_size). Reusing a single allocation avoids
+     * hammering the Open MPI rcache VMA interval tree with hundreds of
+     * gpuMalloc/gpuFree cycles, which triggers a SEGV in
+     * opal_interval_tree_traverse at certain power-of-2 buffer sizes. */
+    size_t max_bufsize = (size_t)max_count * max_elem_size;
 
     void *d_send, *d_recv, *d_inplace;
-    CUDA_CHECK(cudaMalloc(&d_send, max_bufsize));
-    CUDA_CHECK(cudaMalloc(&d_recv, max_bufsize));
-    CUDA_CHECK(cudaMalloc(&d_inplace, max_bufsize));
+    GPU_CHECK(gpuMalloc(&d_send, max_bufsize));
+    GPU_CHECK(gpuMalloc(&d_recv, max_bufsize));
+    GPU_CHECK(gpuMalloc(&d_inplace, max_bufsize));
+    /* One-time 16-byte scratch for the fill-value template, reused by every
+     * gpu_fill_buf() call. Allocated here (not per-call) for the same
+     * rcache-stability reason as the buffers above. */
+    char *d_val;
+    GPU_CHECK(gpuMalloc(&d_val, 16));
     void *h_check = malloc(max_bufsize);
     if (!h_check) { fprintf(stderr, "malloc failed\n"); MPI_Abort(MPI_COMM_WORLD, 1); }
 
-    for (int ti = 0; ti < ntypes; ti++) {
-        for (int oi = 0; oi < nops; oi++) {
-            /* PROD overflows quickly — skip for large process counts. */
-            if (ops[oi].mpi_op == MPI_PROD && nprocs > 4) continue;
-
-            double expected = compute_expected(ops[oi].mpi_op, nprocs);
-            size_t bufsize = (size_t)count * types[ti].size;
-
-            /* --- Separate send/recv buffers --- */
-            CUDA_CHECK(cudaMemset(d_recv, 0, bufsize));
-            gpu_fill_buf(d_send, count, types[ti].size,
-                         types[ti].is_float, rank);
-
-            MPI_Allreduce(d_send, d_recv, count, types[ti].mpi_type,
-                          ops[oi].mpi_op, MPI_COMM_WORLD);
-
-            CUDA_CHECK(cudaMemcpy(h_check, d_recv, bufsize,
-                                  cudaMemcpyDeviceToHost));
-
-            {
-                int pass = check_buf(h_check, count, &types[ti], expected);
-                if (rank == 0) {
-                    printf("%s: algo=%s %s x %s count=%d np=%d pof2=%s\n",
-                           pass ? "PASS" : "FAIL", algo_name,
-                           types[ti].name, ops[oi].name,
-                           count, nprocs, is_pof2 ? "yes" : "no");
-                    fflush(stdout);
+    for (int ci = 0; ci < ncounts; ci++) {
+        int count = counts[ci];
+
+        for (int ti = 0; ti < ntypes; ti++) {
+            for (int oi = 0; oi < nops; oi++) {
+                /* PROD overflows quickly — skip for large process counts. */
+                if (ops[oi].mpi_op == MPI_PROD && nprocs > 4) continue;
+
+                double expected = compute_expected(ops[oi].mpi_op, nprocs);
+                size_t bufsize = (size_t)count * types[ti].size;
+
+                /* --- Separate send/recv buffers --- */
+                GPU_CHECK(gpuMemset(d_recv, 0, bufsize));
+                gpu_fill_buf(d_send, count, types[ti].size,
+                             types[ti].is_float, rank, d_val);
+
+                MPI_Allreduce(d_send, d_recv, count, types[ti].mpi_type,
+                              ops[oi].mpi_op, MPI_COMM_WORLD);
+
+                GPU_CHECK(gpuMemcpy(h_check, d_recv, bufsize,
+                                    gpuMemcpyDeviceToHost));
+
+                {
+                    int pass = check_buf(h_check, count, &types[ti], expected);
+                    if (rank == 0) {
+                        printf("%s: algo=%s %s x %s count=%d np=%d pof2=%s\n",
+                               pass ? "PASS" : "FAIL", algo_name,
+                               types[ti].name, ops[oi].name,
+                               count, nprocs, is_pof2 ? "yes" : "no");
+                        fflush(stdout);
+                    }
+                    if (pass) g_pass++; else g_fail++;
                 }
-                if (pass) g_pass++; else g_fail++;
-            }
-
-            /* --- MPI_IN_PLACE --- */
-            gpu_fill_buf(d_inplace, count, types[ti].size,
-                         types[ti].is_float, rank);
-
-            MPI_Allreduce(MPI_IN_PLACE, d_inplace, count, types[ti].mpi_type,
-                          ops[oi].mpi_op, MPI_COMM_WORLD);
-
-            CUDA_CHECK(cudaMemcpy(h_check, d_inplace, bufsize,
-                                  cudaMemcpyDeviceToHost));
 
-            {
-                int pass = check_buf(h_check, count, &types[ti], expected);
-                if (rank == 0) {
-                    printf("%s: algo=%s MPI_IN_PLACE %s x %s count=%d np=%d pof2=%s\n",
-                           pass ? "PASS" : "FAIL", algo_name,
-                           types[ti].name, ops[oi].name,
-                           count, nprocs, is_pof2 ? "yes" : "no");
-                    fflush(stdout);
+                /* --- MPI_IN_PLACE --- */
+                gpu_fill_buf(d_inplace, count, types[ti].size,
+                             types[ti].is_float, rank, d_val);
+
+                MPI_Allreduce(MPI_IN_PLACE, d_inplace, count, types[ti].mpi_type,
+                              ops[oi].mpi_op, MPI_COMM_WORLD);
+
+                GPU_CHECK(gpuMemcpy(h_check, d_inplace, bufsize,
+                                    gpuMemcpyDeviceToHost));
+
+                {
+                    int pass = check_buf(h_check, count, &types[ti], expected);
+                    if (rank == 0) {
+                        printf("%s: algo=%s MPI_IN_PLACE %s x %s count=%d np=%d pof2=%s\n",
+                               pass ? "PASS" : "FAIL", algo_name,
+                               types[ti].name, ops[oi].name,
+                               count, nprocs, is_pof2 ? "yes" : "no");
+                        fflush(stdout);
+                    }
+                    if (pass) g_pass++; else g_fail++;
                 }
-                if (pass) g_pass++; else g_fail++;
             }
         }
     }
 
     free(h_check);
-    cudaFree(d_send);
-    cudaFree(d_recv);
-    cudaFree(d_inplace);
+    gpuFree(d_send);
+    gpuFree(d_recv);
+    gpuFree(d_inplace);
+    gpuFree(d_val);
 
     MPI_Barrier(MPI_COMM_WORLD);
 
diff --git a/tests/test_gpu_compat.h b/tests/test_gpu_compat.h
new file mode 100644
index 0000000..165e6df
--- /dev/null
+++ b/tests/test_gpu_compat.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */
+
+#ifndef TEST_GPU_COMPAT_H
+#define TEST_GPU_COMPAT_H
+
+#ifdef __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+typedef hipError_t gpuError_t;
+#define gpuSuccess hipSuccess
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemcpy hipMemcpy
+#define gpuMemset hipMemset
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuSetDevice hipSetDevice
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuGetErrorString hipGetErrorString
+#define GPU_KERNEL_LAUNCH(kernel, grid, block, shmem, stream, ...) \
+    hipLaunchKernelGGL(kernel, grid, block, shmem, stream, __VA_ARGS__)
+#define GPU_ERROR_PREFIX "HIP"
+#else
+#include <cuda_runtime.h>
+typedef cudaError_t gpuError_t;
+#define gpuSuccess cudaSuccess
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemcpy cudaMemcpy
+#define gpuMemset cudaMemset
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuSetDevice cudaSetDevice
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuGetErrorString cudaGetErrorString
+#define GPU_KERNEL_LAUNCH(kernel, grid, block, shmem, stream, ...) \
+    kernel<<<grid, block, shmem, stream>>>(__VA_ARGS__)
+#define GPU_ERROR_PREFIX "CUDA"
+#endif
+
+#define GPU_CHECK(call) do {                                           \
+    gpuError_t _e = (call);                                            \
+    if (_e != gpuSuccess) {                                            \
+        fprintf(stderr, "%s error %s:%d: %s\n", GPU_ERROR_PREFIX,   \
+                __FILE__, __LINE__, gpuGetErrorString(_e));            \
+        MPI_Abort(MPI_COMM_WORLD, 1);                                  \
+    }                                                                   \
+} while (0)
+
+#endif /* TEST_GPU_COMPAT_H */