From a67328cd907f98dc0367b493305328273fe92863 Mon Sep 17 00:00:00 2001 From: Mike Wilkins Date: Fri, 12 Jun 2026 18:06:01 -0400 Subject: [PATCH] Add ROCm/HIP GPU backend Add an AMD ROCm/HIP reduction backend to CAIL alongside the existing CUDA path, so GPU-aware MPI_Allreduce interposition works on AMD GPUs. - configure: --with-rocm / --with-rocm-arch; m4/ax_check_rocm.m4 probes hipcc, headers, and runtime; CUDA and ROCm backends are mutually exclusive and selected at configure time. - src/gpu/rocm: cail_rocm_mem.c (device-pointer detection, hipMalloc/ hipMemcpy with a post-copy hipStreamSynchronize(0) release fence so the staged buffer is visible to a subsequent GPU-aware MPI / NIC / GDRCopy read; stream-scoped so unrelated device work is not serialized) and cail_rocm_reduce.hip (elementwise SUM/PROD/MIN/MAX kernels for all supported datatypes). - tests: single-source GPU tests via tests/test_gpu_compat.h, compiled for CUDA or HIP from one source. The GPU correctness test sweeps a small set of element counts {1, 2, 3, base, base+1, base+3} to cover edge and non-power-of-two remainder paths in addition to the requested size. - hip_lt.sh: libtool shim so .hip sources link cleanly under the autotools build. Verified on AMD MI210 (gfx90a), ROCm 6.4.2: correctness 160/160 per count across np=2/3/4 and all algorithms (recursive_doubling, ring, rabenseifner), single-node and 2-node, plus OSU osu_allreduce interpose (CPU and ROCm device-to-device). GPU-aware allreduce shows a significant speedup over native for messages above the small-message cutoff, with native faster for small messages. --- Makefile.am | 2 +- README.md | 20 +- configure.ac | 65 ++++- hip_lt.sh | 57 ++++ m4/ax_check_cuda.m4 | 19 +- m4/ax_check_rocm.m4 | 91 +++++++ src/Makefile.am | 4 + src/gpu/Makefile.am | 4 + src/gpu/cail_gpu.h | 4 +- src/gpu/rocm/Makefile.am | 21 +- src/gpu/rocm/cail_rocm_mem.c | 43 +++ src/gpu/rocm/cail_rocm_reduce.hip | 339 ++++++++++++++++++++++++ src/gpu/rocm/cail_rocm_reduce_stub.c | 25 -- tests/Makefile.am | 26 +- tests/bench_allreduce_gpu.cu | 25 +- tests/test_allreduce_correctness_gpu.cu | 242 ++++++++++------- tests/test_gpu_compat.h | 51 ++++ 17 files changed, 876 insertions(+), 162 deletions(-) create mode 100755 hip_lt.sh create mode 100644 m4/ax_check_rocm.m4 create mode 100644 src/gpu/rocm/cail_rocm_mem.c create mode 100644 src/gpu/rocm/cail_rocm_reduce.hip delete mode 100644 src/gpu/rocm/cail_rocm_reduce_stub.c create mode 100644 tests/test_gpu_compat.h diff --git a/Makefile.am b/Makefile.am index b261bf3..57fe041 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,5 +7,5 @@ ACLOCAL_AMFLAGS = -I m4 pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = cail.pc -EXTRA_DIST = autogen.sh cail.pc.in cuda_lt.sh +EXTRA_DIST = autogen.sh cail.pc.in cuda_lt.sh hip_lt.sh DISTCLEANFILES = cail.pc diff --git a/README.md b/README.md index 0738747..8efac87 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ CAIL is a drop-in GPU-aware `MPI_Allreduce` optimization library. It uses the MPI profiling interface (PMPI) to transparently intercept `MPI_Allreduce` calls -and route them through optimized algorithms with native CUDA reduction kernels. +and route them through optimized algorithms with native CUDA/HIP reduction kernels. No application changes are required, just `LD_PRELOAD` the library. ## Quick Start @@ -10,7 +10,7 @@ No application changes are required, just `LD_PRELOAD` the library. ### Prerequisites - MPI implementation (OpenMPI, MPICH, Intel MPI, etc.) -- CUDA Toolkit (nvcc, cudart) +- CUDA Toolkit (nvcc, cudart) or ROCm Toolkit (hipcc, amdhip64) - Autotools (autoconf >= 2.69, automake, libtool) ### Build @@ -48,6 +48,8 @@ Look for `[cail] initialized:` and `[cail] algorithm=` on stderr to confirm CAIL |---------------------------------|------------------------------------------------------|----------| | `--with-cuda=PATH` | Path to CUDA toolkit installation | auto | | `--with-cuda-arch=SM` | NVCC architecture flag (e.g. `sm_70`, `sm_90`) | `sm_70` | +| `--with-rocm=PATH` | Path to ROCm toolkit installation | auto | +| `--with-rocm-arch=GFX` | HIP offload architecture (e.g. `gfx90a`); empty lets hipcc autodetect | empty | | `--with-mpi=PATH` | Path to MPI installation | auto | | `--enable-host-path` | Build without GPU support (host-only, uses `MPI_Reduce_local`) | no | | `--enable-debug` | Debug build with `-g -O0` | no | @@ -55,7 +57,13 @@ Look for `[cail] initialized:` and `[cail] algorithm=` on stderr to confirm CAIL | `--enable-ring` | Enable ring algorithm | yes | | `--enable-rabenseifner` | Enable Rabenseifner algorithm | yes | -### Host-Only Build (No CUDA) +### ROCm Build + +```sh +./configure --with-rocm=/opt/rocm --with-rocm-arch=gfx90a +``` + +### Host-Only Build (No CUDA/ROCm) ```sh ./configure --enable-host-path @@ -68,7 +76,7 @@ with `malloc`/`free`. Useful for development or CPU-only clusters. CAIL intercepts `MPI_Allreduce` when all of these are true: -- Buffer resides on a CUDA device (or built with `--enable-host-path`) +- Buffer resides on a CUDA or ROCm device (or built with `--enable-host-path`) - Datatype is one of the 20 supported MPI types (see below) - Operation is SUM, PROD, MAX, or MIN - Communicator is an intracommunicator @@ -211,9 +219,9 @@ CAIL_ALGO=ring CAIL_DEBUG=1 mpirun -np 4 -x LD_PRELOAD=... ./my_app | `test_allreduce_basic` | CPU | Float SUM across 7 message sizes, internal count sweep | | `test_allreduce_correctness` | CPU | All 20 datatypes × 4 ops × {normal, MPI_IN_PLACE}. Requires `-c count`. | | `test_allreduce_edge` | CPU | Edge cases: count=0, count=1, large counts, single process | -| `test_allreduce_correctness_gpu` | GPU | All 20 datatypes × 4 ops × {normal, MPI_IN_PLACE}. Requires `-c count`. CUDA build only. | +| `test_allreduce_correctness_gpu` | GPU | All 20 datatypes × 4 ops × {normal, MPI_IN_PLACE}. Requires `-c count`. CUDA or ROCm build. | | `bench_allreduce` | CPU | Performance benchmark across message sizes | -| `bench_allreduce_gpu` | GPU | GPU performance benchmark. CUDA build only. | +| `bench_allreduce_gpu` | GPU | GPU performance benchmark. CUDA or ROCm build. | ### Test CLI diff --git a/configure.ac b/configure.ac index c8e0b6d..0042d9f 100644 --- a/configure.ac +++ b/configure.ac @@ -99,6 +99,36 @@ AS_IF([test "x$enable_rabenseifner" = "xyes"], [ [Define to 1 to enable Rabenseifner algorithm]) ]) +dnl --------------------------------------------------------------------------- +dnl GPU backend selection options +dnl --------------------------------------------------------------------------- +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda=PATH], + [Path to CUDA toolkit @<:@default=auto@:>@])], + [with_cuda=$withval], + [with_cuda=auto]) + +AC_ARG_WITH([rocm], + [AS_HELP_STRING([--with-rocm=PATH], + [Path to ROCm toolkit @<:@default=auto@:>@])], + [with_rocm=$withval], + [with_rocm=auto]) + +user_with_cuda=$with_cuda +user_with_rocm=$with_rocm + +AS_IF([test "x$with_rocm" != "xauto" && test "x$with_rocm" != "xno" && test "x$with_cuda" = "xauto"], [ + with_cuda=no +]) +AS_IF([test "x$with_cuda" != "xauto" && test "x$with_cuda" != "xno" && test "x$with_rocm" = "xauto"], [ + with_rocm=no +]) + +AS_IF([test "x$enable_host_path" = "xyes"], [ + with_cuda=no + with_rocm=no +]) + dnl --------------------------------------------------------------------------- dnl --with-cuda-arch : NVCC architecture flag dnl --------------------------------------------------------------------------- @@ -111,10 +141,41 @@ AC_ARG_WITH([cuda-arch], AC_SUBST([CUDA_ARCH], [$cuda_arch]) dnl --------------------------------------------------------------------------- -dnl CUDA (macro provided by m4/ax_check_cuda.m4) -dnl Must come after --with-cuda-arch so $cuda_arch is set for NVCCFLAGS +dnl --with-rocm-arch : HIP architecture flag +dnl --------------------------------------------------------------------------- +AC_ARG_WITH([rocm-arch], + [AS_HELP_STRING([--with-rocm-arch=GFX], + [Set ROCm offload architecture for hipcc (e.g. gfx90a); default empty uses hipcc autodetect])], + [rocm_arch=$withval], + [rocm_arch=]) + +AC_SUBST([ROCM_ARCH], [$rocm_arch]) + +dnl --------------------------------------------------------------------------- +dnl CUDA / ROCm detection dnl --------------------------------------------------------------------------- AX_CHECK_CUDA +AX_CHECK_ROCM + +dnl --------------------------------------------------------------------------- +dnl Final backend selection validation +dnl --------------------------------------------------------------------------- +AS_IF([test "x$enable_host_path" = "xyes"], [ + AS_IF([test "x$user_with_cuda" != "xauto" && test "x$user_with_cuda" != "xno"], [ + AC_MSG_ERROR([host-path excludes GPU backends]) + ]) + AS_IF([test "x$user_with_rocm" != "xauto" && test "x$user_with_rocm" != "xno"], [ + AC_MSG_ERROR([host-path excludes GPU backends]) + ]) +], [ + AS_IF([test "x$have_cuda" = "xyes" && test "x$have_rocm" = "xyes"], [ + AC_MSG_ERROR([choose one GPU backend: CUDA or ROCm]) + ], [ + AS_IF([test "x$have_cuda" = "xno" && test "x$have_rocm" = "xno"], [ + AC_MSG_ERROR([no GPU backend: pass --with-cuda=PATH, --with-rocm=PATH, or --enable-host-path]) + ]) + ]) +]) dnl --------------------------------------------------------------------------- dnl Output diff --git a/hip_lt.sh b/hip_lt.sh new file mode 100755 index 0000000..715f5d2 --- /dev/null +++ b/hip_lt.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) 2026 Cornelis Networks. All rights reserved. + +# hip_lt.sh — Wrapper to compile .hip files into libtool .lo objects. +# Implements the same UCC-pattern rationale as cuda_lt.sh: emit PIC/non-PIC +# objects plus .lo metadata so libtool keeps HIP objects during convenience-lib linking. + +set -e + +libtool_file=$1 +lo_filepath=$2 + +# Derive .o path from .lo path +o_filepath="${lo_filepath%.lo}.o" +lo_dir=$(dirname "$o_filepath") +o_filename=$(basename "$o_filepath") + +# Libtool convention: PIC objects go in .libs/, non-PIC in current dir +local_pic_dir=".libs/" +local_npic_dir="" +pic_dir="${lo_dir}/${local_pic_dir}" +npic_dir="${lo_dir}/${local_npic_dir}" + +pic_filepath="${pic_dir}${o_filename}" +npic_filepath="${npic_dir}${o_filename}" +local_pic_filepath="${local_pic_dir}${o_filename}" +local_npic_filepath="${local_npic_dir}${o_filename}" + +mkdir -p "$pic_dir" + +# Build PIC version (for shared library) +cmd="${@:3} -fPIC -o ${pic_filepath}" +echo "$cmd" +$cmd + +# Build non-PIC version (for static library) +cmd="${@:3} -o ${npic_filepath}" +echo "$cmd" +$cmd + +# Write the .lo metadata file that libtool expects +libtool_version="$(${libtool_file} --version | head -1 | sed 's/^/#/g')" + +cat > "${lo_filepath}" <@])], - [with_cuda=$withval], - [with_cuda=auto]) + AS_IF([test "x$with_cuda" = "x"], [ + AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda=PATH], + [Path to CUDA toolkit @<:@default=auto@:>@])], + [with_cuda=$withval], + [with_cuda=auto]) + ]) have_cuda=no @@ -52,13 +54,6 @@ AC_DEFUN([AX_CHECK_CUDA], [ ]) ]) - dnl If CUDA required (not host-path and not --without-cuda), fail - AS_IF([test "x$have_cuda" = "xno" && test "x$with_cuda" != "xno" && test "x$enable_host_path" != "xyes"], [ - AS_IF([test "x$with_cuda" != "xauto"], [ - AC_MSG_ERROR([CUDA requested but not found. Use --without-cuda or --enable-host-path for CPU-only build.]) - ]) - ]) - AC_SUBST([CUDA_HOME]) AC_SUBST([NVCC]) AC_SUBST([CUDA_CFLAGS]) diff --git a/m4/ax_check_rocm.m4 b/m4/ax_check_rocm.m4 new file mode 100644 index 0000000..d00a4a3 --- /dev/null +++ b/m4/ax_check_rocm.m4 @@ -0,0 +1,91 @@ +dnl --------------------------------------------------------------------------- +dnl AX_CHECK_ROCM — Detect ROCm toolkit, hipcc, and set build variables +dnl --------------------------------------------------------------------------- +AC_DEFUN([AX_CHECK_ROCM], [ + dnl --with-rocm=PATH + AS_IF([test "x$with_rocm" = "x"], [ + AC_ARG_WITH([rocm], + [AS_HELP_STRING([--with-rocm=PATH], + [Path to ROCm toolkit @<:@default=auto@:>@])], + [with_rocm=$withval], + [with_rocm=auto]) + ]) + + have_rocm=no + ROCM_HOME= + HIPCC= + ROCM_CFLAGS= + ROCM_LIBS= + HIPCCFLAGS= + + dnl Skip ROCm if --without-rocm + AS_IF([test "x$with_rocm" != "xno"], [ + dnl Find hipcc + AS_IF([test "x$with_rocm" != "xauto"], [ + HIPCC="$with_rocm/bin/hipcc" + ROCM_HOME="$with_rocm" + ], [ + AC_PATH_PROG([HIPCC], [hipcc], []) + AS_IF([test "x$HIPCC" != "x"], [ + dnl Derive ROCM_HOME from hipcc location + rocm_bin_dir=`AS_DIRNAME([$HIPCC])` + ROCM_HOME=`AS_DIRNAME([$rocm_bin_dir])` + ], [ + AS_IF([test -x "/opt/rocm/bin/hipcc"], [ + HIPCC="/opt/rocm/bin/hipcc" + ROCM_HOME="/opt/rocm" + ]) + ]) + ]) + + dnl Check hipcc exists + AS_IF([test "x$HIPCC" != "x" && test -x "$HIPCC"], [ + rocm_inc="$ROCM_HOME/include" + dnl Prefer lib64 if that is where libamdhip64.so lives (some distros) + AS_IF([test -f "$ROCM_HOME/lib64/libamdhip64.so"], + [rocm_lib="$ROCM_HOME/lib64"], + [rocm_lib="$ROCM_HOME/lib"]) + + dnl Validate HIP runtime header and ROCm runtime library + AC_CHECK_FILE([$rocm_inc/hip/hip_runtime.h], [ + AC_CHECK_FILE([$rocm_lib/libamdhip64.so], [ + have_rocm=yes + ROCM_CFLAGS="-I$rocm_inc -D__HIP_PLATFORM_AMD__" + ROCM_LIBS="-L$rocm_lib -lamdhip64" + HIPCCFLAGS="-fPIC" + AS_IF([test "x$rocm_arch" != "x"], [ + HIPCCFLAGS="$HIPCCFLAGS --offload-arch=$rocm_arch" + ]) + AC_DEFINE([HAVE_ROCM], [1], [Define to 1 if ROCm is available]) + AC_MSG_NOTICE([ROCm found: $ROCM_HOME]) + ], [ + AS_IF([test "x$with_rocm" != "xauto"], [ + AC_MSG_ERROR([libamdhip64.so not found in $rocm_lib]) + ], [ + AC_MSG_NOTICE([libamdhip64.so not found in $rocm_lib — ROCm disabled]) + ]) + ]) + ], [ + AS_IF([test "x$with_rocm" != "xauto"], [ + AC_MSG_ERROR([hip_runtime.h not found in $rocm_inc/hip]) + ], [ + AC_MSG_NOTICE([hip_runtime.h not found in $rocm_inc/hip — ROCm disabled]) + ]) + ]) + ], [ + AS_IF([test "x$with_rocm" != "xauto"], [ + AC_MSG_ERROR([hipcc not found at $HIPCC]) + ], [ + AC_MSG_NOTICE([hipcc not found — ROCm disabled]) + ]) + ]) + ]) + + AC_SUBST([ROCM_HOME]) + AC_SUBST([HIPCC]) + AC_SUBST([ROCM_CFLAGS]) + AC_SUBST([ROCM_LIBS]) + AC_SUBST([HIPCCFLAGS]) + + AM_CONDITIONAL([HAVE_ROCM], [test "x$have_rocm" = "xyes"]) +]) diff --git a/src/Makefile.am b/src/Makefile.am index a0a2756..12673a2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -15,6 +15,10 @@ if HAVE_CUDA libcail_la_LDFLAGS += $(CUDA_LIBS) -lstdc++ endif +if HAVE_ROCM +libcail_la_LDFLAGS += $(ROCM_LIBS) -lstdc++ +endif + include_HEADERS = core/cail.h AM_CFLAGS = $(MPI_CFLAGS) diff --git a/src/gpu/Makefile.am b/src/gpu/Makefile.am index cb474fe..0f41df7 100644 --- a/src/gpu/Makefile.am +++ b/src/gpu/Makefile.am @@ -13,6 +13,10 @@ libcail_gpu_la_LIBADD += host/libcail_host.la else if HAVE_CUDA libcail_gpu_la_LIBADD += cuda/libcail_cuda.la +else +if HAVE_ROCM +libcail_gpu_la_LIBADD += rocm/libcail_rocm.la +endif endif endif diff --git a/src/gpu/cail_gpu.h b/src/gpu/cail_gpu.h index 9cda23c..d3b7633 100644 --- a/src/gpu/cail_gpu.h +++ b/src/gpu/cail_gpu.h @@ -3,12 +3,12 @@ /* cail_gpu.h — Backend-agnostic GPU interface for cail * Implemented by: CUDA (cail_cuda_reduce.cu + cail_cuda_mem.c) * Host-path (cail_host_reduce.c) - * ROCm stub (cail_rocm_reduce_stub.c) + * ROCm (cail_rocm_reduce.hip + cail_rocm_mem.c) */ #ifndef CAIL_GPU_H #define CAIL_GPU_H -#include "cail_types.h" +#include "../core/cail_types.h" #include #include diff --git a/src/gpu/rocm/Makefile.am b/src/gpu/rocm/Makefile.am index d03b97a..d6b1783 100644 --- a/src/gpu/rocm/Makefile.am +++ b/src/gpu/rocm/Makefile.am @@ -1,7 +1,26 @@ # Copyright (c) 2026 Cornelis Networks. All rights reserved. +if HAVE_ROCM + noinst_LTLIBRARIES = libcail_rocm.la -libcail_rocm_la_SOURCES = cail_rocm_stub.c +libcail_rocm_la_SOURCES = cail_rocm_mem.c cail_rocm_reduce.hip + +SUFFIXES = .hip + +.hip.lo: + /bin/bash $(top_srcdir)/hip_lt.sh "$(LIBTOOL)" $@ \ + $(HIPCC) $(HIPCCFLAGS) $(ROCM_CFLAGS) $(MPI_CFLAGS) \ + -I$(top_srcdir)/src/core -I$(top_srcdir)/src/gpu -c $< + +AM_CFLAGS = $(ROCM_CFLAGS) $(MPI_CFLAGS) -I$(top_srcdir)/src/core -I$(top_srcdir)/src/gpu +CLEANFILES = cail_rocm_reduce.lo + +else + +noinst_LTLIBRARIES = libcail_rocm.la +libcail_rocm_la_SOURCES = cail_rocm_stub.c AM_CFLAGS = -I$(top_srcdir)/src/core -I$(top_srcdir)/src/gpu + +endif diff --git a/src/gpu/rocm/cail_rocm_mem.c b/src/gpu/rocm/cail_rocm_mem.c new file mode 100644 index 0000000..0341891 --- /dev/null +++ b/src/gpu/rocm/cail_rocm_mem.c @@ -0,0 +1,43 @@ +/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */ + +#include "../cail_gpu.h" +#include +#include + +int cail_rocm_init(void); +void cail_rocm_finalize(void); +int cail_rocm_synchronize(void); + +int cail_gpu_is_device_pointer(const void *ptr) { + hipPointerAttribute_t attr; + hipError_t err = hipPointerGetAttributes(&attr, ptr); + if (err != hipSuccess) { + hipGetLastError(); + return 0; + } + /* ROCm 6.x may report unregistered host pointers as hipSuccess with + hipMemoryTypeUnregistered; only device/managed types are GPU-resident. */ + return (attr.type == hipMemoryTypeDevice || attr.type == hipMemoryTypeManaged); +} + +int cail_gpu_malloc(void **ptr, size_t size) { + return (hipMalloc(ptr, size) == hipSuccess) ? 0 : -1; +} + +int cail_gpu_free(void *ptr) { + return (hipFree(ptr) == hipSuccess) ? 0 : -1; +} + +int cail_gpu_memcpy(void *dst, const void *src, size_t size) { + if (hipMemcpy(dst, src, size, hipMemcpyDefault) != hipSuccess) + return -1; + /* Send-side release fence: the staged buffer may next be read by an + external GPU-aware MPI / NIC / GDRCopy engine, not by HIP stream work. + Synchronize the copy's (default) stream before exposing dst to PMPI. + Stream-scoped (not device-wide) so unrelated device work is not serialized. */ + return (hipStreamSynchronize(0) == hipSuccess) ? 0 : -1; +} + +int cail_gpu_synchronize(void) { return cail_rocm_synchronize(); } +int cail_gpu_init(void) { return cail_rocm_init(); } +void cail_gpu_finalize(void) { cail_rocm_finalize(); } diff --git a/src/gpu/rocm/cail_rocm_reduce.hip b/src/gpu/rocm/cail_rocm_reduce.hip new file mode 100644 index 0000000..518ee27 --- /dev/null +++ b/src/gpu/rocm/cail_rocm_reduce.hip @@ -0,0 +1,339 @@ +/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */ + +#include "../cail_gpu.h" +#include "../../core/cail_types.h" +#include +#include +#include +#include + +/* Pinned host byte for PCIe read fence in cail_gpu_flush_recv_buf(). + hipMemcpy D→H with a single byte forces a PCIe read transaction that + flushes all prior posted writes (GDRCopy BAR stores, NIC RDMA) to GPU memory. */ +static char *flush_host_byte = NULL; + +#define BLOCK_SIZE 256 +#define ELEMENTS_PER_THREAD 4 +#define MAX_GRID 65535 + +/* gfx90a wave64 does not affect this grid-stride implementation, so no tuning changes are required. */ +typedef void (*cail_rocm_kernel_fn)(const void*, void*, size_t, hipStream_t); + +static hipStream_t cail_stream = 0; +static cail_rocm_kernel_fn kernel_table[CAIL_NUM_OPS][CAIL_NUM_DTYPES]; + +template +struct OpSum { + __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } +}; + +template +struct OpProd { + __device__ __forceinline__ T operator()(T a, T b) const { return a * b; } +}; + +template +struct OpMax { + __device__ __forceinline__ T operator()(T a, T b) const { return (a > b) ? a : b; } +}; + +template +struct OpMin { + __device__ __forceinline__ T operator()(T a, T b) const { return (a < b) ? a : b; } +}; + +template +__global__ void cail_reduce_kernel(const T *in, T *inout, size_t count) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)blockDim.x * gridDim.x; + Op op; + for (size_t i = idx; i < count; i += stride) { + inout[i] = op(inout[i], in[i]); + } +} + +__global__ void cail_reduce_float4_sum(const float *in, float *inout, size_t count) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)blockDim.x * gridDim.x; + size_t vec_count = count / 4; + const float4 *in4 = (const float4*)in; + float4 *inout4 = (float4*)inout; + for (size_t i = idx; i < vec_count; i += stride) { + float4 a = inout4[i]; + float4 b = in4[i]; + inout4[i] = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + size_t rem_start = vec_count * 4; + for (size_t i = rem_start + idx; i < count; i += stride) { + inout[i] += in[i]; + } +} + +__global__ void cail_reduce_double2_sum(const double *in, double *inout, size_t count) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = (size_t)blockDim.x * gridDim.x; + size_t vec_count = count / 2; + const double2 *in2 = (const double2*)in; + double2 *inout2 = (double2*)inout; + for (size_t i = idx; i < vec_count; i += stride) { + double2 a = inout2[i]; + double2 b = in2[i]; + inout2[i] = make_double2(a.x + b.x, a.y + b.y); + } + size_t rem_start = vec_count * 2; + for (size_t i = rem_start + idx; i < count; i += stride) { + inout[i] += in[i]; + } +} + +static inline bool is_aligned(const void *ptr, size_t alignment) { + return ((uintptr_t)ptr % alignment) == 0; +} + +static inline int grid_for_scalar(size_t count) { + if (count == 0) return 0; + size_t blocks = (count + (BLOCK_SIZE * ELEMENTS_PER_THREAD) - 1) / (BLOCK_SIZE * ELEMENTS_PER_THREAD); + if (blocks > MAX_GRID) blocks = MAX_GRID; + return (int)blocks; +} + +static inline int grid_for_vec(size_t vec_count) { + if (vec_count == 0) return 0; + size_t blocks = (vec_count + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (blocks > MAX_GRID) blocks = MAX_GRID; + return (int)blocks; +} + +template +static void launch_scalar(const void *in, void *inout, size_t count, hipStream_t s) { + int grid = grid_for_scalar(count); + if (grid > 0) { + cail_reduce_kernel<<>>((const T*)in, (T*)inout, count); + } +} + +static void launch_sum_float(const void *in, void *inout, size_t count, hipStream_t s) { + if (count == 0) return; + if ((count % 4) == 0 && is_aligned(in, sizeof(float4)) && is_aligned(inout, sizeof(float4))) { + int grid = grid_for_vec(count / 4); + if (grid > 0) { + cail_reduce_float4_sum<<>>((const float*)in, (float*)inout, count); + return; + } + } + launch_scalar>(in, inout, count, s); +} + +static void launch_sum_double(const void *in, void *inout, size_t count, hipStream_t s) { + if (count == 0) return; + if ((count % 2) == 0 && is_aligned(in, sizeof(double2)) && is_aligned(inout, sizeof(double2))) { + int grid = grid_for_vec(count / 2); + if (grid > 0) { + cail_reduce_double2_sum<<>>((const double*)in, (double*)inout, count); + return; + } + } + launch_scalar>(in, inout, count, s); +} + +#define DECL_LAUNCH_SUM(type, name) \ +static void launch_sum_##name(const void *in, void *inout, size_t count, hipStream_t s) { \ + launch_scalar>(in, inout, count, s); \ +} + +#define DECL_LAUNCH_PROD(type, name) \ +static void launch_prod_##name(const void *in, void *inout, size_t count, hipStream_t s) { \ + launch_scalar>(in, inout, count, s); \ +} + +#define DECL_LAUNCH_MAX(type, name) \ +static void launch_max_##name(const void *in, void *inout, size_t count, hipStream_t s) { \ + launch_scalar>(in, inout, count, s); \ +} + +#define DECL_LAUNCH_MIN(type, name) \ +static void launch_min_##name(const void *in, void *inout, size_t count, hipStream_t s) { \ + launch_scalar>(in, inout, count, s); \ +} + +DECL_LAUNCH_SUM(char, char) +DECL_LAUNCH_SUM(int, int) +DECL_LAUNCH_SUM(long, long) +DECL_LAUNCH_SUM(long long, longlong) +DECL_LAUNCH_SUM(unsigned char, uchar) +DECL_LAUNCH_SUM(unsigned int, uint) +DECL_LAUNCH_SUM(unsigned long, ulong) +DECL_LAUNCH_SUM(unsigned long long, ulonglong) +DECL_LAUNCH_SUM(short, short) +DECL_LAUNCH_SUM(unsigned short, ushort) + +DECL_LAUNCH_PROD(char, char) +DECL_LAUNCH_PROD(short, short) +DECL_LAUNCH_PROD(int, int) +DECL_LAUNCH_PROD(long, long) +DECL_LAUNCH_PROD(float, float) +DECL_LAUNCH_PROD(double, double) +DECL_LAUNCH_PROD(long long, longlong) +DECL_LAUNCH_PROD(unsigned char, uchar) +DECL_LAUNCH_PROD(unsigned short, ushort) +DECL_LAUNCH_PROD(unsigned int, uint) +DECL_LAUNCH_PROD(unsigned long, ulong) +DECL_LAUNCH_PROD(unsigned long long, ulonglong) + +DECL_LAUNCH_MAX(char, char) +DECL_LAUNCH_MAX(short, short) +DECL_LAUNCH_MAX(int, int) +DECL_LAUNCH_MAX(long, long) +DECL_LAUNCH_MAX(float, float) +DECL_LAUNCH_MAX(double, double) +DECL_LAUNCH_MAX(long long, longlong) +DECL_LAUNCH_MAX(unsigned char, uchar) +DECL_LAUNCH_MAX(unsigned short, ushort) +DECL_LAUNCH_MAX(unsigned int, uint) +DECL_LAUNCH_MAX(unsigned long, ulong) +DECL_LAUNCH_MAX(unsigned long long, ulonglong) + +DECL_LAUNCH_MIN(char, char) +DECL_LAUNCH_MIN(short, short) +DECL_LAUNCH_MIN(int, int) +DECL_LAUNCH_MIN(long, long) +DECL_LAUNCH_MIN(float, float) +DECL_LAUNCH_MIN(double, double) +DECL_LAUNCH_MIN(long long, longlong) +DECL_LAUNCH_MIN(unsigned char, uchar) +DECL_LAUNCH_MIN(unsigned short, ushort) +DECL_LAUNCH_MIN(unsigned int, uint) +DECL_LAUNCH_MIN(unsigned long, ulong) +DECL_LAUNCH_MIN(unsigned long long, ulonglong) + +static void init_kernel_table(void) { + for (int op = 0; op < CAIL_NUM_OPS; ++op) { + for (int dt = 0; dt < CAIL_NUM_DTYPES; ++dt) { + kernel_table[op][dt] = 0; + } + } + + kernel_table[CAIL_SUM][CAIL_CHAR] = launch_sum_char; + kernel_table[CAIL_SUM][CAIL_INT] = launch_sum_int; + kernel_table[CAIL_SUM][CAIL_LONG] = launch_sum_long; + kernel_table[CAIL_SUM][CAIL_FLOAT] = launch_sum_float; + kernel_table[CAIL_SUM][CAIL_DOUBLE] = launch_sum_double; + kernel_table[CAIL_SUM][CAIL_LONG_LONG] = launch_sum_longlong; + kernel_table[CAIL_SUM][CAIL_UCHAR] = launch_sum_uchar; + kernel_table[CAIL_SUM][CAIL_UINT] = launch_sum_uint; + kernel_table[CAIL_SUM][CAIL_ULONG] = launch_sum_ulong; + kernel_table[CAIL_SUM][CAIL_ULONGLONG] = launch_sum_ulonglong; + kernel_table[CAIL_SUM][CAIL_SHORT] = launch_sum_short; + kernel_table[CAIL_SUM][CAIL_USHORT] = launch_sum_ushort; + + kernel_table[CAIL_PROD][CAIL_CHAR] = launch_prod_char; + kernel_table[CAIL_PROD][CAIL_SHORT] = launch_prod_short; + kernel_table[CAIL_PROD][CAIL_INT] = launch_prod_int; + kernel_table[CAIL_PROD][CAIL_LONG] = launch_prod_long; + kernel_table[CAIL_PROD][CAIL_FLOAT] = launch_prod_float; + kernel_table[CAIL_PROD][CAIL_DOUBLE] = launch_prod_double; + kernel_table[CAIL_PROD][CAIL_LONG_LONG] = launch_prod_longlong; + kernel_table[CAIL_PROD][CAIL_UCHAR] = launch_prod_uchar; + kernel_table[CAIL_PROD][CAIL_USHORT] = launch_prod_ushort; + kernel_table[CAIL_PROD][CAIL_UINT] = launch_prod_uint; + kernel_table[CAIL_PROD][CAIL_ULONG] = launch_prod_ulong; + kernel_table[CAIL_PROD][CAIL_ULONGLONG] = launch_prod_ulonglong; + + kernel_table[CAIL_MAX][CAIL_CHAR] = launch_max_char; + kernel_table[CAIL_MAX][CAIL_SHORT] = launch_max_short; + kernel_table[CAIL_MAX][CAIL_INT] = launch_max_int; + kernel_table[CAIL_MAX][CAIL_LONG] = launch_max_long; + kernel_table[CAIL_MAX][CAIL_FLOAT] = launch_max_float; + kernel_table[CAIL_MAX][CAIL_DOUBLE] = launch_max_double; + kernel_table[CAIL_MAX][CAIL_LONG_LONG] = launch_max_longlong; + kernel_table[CAIL_MAX][CAIL_UCHAR] = launch_max_uchar; + kernel_table[CAIL_MAX][CAIL_USHORT] = launch_max_ushort; + kernel_table[CAIL_MAX][CAIL_UINT] = launch_max_uint; + kernel_table[CAIL_MAX][CAIL_ULONG] = launch_max_ulong; + kernel_table[CAIL_MAX][CAIL_ULONGLONG] = launch_max_ulonglong; + + kernel_table[CAIL_MIN][CAIL_CHAR] = launch_min_char; + kernel_table[CAIL_MIN][CAIL_SHORT] = launch_min_short; + kernel_table[CAIL_MIN][CAIL_INT] = launch_min_int; + kernel_table[CAIL_MIN][CAIL_LONG] = launch_min_long; + kernel_table[CAIL_MIN][CAIL_FLOAT] = launch_min_float; + kernel_table[CAIL_MIN][CAIL_DOUBLE] = launch_min_double; + kernel_table[CAIL_MIN][CAIL_LONG_LONG] = launch_min_longlong; + kernel_table[CAIL_MIN][CAIL_UCHAR] = launch_min_uchar; + kernel_table[CAIL_MIN][CAIL_USHORT] = launch_min_ushort; + kernel_table[CAIL_MIN][CAIL_UINT] = launch_min_uint; + kernel_table[CAIL_MIN][CAIL_ULONG] = launch_min_ulong; + kernel_table[CAIL_MIN][CAIL_ULONGLONG] = launch_min_ulonglong; +} + +extern "C" int cail_rocm_init(void) { + if (cail_stream != 0) { + return 0; + } + hipError_t err = hipStreamCreate(&cail_stream); + if (err != hipSuccess) { + cail_stream = 0; + return -1; + } + init_kernel_table(); + + /* Allocate a pinned host byte for the PCIe read fence used by + cail_gpu_flush_recv_buf(). */ + if (hipHostMalloc((void**)&flush_host_byte, 1, hipHostMallocDefault) != hipSuccess) + flush_host_byte = NULL; + + return 0; +} + +extern "C" void cail_rocm_finalize(void) { + if (flush_host_byte) { hipHostFree(flush_host_byte); flush_host_byte = NULL; } + if (cail_stream != 0) { + hipStreamDestroy(cail_stream); + cail_stream = 0; + } +} + +extern "C" int cail_rocm_synchronize(void) { + if (cail_stream == 0) return -1; + return (hipStreamSynchronize(cail_stream) == hipSuccess) ? 0 : -1; +} + +/* PCIe read fence: reads the last byte of the receive buffer into pinned + host memory, forcing a D→H transaction that drains any prior posted + writes (NIC RDMA, GDRCopy BAR stores) to the same GPU memory region. */ +extern "C" int cail_gpu_flush_recv_buf(const void *recv_buf, size_t recv_bytes) { + if (!flush_host_byte || !recv_buf || recv_bytes == 0) return 0; + /* A 1-byte D2H read from the receive tail forces PCIe posted writes to drain + before this call returns, making subsequent kernel reads observe fresh data. */ + hipMemcpy(flush_host_byte, (const char*)recv_buf + recv_bytes - 1, + 1, hipMemcpyDeviceToHost); + return 0; +} + +extern "C" int cail_gpu_reduce_local(const void *in, void *inout, size_t count, + cail_datatype_t dtype, cail_op_t op) { + if (op < 0 || op >= CAIL_NUM_OPS || dtype < 0 || dtype >= CAIL_NUM_DTYPES) { + fprintf(stderr, "[cail WARN] gpu_reduce_local: dtype=%d op=%d out of range\n", + (int)dtype, (int)op); + return -1; + } + cail_rocm_kernel_fn fn = kernel_table[op][dtype]; + if (fn == 0) { + fprintf(stderr, "[cail WARN] gpu_reduce_local: no kernel for dtype=%s op=%s\n", + cail_dtype_name(dtype), cail_op_name(op)); + return -1; + } + if (cail_stream == 0) { + fprintf(stderr, "[cail WARN] gpu_reduce_local: HIP stream not initialized\n"); + return -1; + } + fn(in, inout, count, cail_stream); + hipError_t err = hipStreamSynchronize(cail_stream); + if (err != hipSuccess) { + fprintf(stderr, "[cail WARN] gpu_reduce_local: hipStreamSynchronize failed (%s) " + "for dtype=%s op=%s count=%zu\n", + hipGetErrorString(err), cail_dtype_name(dtype), cail_op_name(op), count); + return -1; + } + return 0; +} diff --git a/src/gpu/rocm/cail_rocm_reduce_stub.c b/src/gpu/rocm/cail_rocm_reduce_stub.c deleted file mode 100644 index fb04b49..0000000 --- a/src/gpu/rocm/cail_rocm_reduce_stub.c +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */ - -/* cail_rocm_reduce_stub.c — ROCm/HIP stub implementation. - * Provides the full cail_gpu.h interface as no-ops / error returns. - * This file is compiled but NOT linked into libcail today (the rocm/ - * subdirectory builds libcail_rocm.la, but gpu/Makefile.am does not - * add it to libcail_gpu_la_LIBADD). It exists as a scaffold for a - * future ROCm backend: replace these stubs with real HIP calls and - * wire libcail_rocm.la into the build under a HAVE_ROCM conditional. - */ -#include "../cail_gpu.h" - -int cail_gpu_is_device_pointer(const void *ptr) { (void)ptr; return 0; } -int cail_gpu_reduce_local(const void *in, void *inout, size_t count, - cail_datatype_t dtype, cail_op_t op) -{ (void)in; (void)inout; (void)count; (void)dtype; (void)op; return -1; } -int cail_gpu_malloc(void **ptr, size_t size) { (void)ptr; (void)size; return -1; } -int cail_gpu_free(void *ptr) { (void)ptr; return -1; } -int cail_gpu_memcpy(void *dst, const void *src, size_t size) -{ (void)dst; (void)src; (void)size; return -1; } -int cail_gpu_synchronize(void) { return -1; } -int cail_gpu_init(void) { return -1; } -void cail_gpu_finalize(void) {} -int cail_gpu_flush_recv_buf(const void *recv_buf, size_t recv_bytes) -{ (void)recv_buf; (void)recv_bytes; return 0; } diff --git a/tests/Makefile.am b/tests/Makefile.am index be54b76..7de437e 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -42,4 +42,28 @@ CLEANFILES = test_allreduce_correctness_gpu.o bench_allreduce_gpu.o endif -EXTRA_DIST = test_runner.sh test_allreduce_correctness_gpu.cu bench_allreduce_gpu.cu +if HAVE_ROCM + +SUFFIXES = .cu + +noinst_PROGRAMS += test_allreduce_correctness_gpu bench_allreduce_gpu + +test_allreduce_correctness_gpu_SOURCES = +test_allreduce_correctness_gpu_LDADD = test_allreduce_correctness_gpu.o $(top_builddir)/src/libcail.la $(MPI_LIBS) $(ROCM_LIBS) -lstdc++ -lm +test_allreduce_correctness_gpu_DEPENDENCIES = test_allreduce_correctness_gpu.o + +bench_allreduce_gpu_SOURCES = +bench_allreduce_gpu_LDADD = bench_allreduce_gpu.o $(top_builddir)/src/libcail.la $(MPI_LIBS) $(ROCM_LIBS) -lstdc++ -lm +bench_allreduce_gpu_DEPENDENCIES = bench_allreduce_gpu.o + +test_allreduce_correctness_gpu.o: $(srcdir)/test_allreduce_correctness_gpu.cu + $(HIPCC) $(HIPCCFLAGS) $(ROCM_CFLAGS) $(MPI_CFLAGS) -x hip -I$(top_srcdir)/src/core -c -o $@ $< + +bench_allreduce_gpu.o: $(srcdir)/bench_allreduce_gpu.cu + $(HIPCC) $(HIPCCFLAGS) $(ROCM_CFLAGS) $(MPI_CFLAGS) -x hip -I$(top_srcdir)/src/core -c -o $@ $< + +CLEANFILES = test_allreduce_correctness_gpu.o bench_allreduce_gpu.o + +endif + +EXTRA_DIST = test_runner.sh test_allreduce_correctness_gpu.cu bench_allreduce_gpu.cu test_gpu_compat.h diff --git a/tests/bench_allreduce_gpu.cu b/tests/bench_allreduce_gpu.cu index 2440bb9..daf44d4 100644 --- a/tests/bench_allreduce_gpu.cu +++ b/tests/bench_allreduce_gpu.cu @@ -3,11 +3,11 @@ /* * bench_allreduce_gpu.cu — GPU-buffer benchmark for cail MPI_Allreduce * - * Same as bench_allreduce.c but uses cudaMalloc'd device buffers so + * Same as bench_allreduce.c but uses GPU device buffers so * cail's GPU path is exercised instead of falling back to PMPI. */ #include -#include +#include "test_gpu_compat.h" #include #include #include @@ -15,15 +15,6 @@ #define DEFAULT_WARMUP 10 #define DEFAULT_ITERATIONS 100 -#define CUDA_CHECK(call) do { \ - cudaError_t _e = (call); \ - if (_e != cudaSuccess) { \ - fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \ - cudaGetErrorString(_e)); \ - MPI_Abort(MPI_COMM_WORLD, 1); \ - } \ -} while (0) - int main(int argc, char **argv) { MPI_Init(&argc, &argv); @@ -32,13 +23,13 @@ int main(int argc, char **argv) MPI_Comm_size(MPI_COMM_WORLD, &nprocs); int dev_count = 0; - cudaGetDeviceCount(&dev_count); + gpuGetDeviceCount(&dev_count); if (dev_count == 0) { - if (rank == 0) fprintf(stderr, "No CUDA devices found\n"); + if (rank == 0) fprintf(stderr, "No GPU devices found\n"); MPI_Finalize(); return 1; } - CUDA_CHECK(cudaSetDevice(rank % dev_count)); + GPU_CHECK(gpuSetDevice(rank % dev_count)); size_t min_bytes = 4; size_t max_bytes = 16 * 1024 * 1024; @@ -57,8 +48,8 @@ int main(int argc, char **argv) } float *d_buf; - CUDA_CHECK(cudaMalloc(&d_buf, max_bytes)); - CUDA_CHECK(cudaMemset(d_buf, 0, max_bytes)); + GPU_CHECK(gpuMalloc(&d_buf, max_bytes)); + GPU_CHECK(gpuMemset(d_buf, 0, max_bytes)); for (size_t size = min_bytes; size <= max_bytes; size *= 4) { int count = (int)(size / sizeof(float)); @@ -98,7 +89,7 @@ int main(int argc, char **argv) } } - cudaFree(d_buf); + gpuFree(d_buf); MPI_Finalize(); return 0; } diff --git a/tests/test_allreduce_correctness_gpu.cu b/tests/test_allreduce_correctness_gpu.cu index b8f979c..727324a 100644 --- a/tests/test_allreduce_correctness_gpu.cu +++ b/tests/test_allreduce_correctness_gpu.cu @@ -3,31 +3,26 @@ /* * test_allreduce_correctness_gpu.cu — GPU-buffer correctness tests for cail * - * Mirrors test_allreduce_correctness.c but with CUDA device buffers. - * For a given element count (-c, required) and algorithm (-a, optional), + * Mirrors test_allreduce_correctness.c but with CUDA or ROCm device buffers. + * For a base element count (-c, required) and algorithm (-a, optional), * exercises every supported {datatype x op} combination with both - * separate send/recv buffers and MPI_IN_PLACE. + * separate send/recv buffers and MPI_IN_PLACE across a small count sweep + * that covers edge and remainder paths. * - * 20 datatypes x 4 ops x 2 modes = 160 sub-tests per invocation - * (minus PROD skips for nprocs > 4 to avoid overflow). + * 20 datatypes x 4 ops x 2 modes = 160 sub-tests per count + * (minus PROD skips for nprocs > 4 to avoid overflow), swept over up to + * 6 derived counts: {1, 2, 3, base, base+1, base+3} (deduplicated). */ #include -#include +#include "test_gpu_compat.h" #include #include #include #include #include #include - -#define CUDA_CHECK(call) do { \ - cudaError_t _e = (call); \ - if (_e != cudaSuccess) { \ - fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \ - cudaGetErrorString(_e)); \ - MPI_Abort(MPI_COMM_WORLD, 1); \ - } \ -} while (0) +#include +#include /* ------------------------------------------------------------------ */ /* Type / op descriptors (same 20 types as CPU correctness test) */ @@ -87,11 +82,14 @@ __global__ void fill_elements(char *buf, int count, int elem_size, } } +/* d_val is a caller-owned, preallocated 16-byte device scratch buffer for the + * fill value template. It is allocated once in main() and reused for every + * call: per-call gpuMalloc/gpuFree of this tiny buffer hammers the Open MPI + * rcache VMA interval tree and can wedge collectives (see the buffer-reuse + * note in main()), so it must NOT be allocated inside this hot path. */ static void gpu_fill_buf(void *d_buf, int count, size_t elem_size, - int is_float, int rank) + int is_float, int rank, char *d_val) { - /* Build the value on host, then copy the raw bytes to constant memory - * via a small device buffer. */ char val_bytes[16] = {0}; /* large enough for any MPI type */ if (is_float == 1) { @@ -105,16 +103,12 @@ static void gpu_fill_buf(void *d_buf, int count, size_t elem_size, memcpy(val_bytes, &v, elem_size); } - /* Copy value template to device */ - char *d_val; - CUDA_CHECK(cudaMalloc(&d_val, 16)); - CUDA_CHECK(cudaMemcpy(d_val, val_bytes, 16, cudaMemcpyHostToDevice)); + GPU_CHECK(gpuMemcpy(d_val, val_bytes, 16, gpuMemcpyHostToDevice)); int grid = (count + 255) / 256; - fill_elements<<>>((char *)d_buf, count, (int)elem_size, d_val); - CUDA_CHECK(cudaDeviceSynchronize()); - - cudaFree(d_val); + GPU_KERNEL_LAUNCH(fill_elements, dim3(grid), dim3(256), 0, 0, + (char *)d_buf, count, (int)elem_size, d_val); + GPU_CHECK(gpuDeviceSynchronize()); } /* ------------------------------------------------------------------ */ @@ -161,7 +155,7 @@ static int check_buf(const void *h_buf, int count, const type_info_t *t, static void usage(const char *prog) { fprintf(stderr, "Usage: %s -c count [-a algo] [-h]\n", prog); - fprintf(stderr, " -c count element count (required)\n"); + fprintf(stderr, " -c count base element count for sweep (required)\n"); fprintf(stderr, " -a algo set CAIL_ALGO before MPI_Init (default: auto)\n"); } @@ -171,14 +165,14 @@ static void usage(const char *prog) int main(int argc, char **argv) { - int count = 0; + int base_count = 0; const char *algo_name = NULL; int opt; while ((opt = getopt(argc, argv, "c:a:h")) != -1) { switch (opt) { case 'c': - count = atoi(optarg); + base_count = atoi(optarg); break; case 'a': algo_name = optarg; @@ -191,7 +185,7 @@ int main(int argc, char **argv) } } - if (count <= 0) { + if (base_count <= 0) { fprintf(stderr, "%s: -c count is required and must be > 0\n", argv[0]); usage(argv[0]); return 1; @@ -204,21 +198,21 @@ int main(int argc, char **argv) MPI_Comm_size(MPI_COMM_WORLD, &nprocs); int dev_count = 0; - cudaGetDeviceCount(&dev_count); + gpuGetDeviceCount(&dev_count); if (dev_count == 0) { - if (rank == 0) fprintf(stderr, "No CUDA devices found\n"); + if (rank == 0) fprintf(stderr, "No GPU devices found\n"); MPI_Finalize(); return 1; } - CUDA_CHECK(cudaSetDevice(rank % dev_count)); + GPU_CHECK(gpuSetDevice(rank % dev_count)); if (!algo_name) algo_name = "auto"; int is_pof2 = (nprocs > 0) && ((nprocs & (nprocs - 1)) == 0); if (rank == 0) { printf("=== cail GPU allreduce correctness tests ===\n"); - printf("nprocs=%d GPUs_per_node=%d count=%d algo=%s\n", - nprocs, dev_count, count, algo_name); + printf("nprocs=%d GPUs_per_node=%d base_count=%d algo=%s\n", + nprocs, dev_count, base_count, algo_name); fflush(stdout); } @@ -259,82 +253,140 @@ int main(int argc, char **argv) int g_pass = 0; int g_fail = 0; - /* Pre-allocate GPU and host buffers at the maximum needed size - * (count * largest_type_size). Reusing a single allocation avoids - * hammering the Open MPI rcache VMA interval tree with hundreds of - * cudaMalloc/cudaFree cycles, which triggers a SEGV in - * opal_interval_tree_traverse at certain power-of-2 buffer sizes. */ size_t max_elem_size = 0; for (int ti = 0; ti < ntypes; ti++) if (types[ti].size > max_elem_size) max_elem_size = types[ti].size; - size_t max_bufsize = (size_t)count * max_elem_size; + + /* Keep the sweep small: edge counts + base + remainder-forcing offsets. */ + int candidate_counts[6]; + int ncandidates = 0; + candidate_counts[ncandidates++] = 1; + candidate_counts[ncandidates++] = 2; + candidate_counts[ncandidates++] = 3; + candidate_counts[ncandidates++] = base_count; + if (base_count < INT_MAX) + candidate_counts[ncandidates++] = base_count + 1; + if (base_count <= INT_MAX - 3) + candidate_counts[ncandidates++] = base_count + 3; + + int counts[6]; + int ncounts = 0; + size_t max_count_allowed = SIZE_MAX / max_elem_size; + for (int ci = 0; ci < ncandidates; ci++) { + int c = candidate_counts[ci]; + if (c <= 0) continue; + if ((size_t)c > max_count_allowed) continue; + + int dup = 0; + for (int ei = 0; ei < ncounts; ei++) { + if (counts[ei] == c) { + dup = 1; + break; + } + } + if (!dup) counts[ncounts++] = c; + } + + if (ncounts == 0) { + if (rank == 0) fprintf(stderr, "No valid test counts after sweep filtering\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int max_count = counts[0]; + for (int ci = 1; ci < ncounts; ci++) + if (counts[ci] > max_count) max_count = counts[ci]; + + if (rank == 0) { + printf("count sweep:"); + for (int ci = 0; ci < ncounts; ci++) + printf(" %d", counts[ci]); + printf("\n"); + fflush(stdout); + } + + /* Pre-allocate GPU and host buffers once at the largest sweep count + * (max_count * largest_type_size). Reusing a single allocation avoids + * hammering the Open MPI rcache VMA interval tree with hundreds of + * gpuMalloc/gpuFree cycles, which triggers a SEGV in + * opal_interval_tree_traverse at certain power-of-2 buffer sizes. */ + size_t max_bufsize = (size_t)max_count * max_elem_size; void *d_send, *d_recv, *d_inplace; - CUDA_CHECK(cudaMalloc(&d_send, max_bufsize)); - CUDA_CHECK(cudaMalloc(&d_recv, max_bufsize)); - CUDA_CHECK(cudaMalloc(&d_inplace, max_bufsize)); + GPU_CHECK(gpuMalloc(&d_send, max_bufsize)); + GPU_CHECK(gpuMalloc(&d_recv, max_bufsize)); + GPU_CHECK(gpuMalloc(&d_inplace, max_bufsize)); + /* One-time 16-byte scratch for the fill-value template, reused by every + * gpu_fill_buf() call. Allocated here (not per-call) for the same + * rcache-stability reason as the buffers above. */ + char *d_val; + GPU_CHECK(gpuMalloc(&d_val, 16)); void *h_check = malloc(max_bufsize); if (!h_check) { fprintf(stderr, "malloc failed\n"); MPI_Abort(MPI_COMM_WORLD, 1); } - for (int ti = 0; ti < ntypes; ti++) { - for (int oi = 0; oi < nops; oi++) { - /* PROD overflows quickly — skip for large process counts. */ - if (ops[oi].mpi_op == MPI_PROD && nprocs > 4) continue; - - double expected = compute_expected(ops[oi].mpi_op, nprocs); - size_t bufsize = (size_t)count * types[ti].size; - - /* --- Separate send/recv buffers --- */ - CUDA_CHECK(cudaMemset(d_recv, 0, bufsize)); - gpu_fill_buf(d_send, count, types[ti].size, - types[ti].is_float, rank); - - MPI_Allreduce(d_send, d_recv, count, types[ti].mpi_type, - ops[oi].mpi_op, MPI_COMM_WORLD); - - CUDA_CHECK(cudaMemcpy(h_check, d_recv, bufsize, - cudaMemcpyDeviceToHost)); - - { - int pass = check_buf(h_check, count, &types[ti], expected); - if (rank == 0) { - printf("%s: algo=%s %s x %s count=%d np=%d pof2=%s\n", - pass ? "PASS" : "FAIL", algo_name, - types[ti].name, ops[oi].name, - count, nprocs, is_pof2 ? "yes" : "no"); - fflush(stdout); + for (int ci = 0; ci < ncounts; ci++) { + int count = counts[ci]; + + for (int ti = 0; ti < ntypes; ti++) { + for (int oi = 0; oi < nops; oi++) { + /* PROD overflows quickly — skip for large process counts. */ + if (ops[oi].mpi_op == MPI_PROD && nprocs > 4) continue; + + double expected = compute_expected(ops[oi].mpi_op, nprocs); + size_t bufsize = (size_t)count * types[ti].size; + + /* --- Separate send/recv buffers --- */ + GPU_CHECK(gpuMemset(d_recv, 0, bufsize)); + gpu_fill_buf(d_send, count, types[ti].size, + types[ti].is_float, rank, d_val); + + MPI_Allreduce(d_send, d_recv, count, types[ti].mpi_type, + ops[oi].mpi_op, MPI_COMM_WORLD); + + GPU_CHECK(gpuMemcpy(h_check, d_recv, bufsize, + gpuMemcpyDeviceToHost)); + + { + int pass = check_buf(h_check, count, &types[ti], expected); + if (rank == 0) { + printf("%s: algo=%s %s x %s count=%d np=%d pof2=%s\n", + pass ? "PASS" : "FAIL", algo_name, + types[ti].name, ops[oi].name, + count, nprocs, is_pof2 ? "yes" : "no"); + fflush(stdout); + } + if (pass) g_pass++; else g_fail++; } - if (pass) g_pass++; else g_fail++; - } - - /* --- MPI_IN_PLACE --- */ - gpu_fill_buf(d_inplace, count, types[ti].size, - types[ti].is_float, rank); - - MPI_Allreduce(MPI_IN_PLACE, d_inplace, count, types[ti].mpi_type, - ops[oi].mpi_op, MPI_COMM_WORLD); - - CUDA_CHECK(cudaMemcpy(h_check, d_inplace, bufsize, - cudaMemcpyDeviceToHost)); - { - int pass = check_buf(h_check, count, &types[ti], expected); - if (rank == 0) { - printf("%s: algo=%s MPI_IN_PLACE %s x %s count=%d np=%d pof2=%s\n", - pass ? "PASS" : "FAIL", algo_name, - types[ti].name, ops[oi].name, - count, nprocs, is_pof2 ? "yes" : "no"); - fflush(stdout); + /* --- MPI_IN_PLACE --- */ + gpu_fill_buf(d_inplace, count, types[ti].size, + types[ti].is_float, rank, d_val); + + MPI_Allreduce(MPI_IN_PLACE, d_inplace, count, types[ti].mpi_type, + ops[oi].mpi_op, MPI_COMM_WORLD); + + GPU_CHECK(gpuMemcpy(h_check, d_inplace, bufsize, + gpuMemcpyDeviceToHost)); + + { + int pass = check_buf(h_check, count, &types[ti], expected); + if (rank == 0) { + printf("%s: algo=%s MPI_IN_PLACE %s x %s count=%d np=%d pof2=%s\n", + pass ? "PASS" : "FAIL", algo_name, + types[ti].name, ops[oi].name, + count, nprocs, is_pof2 ? "yes" : "no"); + fflush(stdout); + } + if (pass) g_pass++; else g_fail++; } - if (pass) g_pass++; else g_fail++; } } } free(h_check); - cudaFree(d_send); - cudaFree(d_recv); - cudaFree(d_inplace); + gpuFree(d_send); + gpuFree(d_recv); + gpuFree(d_inplace); + gpuFree(d_val); MPI_Barrier(MPI_COMM_WORLD); diff --git a/tests/test_gpu_compat.h b/tests/test_gpu_compat.h new file mode 100644 index 0000000..165e6df --- /dev/null +++ b/tests/test_gpu_compat.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2026 Cornelis Networks. All rights reserved. */ + +#ifndef TEST_GPU_COMPAT_H +#define TEST_GPU_COMPAT_H + +#ifdef __HIP_PLATFORM_AMD__ +#include +typedef hipError_t gpuError_t; +#define gpuSuccess hipSuccess +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemcpy hipMemcpy +#define gpuMemset hipMemset +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuSetDevice hipSetDevice +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuGetErrorString hipGetErrorString +#define GPU_KERNEL_LAUNCH(kernel, grid, block, shmem, stream, ...) \ + hipLaunchKernelGGL(kernel, grid, block, shmem, stream, __VA_ARGS__) +#define GPU_ERROR_PREFIX "HIP" +#else +#include +typedef cudaError_t gpuError_t; +#define gpuSuccess cudaSuccess +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemcpy cudaMemcpy +#define gpuMemset cudaMemset +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuSetDevice cudaSetDevice +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuGetErrorString cudaGetErrorString +#define GPU_KERNEL_LAUNCH(kernel, grid, block, shmem, stream, ...) \ + kernel<<>>(__VA_ARGS__) +#define GPU_ERROR_PREFIX "CUDA" +#endif + +#define GPU_CHECK(call) do { \ + gpuError_t _e = (call); \ + if (_e != gpuSuccess) { \ + fprintf(stderr, "%s error %s:%d: %s\n", GPU_ERROR_PREFIX, \ + __FILE__, __LINE__, gpuGetErrorString(_e)); \ + MPI_Abort(MPI_COMM_WORLD, 1); \ + } \ +} while (0) + +#endif /* TEST_GPU_COMPAT_H */