From 53c264b194e9cb19083acb527afe6e38cd68b2ed Mon Sep 17 00:00:00 2001
From: Mike Wilkins
Date: Fri, 12 Jun 2026 17:10:57 -0400
Subject: [PATCH] Flush GPU recv buffer after non-pof2 unfold receives

The Phase-3 unfold PMPI_Recv on even folded ranks lands the final result
in recvbuf via GPU-aware MPI (posted PCIe writes), but unlike every other
receive site no cail_gpu_flush_recv_buf follows. The application may
launch a kernel reading recvbuf immediately after MPI_Allreduce returns
and observe stale device memory. Add the missing flush after the unfold
receive in recursive doubling and Rabenseifner.
---
 src/coll/allreduce/cail_allreduce_rabenseifner.c       | 4 ++++
 src/coll/allreduce/cail_allreduce_recursive_doubling.c | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/src/coll/allreduce/cail_allreduce_rabenseifner.c b/src/coll/allreduce/cail_allreduce_rabenseifner.c
index 9a1dc32..f35016e 100644
--- a/src/coll/allreduce/cail_allreduce_rabenseifner.c
+++ b/src/coll/allreduce/cail_allreduce_rabenseifner.c
@@ -259,6 +259,10 @@ int cail_allreduce_rabenseifner(const void *sendbuf, void *recvbuf, int count,
             rc = PMPI_Send(recvbuf, count, datatype, rank - 1, 0, comm);
         } else {
             rc = PMPI_Recv(recvbuf, count, datatype, rank + 1, 0, comm, MPI_STATUS_IGNORE);
+            if (rc == MPI_SUCCESS) {
+                /* Flush the GPU-aware receive before the caller reads recvbuf. */
+                cail_gpu_flush_recv_buf(recvbuf, bufsize);
+            }
         }
     }
 
diff --git a/src/coll/allreduce/cail_allreduce_recursive_doubling.c b/src/coll/allreduce/cail_allreduce_recursive_doubling.c
index c4766ca..e028126 100644
--- a/src/coll/allreduce/cail_allreduce_recursive_doubling.c
+++ b/src/coll/allreduce/cail_allreduce_recursive_doubling.c
@@ -130,6 +130,8 @@ int cail_allreduce_recursive_doubling(const void *sendbuf, void *recvbuf,
         if ((rank % 2) == 0) {
             rc = PMPI_Recv(recvbuf, count, datatype, rank + 1, 0, comm, MPI_STATUS_IGNORE);
             if (rc != MPI_SUCCESS) goto cleanup;
+            /* Flush the GPU-aware receive before the caller reads recvbuf. */
+            cail_gpu_flush_recv_buf(recvbuf, bufsize);
         } else {
             rc = PMPI_Send(recvbuf, count, datatype, rank - 1, 0, comm);
             if (rc != MPI_SUCCESS) goto cleanup;