diff --git a/src/coll/allreduce/cail_allreduce_rabenseifner.c b/src/coll/allreduce/cail_allreduce_rabenseifner.c index 9a1dc32..f35016e 100644 --- a/src/coll/allreduce/cail_allreduce_rabenseifner.c +++ b/src/coll/allreduce/cail_allreduce_rabenseifner.c @@ -259,6 +259,10 @@ int cail_allreduce_rabenseifner(const void *sendbuf, void *recvbuf, int count, rc = PMPI_Send(recvbuf, count, datatype, rank - 1, 0, comm); } else { rc = PMPI_Recv(recvbuf, count, datatype, rank + 1, 0, comm, MPI_STATUS_IGNORE); + if (rc == MPI_SUCCESS) { + /* Flush the GPU-aware receive before the caller reads recvbuf. */ + cail_gpu_flush_recv_buf(recvbuf, bufsize); + } } } diff --git a/src/coll/allreduce/cail_allreduce_recursive_doubling.c b/src/coll/allreduce/cail_allreduce_recursive_doubling.c index c4766ca..e028126 100644 --- a/src/coll/allreduce/cail_allreduce_recursive_doubling.c +++ b/src/coll/allreduce/cail_allreduce_recursive_doubling.c @@ -130,6 +130,8 @@ int cail_allreduce_recursive_doubling(const void *sendbuf, void *recvbuf, if ((rank % 2) == 0) { rc = PMPI_Recv(recvbuf, count, datatype, rank + 1, 0, comm, MPI_STATUS_IGNORE); if (rc != MPI_SUCCESS) goto cleanup; + /* Flush the GPU-aware receive before the caller reads recvbuf. */ + cail_gpu_flush_recv_buf(recvbuf, bufsize); } else { rc = PMPI_Send(recvbuf, count, datatype, rank - 1, 0, comm); if (rc != MPI_SUCCESS) goto cleanup;