From 5d806550268cf471c5f45aaf0ceef023c37c78c3 Mon Sep 17 00:00:00 2001 From: Bob Cernohous Date: Wed, 4 Mar 2026 17:54:35 -0600 Subject: [PATCH 1/8] prov/opx: Drop mm lock before memory operations Drop the lock so memory monitor doesn't risk a deadlock. Signed-off-by: Bob Cernohous (cherry picked from commit ec4b693eca34ad9dc0aea7fdc5a277a28125e1f5) --- prov/opx/src/fi_opx_tid_cache.c | 36 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/prov/opx/src/fi_opx_tid_cache.c b/prov/opx/src/fi_opx_tid_cache.c index 7f83fe92d26..4b09d9bd205 100644 --- a/prov/opx/src/fi_opx_tid_cache.c +++ b/prov/opx/src/fi_opx_tid_cache.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * Copyright (C) 2022-2024 Cornelis Networks. + * Copyright (C) 2022-2026 Cornelis Networks. * * Copyright (c) 2016-2017 Cray Inc. All rights reserved. * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. @@ -633,7 +633,11 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf } OPX_DEBUG_ENTRY(info); - /* drop the mm lock across alloc/register */ + /* drop the mm lock across alloc/register and rbnode pre-allocation + * to avoid deadlock: malloc() can trigger madvise() which OPAL + * intercepts, calling back into ofi_import_monitor_notify() which + * needs mm_lock. + */ pthread_mutex_unlock(&mm_lock); struct opx_mr_tid_info *tid_info; int ret = opx_mr_entry_alloc_init(cache, info, opx_ep, entry, &tid_info); @@ -648,18 +652,22 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf ret = opx_register_tid_region_retryable(cache, (uint64_t) info->iov.iov_base, MIN(info->iov.iov_len, register_max_len), info->iface, info->device, opx_ep, tid_info); - - /* re-acquire mm_lock */ - pthread_mutex_lock(&mm_lock); - if (ret) { /* Failed, tid_info->ninfo will be zero */ - FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "opx_register_tid_region failed with return code %d (%s), FREE node %p\n", ret, strerror(ret), - (*entry)->node); + FI_DBG(fi_opx_global.prov, FI_LOG_MR, "opx_register_tid_region failed with return code %d (%s)\n", ret, + strerror(ret)); goto error; } + struct ofi_rbnode *rbnode = ofi_rbnode_new(&cache->tree); + if (OFI_UNLIKELY(!rbnode)) { + FI_DBG(fi_opx_global.prov, FI_LOG_MR, "Failed to alloc rbnode, return -FI_ENOMEM\n"); + goto error; + } + + /* re-acquire mm_lock */ + pthread_mutex_lock(&mm_lock); + FI_DBG(fi_opx_global.prov, FI_LOG_MR, "NEW vaddr [%#lx - %#lx] length %lu, tid vaddr [%#lx - %#lx] , tid length %lu\n", (uint64_t) info->iov.iov_base, (uint64_t) info->iov.iov_base + (uint64_t) info->iov.iov_len, @@ -669,11 +677,12 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf (*entry)->info.iov.iov_base = (void *) tid_info->tid_vaddr; (*entry)->info.iov.iov_len = tid_info->tid_length; - ret = ofi_rbmap_insert(&cache->tree, (void *) &(*entry)->info, (void *) *entry, &(*entry)->node); + ret = ofi_rbmap_insert_at(&cache->tree, (void *) &(*entry)->info, (void *) *entry, &(*entry)->node, rbnode); if (OFI_UNLIKELY(ret)) { FI_DBG(fi_opx_global.prov, FI_LOG_MR, "ofi_rbmap_insert returned %d (%s) %p\n", ret, strerror(ret), (*entry)->node); + ofi_rbnode_del(&cache->tree, rbnode); goto error; } cache->cached_cnt++; @@ -682,9 +691,8 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf ret = ofi_monitor_subscribe(monitor, info->iov.iov_base, info->iov.iov_len, &(*entry)->hmem_info); if (OFI_UNLIKELY(ret)) { opx_mr_uncache_entry_storage(cache, *entry); - cache->uncached_cnt++; - cache->uncached_size += (*entry)->info.iov.iov_len; - FI_WARN(fi_opx_global.prov, FI_LOG_MR, "MONITOR SUBSCRIBE FAILED UNCACHED ERROR\n"); + FI_WARN(fi_opx_global.prov, FI_LOG_MR, "MONITOR SUBSCRIBE FAILED UNCACHED ERROR %d (%s)\n", ret, + strerror(ret)); goto error; } OPX_DEBUG_EXIT((*entry), OPX_TID_CACHE_ENTRY_FOUND); @@ -696,7 +704,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf (char *) info->iov.iov_base + info->iov.iov_len, info->iov.iov_len, info->iov.iov_len); tid_info->npairs = 0; /* error == no tid pairs */ OPX_DEBUG_EXIT((*entry), OPX_TID_CACHE_ENTRY_NOT_FOUND); - return ret; // TODO - handle case for free + return ret; // entry freed by caller } __OPX_FORCE_INLINE__ From 9119d0bfca3987194a2bc97fc924c11482158d2b Mon Sep 17 00:00:00 2001 From: Armando Vinciguerra Date: Wed, 10 Jun 2026 21:15:29 -0400 Subject: [PATCH 2/8] util: Drop mm lock before memory operations Drop the memory-monitor lock before performing memory operations and pre-allocate the rbnode outside the lock, so the cache insert path no longer holds mm_lock across allocation. Also avoid freeing a still-valid MR entry when the cache is full or a subscribe fails. As part of this, the ofi_rbnode_alloc and ofi_rbnode_free helpers are now public (the thin ofi_rbnode_new/ ofi_rbnode_del wrappers are removed) and the single-statement if is de-braced, per ofiwg maintainer review feedback. Signed-off-by: Armando Vinciguerra --- include/ofi_tree.h | 9 ++++++++- prov/opx/src/fi_opx_tid_cache.c | 4 ++-- prov/util/src/util_mr_cache.c | 31 +++++++++++++++++++++++++++---- src/tree.c | 33 ++++++++++++++++++++++++++------- 4 files changed, 63 insertions(+), 14 deletions(-) diff --git a/include/ofi_tree.h b/include/ofi_tree.h index 1d11a5e48f9..133f74ddca1 100644 --- a/include/ofi_tree.h +++ b/include/ofi_tree.h @@ -1,7 +1,8 @@ /* * Copyright (c) 2015 Cray Inc. All rights reserved. * Copyright (c) 2018 Intel Corp, Inc. All rights reserved. - * + * Copyright (C) 2026 Cornelis Networks. + * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file @@ -92,9 +93,15 @@ struct ofi_rbnode *ofi_rbmap_search(struct ofi_rbmap *map, void *key, int (*compare)(struct ofi_rbmap *map, void *key, void *data)); int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, struct ofi_rbnode **node); +int ofi_rbmap_insert_at(struct ofi_rbmap *map, void *key, void *data, + struct ofi_rbnode **ret_node, + struct ofi_rbnode *prealloc); void ofi_rbmap_delete(struct ofi_rbmap *map, struct ofi_rbnode *node); int ofi_rbmap_find_delete(struct ofi_rbmap *map, void *key); int ofi_rbmap_empty(struct ofi_rbmap *map); +struct ofi_rbnode *ofi_rbnode_alloc(struct ofi_rbmap *map); +void ofi_rbnode_free(struct ofi_rbmap *map, struct ofi_rbnode *node); + #endif /* OFI_TREE_H_ */ diff --git a/prov/opx/src/fi_opx_tid_cache.c b/prov/opx/src/fi_opx_tid_cache.c index 4b09d9bd205..bd28c984497 100644 --- a/prov/opx/src/fi_opx_tid_cache.c +++ b/prov/opx/src/fi_opx_tid_cache.c @@ -659,7 +659,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf goto error; } - struct ofi_rbnode *rbnode = ofi_rbnode_new(&cache->tree); + struct ofi_rbnode *rbnode = ofi_rbnode_alloc(&cache->tree); if (OFI_UNLIKELY(!rbnode)) { FI_DBG(fi_opx_global.prov, FI_LOG_MR, "Failed to alloc rbnode, return -FI_ENOMEM\n"); goto error; @@ -682,7 +682,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, const struct ofi_mr_info *inf if (OFI_UNLIKELY(ret)) { FI_DBG(fi_opx_global.prov, FI_LOG_MR, "ofi_rbmap_insert returned %d (%s) %p\n", ret, strerror(ret), (*entry)->node); - ofi_rbnode_del(&cache->tree, rbnode); + ofi_rbnode_free(&cache->tree, rbnode); goto error; } cache->cached_cnt++; diff --git a/prov/util/src/util_mr_cache.c b/prov/util/src/util_mr_cache.c index 2f0af31fc4c..d44dfdb9f0b 100644 --- a/prov/util/src/util_mr_cache.c +++ b/prov/util/src/util_mr_cache.c @@ -4,6 +4,7 @@ * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved. * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2026 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -268,6 +269,12 @@ void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) * new entry, then check under lock that a conflict with another thread * hasn't occurred. If a conflict occurred, we return -EAGAIN and * restart the entire operation. + * + * The rbnode for the tree insertion is also pre-allocated outside the + * lock for the same reason: ofi_rbmap_insert() may call malloc() which + * can trigger madvise() via the allocator, and an intercepting memory + * monitor (e.g. OPAL) would call back into ofi_import_monitor_notify() + * which needs mm_lock -- deadlocking the thread. */ static int util_mr_cache_create(struct ofi_mr_cache *cache, struct ofi_mr_info *info, @@ -300,6 +307,12 @@ util_mr_cache_create(struct ofi_mr_cache *cache, struct ofi_mr_info *info, assert(ofi_iov_within(&(*info).iov, &(*entry)->info.iov)); *info = (*entry)->info; + struct ofi_rbnode *rbnode = ofi_rbnode_alloc(&cache->tree); + if (OFI_UNLIKELY(!rbnode)) { + ret = -FI_ENOMEM; + goto free; + } + pthread_mutex_lock(&mm_lock); cur = ofi_mr_rbt_find(&cache->tree, info); if (cur) { @@ -311,11 +324,14 @@ util_mr_cache_create(struct ofi_mr_cache *cache, struct ofi_mr_info *info, cache->uncached_cnt++; cache->uncached_size += info->iov.iov_len; } else { - if (ofi_rbmap_insert(&cache->tree, (void *) &(*entry)->info, - (void *) *entry, &(*entry)->node)) { - ret = -FI_ENOMEM; + ret = ofi_rbmap_insert_at(&cache->tree, + (void *) &(*entry)->info, + (void *) *entry, &(*entry)->node, + rbnode); + if (ret) goto unlock; - } + rbnode = NULL; /* now owned by the tree */ + cache->cached_cnt++; cache->cached_size += info->iov.iov_len; @@ -328,10 +344,17 @@ util_mr_cache_create(struct ofi_mr_cache *cache, struct ofi_mr_info *info, cache->uncached_size += (*entry)->info.iov.iov_len; } } + /* ofi_rbnode_free() only returns the node to the tree free list (never + * allocates), so it is safe to call while holding mm_lock. + */ + if (rbnode) + ofi_rbnode_free(&cache->tree, rbnode); pthread_mutex_unlock(&mm_lock); return 0; unlock: + if (rbnode) + ofi_rbnode_free(&cache->tree, rbnode); pthread_mutex_unlock(&mm_lock); free: util_mr_free_entry(cache, *entry); diff --git a/src/tree.c b/src/tree.c index c66d3662441..23bd10979d5 100644 --- a/src/tree.c +++ b/src/tree.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015 Cray Inc. All rights reserved. * Copyright (c) 2018 Intel Corp, Inc. All rights reserved. + * Copyright (C) 2026 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -51,7 +52,7 @@ #include -static struct ofi_rbnode *ofi_rbnode_alloc(struct ofi_rbmap *map) +struct ofi_rbnode *ofi_rbnode_alloc(struct ofi_rbmap *map) { struct ofi_rbnode *node; @@ -63,7 +64,7 @@ static struct ofi_rbnode *ofi_rbnode_alloc(struct ofi_rbmap *map) return node; } -static void ofi_rbnode_free(struct ofi_rbmap *map, struct ofi_rbnode *node) +void ofi_rbnode_free(struct ofi_rbmap *map, struct ofi_rbnode *node) { node->right = map->free_list ? map->free_list : NULL; map->free_list = node; @@ -231,8 +232,9 @@ ofi_insert_rebalance(struct ofi_rbmap *map, struct ofi_rbnode *x) map->root->color = BLACK; } -int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, - struct ofi_rbnode **ret_node) +static inline int ofi_rbmap_insert_common(struct ofi_rbmap *map, void *key, void *data, + struct ofi_rbnode **ret_node, + struct ofi_rbnode *prealloc) { struct ofi_rbnode *current, *parent, *node; int ret; @@ -252,9 +254,13 @@ int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, current = (ret < 0) ? current->left : current->right; } - node = ofi_rbnode_alloc(map); - if (!node) - return -FI_ENOMEM; + if (prealloc) { + node = prealloc; + } else { + node = ofi_rbnode_alloc(map); + if (!node) + return -FI_ENOMEM; + } node->parent = parent; node->left = &map->sentinel; @@ -277,6 +283,19 @@ int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, return 0; } +int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, + struct ofi_rbnode **ret_node) +{ + return ofi_rbmap_insert_common(map, key, data, ret_node, NULL); +} + +int ofi_rbmap_insert_at(struct ofi_rbmap *map, void *key, void *data, + struct ofi_rbnode **ret_node, + struct ofi_rbnode *prealloc) +{ + return ofi_rbmap_insert_common(map, key, data, ret_node, prealloc); +} + static void ofi_delete_rebalance(struct ofi_rbmap *map, struct ofi_rbnode *node) { struct ofi_rbnode *w; From 8ad41df80bc6d934bab7b378bf46729cda93ea9f Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Tue, 19 May 2026 09:40:30 -0500 Subject: [PATCH 3/8] prov/opx: Handle MP eager FI_CLAIM receives FI_PEEK | FI_CLAIM removes the unexpected FIRST packet from the UE queue and stores it for a later FI_CLAIM receive. That receive must perform the same multi-packet eager bookkeeping as the normal unexpected-match path so remaining NTH packets can complete the message. Process pending MP eager UE packets after completing the claimed FIRST packet and queue the receive until all fragments arrive. Signed-off-by: Ken Raffenetti --- prov/opx/src/fi_opx_ep.c | 43 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 4cec0660ecf..2b133dbd622 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -3845,7 +3845,9 @@ fi_opx_ep_rx_process_context_noinline(struct fi_opx_ep *opx_ep, const uint64_t s struct fi_context *op_context = (struct fi_context *) context->err_entry.op_context; struct fi_opx_hfi1_ue_packet *claimed_pkt = op_context->internal[0]; - const unsigned is_shm = claimed_pkt->is_shm; + const unsigned is_shm = claimed_pkt->is_shm; + uint8_t is_mp_eager = (FI_OPX_HFI_BTH_OPCODE_BASE_OPCODE(claimed_pkt->hdr.bth.opcode) == + FI_OPX_HFI_BTH_OPCODE_MSG_MP_EAGER_FIRST); opx_ep_complete_receive_operation( ep, &claimed_pkt->hdr, (union fi_opx_hfi1_packet_payload *) &claimed_pkt->payload, @@ -3853,6 +3855,45 @@ fi_opx_ep_rx_process_context_noinline(struct fi_opx_ep *opx_ep, const uint64_t s is_shm, rx_op_flags & (FI_OPX_CQ_CONTEXT_HMEM | FI_OPX_CQ_CONTEXT_DMABUF_HMEM), lock_required, reliability, hfi1_type); + if (is_mp_eager) { + /* + * A claimed multi-packet eager message starts from an unexpected FIRST + * packet, just like fi_opx_ep_process_context_match_ue_packets(). The + * FIRST packet initializes context->byte_counter, but the remaining NTH + * packets are matched through mp_egr_queue. If we skip that bookkeeping + * here, FI_CLAIM receives can wait forever after MPI_Mprobe claims the + * FIRST packet of a multi-packet eager message. + */ + opx_lid_t slid; + if (hfi1_type & (OPX_HFI1_WFR | OPX_HFI1_MIXED_9B)) { + slid = (opx_lid_t) __be16_to_cpu24((__be16) claimed_pkt->hdr.lrh_9B.slid); + } else { + slid = (opx_lid_t) __le24_to_cpu((__le24) (claimed_pkt->hdr.lrh_16B.slid20 << 20) | + (claimed_pkt->hdr.lrh_16B.slid)); + } + + const uint64_t mp_egr_id = + OPX_GET_MP_EGR_ID(claimed_pkt->hdr.reliability.psn, slid, claimed_pkt->subctxt_rx); + + fi_opx_ep_rx_process_pending_mp_eager_ue(ep, context, mp_egr_id, is_shm, lock_required, + reliability, hfi1_type); + + if (context->byte_counter) { + context->mp_egr_id = mp_egr_id; + slist_insert_tail((struct slist_entry *) context, &opx_ep->rx->mp_egr_queue.mq); + } else { + FI_OPX_DEBUG_COUNTERS_INC( + opx_ep->debug_counters.mp_eager.recv_completed_process_context); + + context->next = NULL; + if (OFI_UNLIKELY(context->err_entry.err == FI_ETRUNC)) { + slist_insert_tail((struct slist_entry *) context, opx_ep->rx->cq_err_ptr); + } else { + fi_opx_enqueue_completed(opx_ep->rx->cq_completed_ptr, context, lock_required); + } + } + } + /* ... and prepend the claimed uepkt to the ue free list. claimed_pkt->next should have been set to NULL at the time we stored it in context->claim */ From be82f1394a2644684fe98729facb3a1212345520 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Tue, 26 May 2026 12:31:38 -0500 Subject: [PATCH 4/8] prov/opx: Reject incompatible MR registrations when HFISVC is enabled HFISVC pins memory regions eagerly at registration time. Two inputs that the upper libfabric layer accepts but HFISVC cannot service were producing kernel-side failures or silent truncation: 1. iov_base == NULL: hfi1_mem_region_pin returns -EFAULT, which surfaces as a crash from bulksvc_user.c:user_mr_record_create_ pinned_and_insert(). 2. iov_len > UINT32_MAX: the HFISVC cmd_mr_open API takes a uint32_t length, so larger sizes are silently truncated and only the low 4 GiB of the requested range is pinned. Reject both up front with FI_EOPNOTSUPP when use_hfisvc is set. Signed-off-by: Ken Raffenetti --- prov/opx/src/fi_opx_mr.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/prov/opx/src/fi_opx_mr.c b/prov/opx/src/fi_opx_mr.c index 2449cbe782e..e3c01bba14b 100644 --- a/prov/opx/src/fi_opx_mr.c +++ b/prov/opx/src/fi_opx_mr.c @@ -183,6 +183,24 @@ static inline int fi_opx_mr_reg_internal(struct fid *fid, const struct iovec *io return -errno; } +#if HAVE_HFISVC + /* HFISVC pins the memory region eagerly at registration time via the + * kernel hfi1_mem_region_pin path. A NULL base address cannot be pinned + * (the kernel returns EFAULT), and the HFISVC cmd_mr_open API truncates + * its length argument to 32 bits, so any length exceeding UINT32_MAX + * would silently register only the low 4 GiB of the requested range. + * Reject these cases up front with a libfabric-level error rather than + * letting them propagate to a kernel-side failure or a silent + * truncation. TODO: document this limit along with HFISVC */ + if (opx_domain->use_hfisvc && (!iov->iov_base || iov->iov_len > UINT32_MAX)) { + FI_WARN(fi_opx_global.prov, FI_LOG_MR, + "HFISVC cannot register MR with iov_base=%p iov_len=%zu (NULL base or len > UINT32_MAX is unsupported by HFISVC pinning)\n", + iov->iov_base, iov->iov_len); + errno = FI_EOPNOTSUPP; + return -errno; + } +#endif + struct fi_opx_mr *opx_mr; __attribute__((__unused__)) uint64_t hmem_device = 0UL; __attribute__((__unused__)) enum fi_hmem_iface hmem_iface = FI_HMEM_SYSTEM; From b88afc0555245757ae2f1358870b0fb06fc529f6 Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Thu, 28 May 2026 08:33:45 -0400 Subject: [PATCH 5/8] prov/opx: Document HMEM-dependent FI_OPX_RZV_MIN_PAYLOAD_BYTES defaults The default value of FI_OPX_RZV_MIN_PAYLOAD_BYTES is selected at build time based on the HMEM backend OPX was configured with (see OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT in prov/opx/include/rdma/opx/fi_opx_hfi1.h): - 4096 for CUDA-enabled builds - 8192 for AMD ROCR-enabled builds - 16385 for host-only builds (which effectively keeps payloads that fit in multi-packet eager off the rendezvous path) Previously the man page only listed "Defaults to 16385", which is correct only for host-only builds and gave no indication that GPU-enabled builds use a much lower rendezvous threshold. Spell the three defaults out so users tuning RZV behavior on CUDA/ROCR systems do not have to read the source. Documentation only; no functional change. Signed-off-by: Lindsay Reiser Co-authored-by: Ken Raffenetti (cherry picked from commit 8611ecb763e34d41ec6eb87a5cea70b98be15798) --- man/fi_opx.7.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md index db68290f07b..2e1c651f3fe 100644 --- a/man/fi_opx.7.md +++ b/man/fi_opx.7.md @@ -264,7 +264,15 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL. *FI_OPX_RZV_MIN_PAYLOAD_BYTES* : Integer. The minimum length in bytes where rendezvous will be used. For messages smaller than this threshold, the send will first try to be completed using eager or multi-packet eager. - Value must be between 64 and 65536. Defaults to 16385. + Value must be between 64 and 65536. + The default, which applies to all memory types, is selected at build time based on the device-memory (HMEM) backend + that OPX was configured with: + + - 4096 — HMEM builds with CUDA support + - 8192 — HMEM builds with AMD ROCR support + - 16385 — host-only builds (one byte above the multi-packet eager maximum), + which effectively disables rendezvous for any payload that fits in + multi-packet eager. *FI_OPX_MP_EAGER_DISABLE* : Boolean (1/0, on/off, true/false, yes/no). Disables multi-packet eager. Defaults to 0. From 76fdc724ce8d0ae4fbc70950ac42be9ba207a45b Mon Sep 17 00:00:00 2001 From: Andriy Kot Date: Thu, 28 May 2026 11:14:02 -0700 Subject: [PATCH 6/8] prov/opx: Make OPENED MR notify non-owning for rzv completion Keep OPENED notify as observation-only for STL-77956 path by removing byte_counter decrement and rzv_comp free from MR notify handling. ICQ completion remains the owner for decrement/finalization. Signed-off-by: Andriy Kot (cherry picked from commit c8baf82d6446d91eef698305189c053d96c6d303) --- prov/opx/include/rdma/opx/opx_hfisvc_poll.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/prov/opx/include/rdma/opx/opx_hfisvc_poll.h b/prov/opx/include/rdma/opx/opx_hfisvc_poll.h index 54042cba80e..3ec2479108f 100644 --- a/prov/opx/include/rdma/opx/opx_hfisvc_poll.h +++ b/prov/opx/include/rdma/opx/opx_hfisvc_poll.h @@ -106,13 +106,8 @@ void opx_domain_hfisvc_poll(struct fi_opx_domain *opx_domain) struct opx_context *context = rzv_comp->context; OPX_HFISVC_DEBUG_LOG( - "STRIPE-MR-NOTIFY: MR notify completion for opx_mr=%p rzv_comp=%p context=%p byte_counter=%lu -> %lu\n", - opx_mr, rzv_comp, context, context->byte_counter, context->byte_counter - 1); - - assert(context->byte_counter > 0); - context->byte_counter -= 1; - - OPX_BUF_FREE(rzv_comp); + "STRIPE-MR-NOTIFY: MR notify completion for opx_mr=%p rzv_comp=%p context=%p byte_counter=%lu (no decrement/no free)\n", + opx_mr, rzv_comp, context, context ? context->byte_counter : 0UL); } else if (opx_mr->hfisvc.state == OPX_MR_HFISVC_STATE_PENDING_KEY_DISABLE) { opx_hfisvc_keyset_free_key(opx_domain->hfisvc.ctxs[0].access_key_set, opx_mr->hfisvc.access_key, NULL); From 4afcf33822e2fc0baae3c403165ce4c3ad5f73a9 Mon Sep 17 00:00:00 2001 From: Andriy Kot Date: Thu, 4 Jun 2026 21:50:41 -0700 Subject: [PATCH 7/8] prov/opx: Initialize deferred HFISVC receive contexts * prov/opx: Initialize deferred HFISVC receive contexts Deferred HMEM rendezvous receives enqueue work before the deferred worker submits RDMA reads. Initialize the matched receive context before enqueueing so the ICQ completion drains niov to zero instead of decrementing the posted-receive sentinel (-1). For truncated receives, keep the counted context pending until scratch reads complete, then report FI_ETRUNC through the CQ error path. * prov/opx: Limit deferred HFISVC context fix scope Keep the STL-77956 deferred receive context initialization focused on the non-truncated dmabuf path while preserving the existing HFISVC truncation behavior for follow-up work.\n\nThe deferred path now initializes the matched receive context before enqueueing only when the OPX MR is still opening and the receive is not truncated. * prov/opx: Format deferred HFISVC context initialization Apply the OPX provider formatting alignment to the deferred receive context initialization added for STL-77956. --------- Signed-off-by: Andriy Kot --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 10 +++++++++- prov/opx/include/rdma/opx/opx_hfisvc_poll.h | 9 +++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 19789ee28b3..74d415d85dc 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -1132,7 +1132,15 @@ void fi_opx_handle_recv_rts_hfisvc(const union opx_hfi1_packet_hdr *const hdr, uint64_t do_dmabuf = (uint64_t) (context->flags & FI_OPX_CQ_CONTEXT_DMABUF_HMEM); if (do_dmabuf) { opx_mr = ((struct fi_opx_hmem_info *) context->hmem_info_qws)->dmabuf.opx_mr; - if (opx_mr->hfisvc.state != OPX_MR_HFISVC_STATE_OPENED) { + if (opx_mr->hfisvc.state != OPX_MR_HFISVC_STATE_OPENED && OFI_LIKELY(xfer_len <= recv_len)) { + context->byte_counter = niov; + context->len = xfer_len; + context->data = ofi_data; + context->tag = origin_tag; + context->next = NULL; + context->flags |= FI_RECV | FI_OPX_HFI_BTH_OPCODE_GET_CQ_FLAG(opcode) | + FI_OPX_HFI_BTH_OPCODE_GET_MSG_FLAG(opcode); + int deferred_rc = opx_hfisvc_deferred_recv_rts_enqueue(opx_ep, context, niov, recv_buf, &payload->rendezvous.hfisvc.iovs[0]); if (OFI_UNLIKELY(deferred_rc)) { diff --git a/prov/opx/include/rdma/opx/opx_hfisvc_poll.h b/prov/opx/include/rdma/opx/opx_hfisvc_poll.h index 3ec2479108f..54042cba80e 100644 --- a/prov/opx/include/rdma/opx/opx_hfisvc_poll.h +++ b/prov/opx/include/rdma/opx/opx_hfisvc_poll.h @@ -106,8 +106,13 @@ void opx_domain_hfisvc_poll(struct fi_opx_domain *opx_domain) struct opx_context *context = rzv_comp->context; OPX_HFISVC_DEBUG_LOG( - "STRIPE-MR-NOTIFY: MR notify completion for opx_mr=%p rzv_comp=%p context=%p byte_counter=%lu (no decrement/no free)\n", - opx_mr, rzv_comp, context, context ? context->byte_counter : 0UL); + "STRIPE-MR-NOTIFY: MR notify completion for opx_mr=%p rzv_comp=%p context=%p byte_counter=%lu -> %lu\n", + opx_mr, rzv_comp, context, context->byte_counter, context->byte_counter - 1); + + assert(context->byte_counter > 0); + context->byte_counter -= 1; + + OPX_BUF_FREE(rzv_comp); } else if (opx_mr->hfisvc.state == OPX_MR_HFISVC_STATE_PENDING_KEY_DISABLE) { opx_hfisvc_keyset_free_key(opx_domain->hfisvc.ctxs[0].access_key_set, opx_mr->hfisvc.access_key, NULL); From bf40ab93da2e0b8f32bd65397ae179bbd06da467 Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Tue, 9 Jun 2026 10:49:23 -0400 Subject: [PATCH 8/8] prov/opx: Lower GDRcopy threshold/Allow HMEM MP Eager Multi-packet eager sends would only allow SYSTEM memory on the belief that RZV was more performant. That does not appear to be the case, so allow GPU memory to use the multi-packet eager path. Lowered send threshold default from 4096 to 512. cuda_gdrcopy_from_dev (CPU pulls from GPU BAR1 write-combining mapping) saturates due to uncached PCIe round-trips per cacheline. Above ~512B, ofi_cudaMemcpy(D2H) wins over gdrcopy. Below ~512B, gdrcopy's lower fixed cost still beats the cudaMemcpy CUDA-driver floor. The receive-side knob (OPX_HMEM_DEV_REG_RECV_THRESHOLD) is unchanged: CPU writes to GPU BAR1 via gdr_copy_to_dev hit the fast WC store path. Signed-off-by: Lindsay Reiser (cherry picked from commit b44d45b25b756e4b4b69aa52d0b347abd34cfd13) --- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 7 +++---- prov/opx/include/rdma/opx/opx_hmem_domain.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index 74d415d85dc..3e61350086f 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -1120,7 +1120,7 @@ void fi_opx_handle_recv_rts_hfisvc(const union opx_hfi1_packet_hdr *const hdr, uint64_t xfer_len = hdr->rzv_rts.message_length; #ifdef OPX_TRACER_ENABLED - const uint64_t trace_slid_len = OPX_TRACE_PACK_SLID_LEN(lid, xfer_len); + const uint64_t trace_slid_len = OPX_TRACE_PACK_SLID_LEN(OPX_TRACE_GET_SLID(hdr, hfi1_type), xfer_len); OPX_TRACE_RX_BEGIN(OPX_TRACE_EVENT_RX_RZV_RTS_HFISVC, trace_slid_len, origin_tag); #endif @@ -4449,13 +4449,12 @@ static inline ssize_t fi_opx_ep_tx_send_internal(struct fid_ep *ep, const void * return -FI_EAGAIN; } - /* If hmem_iface != FI_HMEM_SYSTEM, we skip MP EGR because RZV yields better performance for devices */ if (total_len <= opx_tx->mp_eager_max_payload_bytes && is_contiguous && - !fi_opx_hfi1_tx_is_shm(opx_ep, addr, caps) && (caps & FI_TAGGED) && hmem_iface == FI_HMEM_SYSTEM) { + !fi_opx_hfi1_tx_is_shm(opx_ep, addr, caps) && (caps & FI_TAGGED)) { assert(total_len >= opx_tx->mp_eager_min_payload_bytes); rc = opx_hfi1_tx_send_try_mp_egr(ep, buf, len, addr, tag, context, data, lock_required, override_flags, tx_op_flags, caps, reliability, - do_cq_completion, FI_HMEM_SYSTEM, 0ul, OPX_HMEM_NO_HANDLE, + do_cq_completion, hmem_iface, hmem_device, hmem_handle, hfi1_type, ctx_sharing); if (OFI_LIKELY(rc == FI_SUCCESS)) { OPX_TRACE_TX_END_SUCCESS(OPX_TRACE_EVENT_TX_SEND, 0, 0); diff --git a/prov/opx/include/rdma/opx/opx_hmem_domain.h b/prov/opx/include/rdma/opx/opx_hmem_domain.h index 8f365b01271..0f92340a666 100644 --- a/prov/opx/include/rdma/opx/opx_hmem_domain.h +++ b/prov/opx/include/rdma/opx/opx_hmem_domain.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024-2025 Cornelis Networks. + * Copyright (C) 2024-2026 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -41,7 +41,7 @@ #define OPX_HMEM_NO_LOCK_ON_CLEANUP (0) -#define OPX_HMEM_DEV_REG_SEND_THRESHOLD_DEFAULT (4096) +#define OPX_HMEM_DEV_REG_SEND_THRESHOLD_DEFAULT (512) #define OPX_HMEM_DEV_REG_RECV_THRESHOLD_DEFAULT (OPX_HFI1_PKT_SIZE) #define OPX_HMEM_DEV_REG_THRESHOLD_MAX (OPX_HFI1_PKT_SIZE) #define OPX_HMEM_DEV_REG_THRESHOLD_MIN (0)