From e336708656a22b40efb0ac4eceb76c646ae9ae67 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 24 Mar 2026 00:51:56 +0000
Subject: [PATCH 1/2] Initial plan

From cc8cbcb2812a551c491bbd19b1a18bbe4bce60bc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 24 Mar 2026 01:00:43 +0000
Subject: [PATCH 2/2] fix: use scope="sys" for lock signal atomic in
 matmul_all_reduce kernel

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
Agent-Logs-Url: https://github.com/ROCm/iris/sessions/09bd515d-f545-480e-b31c-040637ee3e8c
---
 iris/ops/matmul_all_reduce.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/iris/ops/matmul_all_reduce.py b/iris/ops/matmul_all_reduce.py
index 73bea92c..7abf63f8 100644
--- a/iris/ops/matmul_all_reduce.py
+++ b/iris/ops/matmul_all_reduce.py
@@ -132,7 +132,7 @@ def _fused_matmul_all_reduce_kernel(
     # Use atomic_xchg with release semantics to ensure memory ordering
     tile_id = pid_m * num_tiles_n + pid_n
     lock_ptr = locks + tile_id
-    tl.atomic_xchg(lock_ptr, 1, sem="release", scope="gpu")  # Release ensures prior stores visible
+    tl.atomic_xchg(lock_ptr, 1, sem="release", scope="sys")  # Release ensures prior stores visible to remote GPUs
 
     # Create source view only when needed (aux_buffer is not None)
     src_view = iris.x.make_tensor_view(aux_buffer, M, N, stride_cm, stride_cn)
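
Why the one-word change matters: sem="release" orders all prior stores before the lock update, but scope controls which observers that ordering is guaranteed for. scope="gpu" only covers agents on the local device; since the all-reduce consumers poll the lock from peer GPUs, the release must be system-scoped. The sketch below is a minimal, hypothetical producer/consumer pair showing the paired semantics; producer_kernel, consumer_kernel, payload_ptr, lock_ptr, and N are illustrative names, not part of the Iris codebase.

# Minimal sketch, NOT the Iris implementation: a producer/consumer handshake
# using the same atomic semantics as the patched kernel. All names here are
# hypothetical and exist only for illustration.
import triton
import triton.language as tl


@triton.jit
def producer_kernel(payload_ptr, lock_ptr, value, N: tl.constexpr):
    offs = tl.arange(0, N)
    tl.store(payload_ptr + offs, value)  # ordinary stores to the shared buffer
    # Release + system scope: all prior stores are made visible to every
    # observer in the system (including peer GPUs) before the flag flips.
    # With scope="gpu" the ordering guarantee stops at the local device.
    tl.atomic_xchg(lock_ptr, 1, sem="release", scope="sys")


@triton.jit
def consumer_kernel(payload_ptr, out_ptr, lock_ptr, N: tl.constexpr):
    # Spin until the producer publishes; this acquire pairs with the release
    # above, so the payload reads below cannot observe stale data.
    while tl.atomic_cas(lock_ptr, 1, 1, sem="acquire", scope="sys") != 1:
        pass
    offs = tl.arange(0, N)
    tl.store(out_ptr + offs, tl.load(payload_ptr + offs))

The design point is that sem and scope are orthogonal: release/acquire fixes the ordering, scope fixes the audience. A release that is sufficient for same-device consumers (scope="gpu") silently under-synchronizes once the lock is polled across the fabric, which is exactly the bug this patch fixes.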