diff --git a/csrc/standalone/.gitignore b/csrc/standalone/.gitignore new file mode 100644 index 00000000..e7105ccb --- /dev/null +++ b/csrc/standalone/.gitignore @@ -0,0 +1,3 @@ +# Built binaries (output of `make`) +p2p_atomics_hsa +p2p_atomics_hip diff --git a/csrc/standalone/Makefile b/csrc/standalone/Makefile new file mode 100644 index 00000000..37278a31 --- /dev/null +++ b/csrc/standalone/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +# +# Build the standalone P2P VMem atomic examples. +# Requires ROCm with hipcc and libhsa-runtime64. + +HIPCC ?= hipcc +ROCM_DIR ?= /opt/rocm +CXXFLAGS = -O2 -g -std=c++17 -I$(ROCM_DIR)/include +LDFLAGS = -L$(ROCM_DIR)/lib -Wl,-rpath,$(ROCM_DIR)/lib + +.PHONY: all clean + +all: p2p_atomics_hsa p2p_atomics_hip + +# PATH 3 — HSA fine-grained VMem (correct, P2P atomics pass) +p2p_atomics_hsa: p2p_atomics_hsa.cpp + $(HIPCC) $(CXXFLAGS) -o $@ $< $(LDFLAGS) -lhsa-runtime64 + +# PATH 2 — HIP VMem (coarse-grained, P2P atomics cause GPU page fault) +p2p_atomics_hip: p2p_atomics_hip.cpp + $(HIPCC) $(CXXFLAGS) -o $@ $< $(LDFLAGS) + +clean: + rm -f p2p_atomics_hsa p2p_atomics_hip diff --git a/csrc/standalone/README.md b/csrc/standalone/README.md new file mode 100644 index 00000000..1991ff43 --- /dev/null +++ b/csrc/standalone/README.md @@ -0,0 +1,124 @@ +# Standalone P2P VMem Atomic Examples (HIP/C++) + +Minimal, standalone two-process HIP/C++ programs that demonstrate the three +GPU memory allocation paths for multi-GPU P2P atomic operations on AMD GPUs. + +## Background + +For correct cross-GPU (P2P) atomic operations, the physical memory must be +**fine-grained** (`CoarseGrain=0` in the KFD driver). 
Two common allocation +paths produce fine-grained memory; one (the default HIP Virtual Memory API) +always produces coarse-grained memory: + +| Path | API | KFD CoarseGrain | P2P atomics | +|------|-----|:---------:|:-----------:| +| 1 | `hipExtMallocWithFlags(hipDeviceMallocFinegrained)` | 0 | ✓ | +| **2** | **`hipMemCreate` (any `hipMemAllocationType`)** | **1 (hardcoded)** | **✗ crashes** | +| 3 | `hsa_amd_vmem_handle_create(fine_grained_pool)` | 0 | ✓ | + +Path 2 always fails because HIP/CLR's `SvmBuffer::malloc(ROCCLR_MEM_PHYMEM)` +hardcodes the coarse-grained GPU pool regardless of the `prop.type` field (even +`hipMemAllocationTypeUncached = 0x40000000` is silently ignored). + +These examples let you reproduce the bug and the fix on any system with ≥2 +AMD GPUs running ROCm. + +## Files + +| File | Description | +|------|-------------| +| `p2p_atomics_hsa.cpp` | **Path 3** — HSA fine-grained VMem (correct) | +| `p2p_atomics_hip.cpp` | **Path 2** — HIP VMem (demonstrates the coarse-grained bug) | +| `Makefile` | Builds both examples | + +## Build + +```bash +# From this directory +make + +# With a custom ROCm installation +make ROCM_DIR=/path/to/rocm +``` + +## Run + +### HSA example (Path 3 — fine-grained, CORRECT) + +```bash +./p2p_atomics_hsa [N_ITERS] +``` + +- Spawns two processes (rank 0 → GPU 0, rank 1 → GPU 1) via `fork()` +- Allocates physical memory with `hsa_amd_vmem_handle_create` on the + **fine-grained** GPU memory pool +- Exports handles as DMA-BUF file descriptors via SCM_RIGHTS +- Imports peer handles and maps them to local virtual address space +- Runs P2P atomic `atomicAdd` at **agent scope** and **system scope** +- Expected output: both scopes pass with 0 failures + +``` +p2p_atomics_hsa: PATH 3 — HSA fine-grained VMem + N_ITERS=200 GPUs=2 + +[rank 0] starting on GPU 0 +[rank 0] fine-grained pool granularity = 2097152 bytes +[rank 0] my_va=0x... peer_va=0x... 
+[rank 0] agent-scope: 0/200 failures sys-scope: 0/200 failures +[rank 0] PASS +... +Overall: PASS +``` + +### HIP example (Path 2 — coarse-grained, BUG DEMONSTRATION) + +```bash +# Safe mode: P2P non-atomic reads only (setup verification) +./p2p_atomics_hip [--pinned|--uncached] + +# Atomic mode: P2P atomics — WARNING: causes GPU page fault! +./p2p_atomics_hip --atomics [--agent|--sys] [N_ITERS] +``` + +#### Options + +| Flag | Description | +|------|-------------| +| `--pinned` | `hipMemAllocationTypePinned` (0x1) — default | +| `--uncached` | `hipMemAllocationTypeUncached` (0x40000000) — AMD extension | +| `--atomics` | Enable P2P atomic kernels (**WARNING: crashes on coarse-grained memory**) | +| `--agent` | Use agent-scope atomics (most likely to crash) | +| `--sys` | Use system-scope atomics (default when `--atomics` given) | + +#### Expected behaviour + +| Mode | Expected | +|------|----------| +| `--pinned` (read only) | PASS — P2P non-atomic reads work on coarse-grained memory | +| `--uncached` (read only) | PASS or driver-level skip — uncached type is accepted but silently ignored by CLR | +| `--atomics --agent` | **CRASH** — GPU page fault (SIGSEGV) from coarse-grained P2P atomic | +| `--atomics --sys` | **CRASH** — system-scope P2P atomics also fault on coarse-grained memory | +| `--uncached --atomics` | **Same crash** — uncached type does NOT change the allocation to fine-grained | + +The key finding from testing on MI300X: + +> `hipMemAllocationTypeUncached` (0x40000000) is accepted by `hipMemCreate` +> (returns `hipSuccess`) but **does not change the memory grain**. CLR always +> routes through the coarse-grained pool regardless of `prop.type`. P2P atomics +> still cause GPU page faults identical to `hipMemAllocationTypePinned`. 
+ +## Implementation notes + +- Both programs use `fork()` + `socketpair(AF_UNIX)` — no MPI dependency +- DMA-BUF file descriptors are exchanged via `sendmsg(SCM_RIGHTS)` / + `recvmsg(SCM_RIGHTS)` +- The barrier between ranks is a 1-byte exchange over the Unix socket +- Device kernels use `__hip_atomic_fetch_add` with `__HIP_MEMORY_SCOPE_AGENT` + and `__HIP_MEMORY_SCOPE_SYSTEM` for the two scope levels + +## The fix + +Use `hsa_amd_vmem_handle_create` directly (Path 3) with the fine-grained pool +found by iterating `hsa_amd_agent_iterate_memory_pools` and checking +`HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED`. This is exactly what Iris does +in its `HsaVMemAllocator` (`iris/allocators/hsa_vmem_allocator.py`). diff --git a/csrc/standalone/p2p_atomics_hip.cpp b/csrc/standalone/p2p_atomics_hip.cpp new file mode 100644 index 00000000..9edc3d32 --- /dev/null +++ b/csrc/standalone/p2p_atomics_hip.cpp @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +// +// p2p_atomics_hip.cpp — Standalone HIP VMem P2P atomic test (PATH 2, BUGGY) +// +// Demonstrates the BROKEN approach for multi-GPU P2P atomic operations: +// hipMemCreate -> CLR -> hsa_amd_vmem_handle_create (coarse-grained pool hardcoded) +// +// Stack: +// hipMemCreate(&handle, size, &prop, 0) +// -> CLR: SvmBuffer::malloc(ROCCLR_MEM_PHYMEM) +// -> hsa_amd_vmem_handle_create(COARSE_GRAINED_POOL) ← CLR hardcodes this +// -> KFD: hsaKmtAllocMemory(CoarseGrain=1, NoAddress=1) +// hipMemExportToShareableHandle -> DMA-BUF fd -> SCM_RIGHTS +// hipMemImportFromShareableHandle -> hipMemMap + hipMemSetAccess +// P2P atomic_add -> GPU PAGE FAULT (coarse-grained P2P atomics not supported) +// +// ┌-------------------------------------------------------------------------┐ +// │ WARNING: P2P atomic operations (any scope) on coarse-grained memory │ +// │ trigger GPU page faults that send SIGSEGV to the process. 
By default │ +// │ this program only runs a P2P non-atomic read to verify the mapping. │ +// │ Pass --atomics to also run P2P atomics — this WILL CRASH. │ +// └-------------------------------------------------------------------------┘ +// +// Options: +// --pinned hipMemAllocationTypePinned (0x1) — default +// --uncached hipMemAllocationTypeUncached (0x40000000) — AMD extension; +// accepted by hipMemCreate but CLR still uses coarse-grained pool +// --atomics Run P2P atomic kernels (WARNING: causes GPU page fault!) +// --agent Use agent-scope atomics (default when --atomics given) +// --sys Use system-scope atomics (slower, still faults on coarse-grained) +// N Number of iterations (default 200; auto-reduced to 20 for atomics) +// +// Build: +// hipcc -o p2p_atomics_hip p2p_atomics_hip.cpp +// +// Run examples: +// ./p2p_atomics_hip # safe: P2P non-atomic read only +// ./p2p_atomics_hip --uncached # probe uncached alloc type (safe) +// ./p2p_atomics_hip --atomics # agent-scope P2P atomics (WILL CRASH) +// ./p2p_atomics_hip --atomics --sys # sys-scope P2P atomics (WILL CRASH) +// ./p2p_atomics_hip --uncached --atomics # uncached + atomics (same crash) +// +// Compare with p2p_atomics_hsa.cpp (HSA fine-grained path) which passes all tests. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================================================ +// Error-checking macro +// ============================================================================ + +#define HIP_CHECK(expr) \ + do { \ + hipError_t _e = (expr); \ + if (_e != hipSuccess) { \ + fprintf(stderr, "[rank %d] HIP error at %s:%d — %s\n", g_rank, __FILE__, \ + __LINE__, hipGetErrorString(_e)); \ + abort(); \ + } \ + } while (0) + +static int g_rank = -1; + +// ============================================================================ +// SCM_RIGHTS helpers +// ============================================================================ + +static void send_fd(int sock, int fd) { + char buf[1] = {0}; + struct iovec iov = {buf, 1}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); + ssize_t n = sendmsg(sock, &msg, 0); + assert(n == 1); +} + +static int recv_fd(int sock) { + char buf[1]; + struct iovec iov = {buf, 1}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + ssize_t n = recvmsg(sock, &msg, 0); + assert(n == 1); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + assert(cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS); + int fd; + memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); + return fd; +} + +static void barrier(int sock) { + char c = 'b'; + write(sock, &c, 1); + read(sock, &c, 1); +} + +// 
============================================================================ +// Device kernels +// ============================================================================ + +// Agent-scope atomic add — fails on coarse-grained P2P (GPU page fault) +__global__ void k_atomic_add_agent(float* ptr) { + __hip_atomic_fetch_add(ptr, 1.0f, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_AGENT); +} + +// System-scope atomic add — also fails on coarse-grained P2P (GPU page fault) +__global__ void k_atomic_add_sys(float* ptr) { + __hip_atomic_fetch_add(ptr, 1.0f, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SYSTEM); +} + +// Non-atomic write / read — safe on coarse-grained P2P +__global__ void k_write_value(float* ptr, float val) { *ptr = val; } + +// Read with system-scope fence to see writes from remote GPUs +__global__ void k_read_to(float* dst, const float* src) { + __threadfence_system(); + *dst = *src; +} +__global__ void k_zero(float* ptr) { *ptr = 0.0f; } + +// ============================================================================ +// Config passed through env vars for the self-exec'd rank 1 +// ============================================================================ + +struct Config { + int alloc_type; // 0x1 = pinned, 0x40000000 = uncached + bool run_atomics; + bool agent_scope; // true = agent, false = sys + int n_iters; +}; + +static Config read_config_from_env() { + Config cfg = {}; + cfg.alloc_type = 0x1; // pinned default + cfg.run_atomics = false; + cfg.agent_scope = true; + cfg.n_iters = 200; + + const char* at = getenv("P2P_ALLOC_TYPE"); + if (at) cfg.alloc_type = (int)strtol(at, nullptr, 0); + const char* ra = getenv("P2P_ATOMICS"); + if (ra) cfg.run_atomics = atoi(ra) != 0; + const char* ag = getenv("P2P_AGENT_SCOPE"); + if (ag) cfg.agent_scope = atoi(ag) != 0; + const char* ni = getenv("P2P_NITERS"); + if (ni) cfg.n_iters = atoi(ni); + return cfg; +} + +// ============================================================================ +// Per-rank logic +// 
============================================================================ + +static int run_rank(int rank, int sock, const Config& cfg) { + g_rank = rank; + int gpu_id = rank; + + const char* alloc_name = + (cfg.alloc_type == 0x40000000) ? "UNCACHED(0x40000000)" : "PINNED(0x1)"; + printf("[rank %d] GPU=%d alloc_type=%s\n", rank, gpu_id, alloc_name); + fflush(stdout); + + HIP_CHECK(hipSetDevice(gpu_id)); + + // -- Query allocation granularity ---------------------------------------- + hipMemAllocationProp prop = {}; + prop.type = (hipMemAllocationType)cfg.alloc_type; + prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor; + prop.location.type = hipMemLocationTypeDevice; + prop.location.id = gpu_id; + + size_t gran = 0; + HIP_CHECK(hipMemGetAllocationGranularity(&gran, &prop, hipMemAllocationGranularityRecommended)); + if (gran == 0) gran = 2u << 20; + size_t alloc_size = gran; + + printf("[rank %d] granularity = %zu bytes\n", rank, gran); + fflush(stdout); + + // -- Reserve virtual address space ------------------------------------------- + void* my_va = nullptr; + HIP_CHECK(hipMemAddressReserve(&my_va, alloc_size, gran, /*addr=*/0, /*flags=*/0)); + + // -- Create physical allocation -------------------------------------------- + // hipMemCreate with any hipMemAllocationType always allocates from the + // COARSE-GRAINED GPU pool in HIP/CLR (SvmBuffer::malloc ROCCLR_MEM_PHYMEM). + // Even hipMemAllocationTypeUncached (0x40000000) is silently ignored by CLR. 
+ hipMemGenericAllocationHandle_t my_handle = 0; + hipError_t create_err = hipMemCreate(&my_handle, alloc_size, &prop, /*flags=*/0); + if (create_err != hipSuccess) { + fprintf(stderr, "[rank %d] hipMemCreate(%s) failed: %s\n", rank, alloc_name, + hipGetErrorString(create_err)); + // Clean up and synchronize with peer so it doesn't hang + (void)hipMemAddressFree(my_va, alloc_size); + barrier(sock); + return 0; // Driver rejected this alloc type — not a test failure + } + printf("[rank %d] hipMemCreate(%s) succeeded\n", rank, alloc_name); + fflush(stdout); + + // -- Map + set access ------------------------------------------------------ + HIP_CHECK(hipMemMap(my_va, alloc_size, /*offset=*/0, my_handle, /*flags=*/0)); + + hipMemAccessDesc desc = {}; + desc.location.type = hipMemLocationTypeDevice; + desc.location.id = gpu_id; + desc.flags = hipMemAccessFlagsProtReadWrite; + HIP_CHECK(hipMemSetAccess(my_va, alloc_size, &desc, 1)); + + // -- DMA-BUF Export ------------------------------------ + int my_fd = -1; + HIP_CHECK(hipMemExportToShareableHandle(&my_fd, my_handle, + hipMemHandleTypePosixFileDescriptor, /*flags=*/0)); + + // -- Exchange fds with peer (rank 0 sends first) ---------- + int peer_fd = -1; + if (rank == 0) { + send_fd(sock, my_fd); + peer_fd = recv_fd(sock); + } else { + peer_fd = recv_fd(sock); + send_fd(sock, my_fd); + } + close(my_fd); + + // -- Import peer handle + map ----------------------------------------------- + hipMemGenericAllocationHandle_t peer_handle = 0; + HIP_CHECK(hipMemImportFromShareableHandle( + &peer_handle, + (void*)(intptr_t)peer_fd, // POSIX fd passed as void* + hipMemHandleTypePosixFileDescriptor)); + close(peer_fd); + + void* peer_va = nullptr; + HIP_CHECK(hipMemAddressReserve(&peer_va, alloc_size, gran, 0, 0)); + HIP_CHECK(hipMemMap(peer_va, alloc_size, 0, peer_handle, 0)); + + hipMemAccessDesc peer_desc = {}; + peer_desc.location.type = hipMemLocationTypeDevice; + peer_desc.location.id = gpu_id; + peer_desc.flags = 
hipMemAccessFlagsProtReadWrite; + HIP_CHECK(hipMemSetAccess(peer_va, alloc_size, &peer_desc, 1)); + + printf("[rank %d] my_va=%p peer_va=%p\n", rank, my_va, peer_va); + fflush(stdout); + + float* my_ptr = (float*)my_va; + float* peer_ptr = (float*)peer_va; + + // Bounce buffer: hipMalloc VA guaranteed safe for hipMemcpy(D2H) + float* bounce; + HIP_CHECK(hipMalloc(&bounce, sizeof(float))); + + // Barrier: ensure both ranks finished VA setup before starting the test + barrier(sock); + + // -- P2P non-atomic read/write (safe, runs regardless of --atomics flag) -- + // Kernel-based init avoids hipMemset on HIP VMem VA (may not be registered) + k_zero<<<1, 1>>>(my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_write_value<<<1, 1>>>(my_ptr, (float)(rank + 1)); + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_read_to<<<1, 1>>>(bounce, peer_ptr); + HIP_CHECK(hipDeviceSynchronize()); + + float peer_val = -1; + HIP_CHECK(hipMemcpy(&peer_val, bounce, sizeof(float), hipMemcpyDeviceToHost)); + + float expected = (float)(2 - rank); // peer rank is (1-rank), wrote (peer_rank+1) + int read_ok = (fabsf(peer_val - expected) < 0.1f); + printf("[rank %d] P2P non-atomic read: got %g (expected %g) -> %s\n", rank, peer_val, + expected, read_ok ? "PASS" : "FAIL"); + fflush(stdout); + + int fails = 0; + + // -- P2P atomic test (opt-in, WARNING: WILL CRASH on coarse-grained!) ---- + if (cfg.run_atomics) { + const char* scope_name = cfg.agent_scope ? "agent" : "sys"; + printf("[rank %d] === P2P ATOMIC (%s-scope, alloc=%s) — EXPECT GPU PAGE FAULT! 
===\n", + rank, scope_name, alloc_name); + fflush(stdout); + + for (int iter = 0; iter < cfg.n_iters; iter++) { + k_zero<<<1, 1>>>(my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + if (cfg.agent_scope) { + k_atomic_add_agent<<<1, 1>>>(my_ptr); + k_atomic_add_agent<<<1, 1>>>(peer_ptr); // P2P: expect GPU page fault + } else { + k_atomic_add_sys<<<1, 1>>>(my_ptr); + k_atomic_add_sys<<<1, 1>>>(peer_ptr); // P2P: also faults on coarse-grained + } + // hipDeviceSynchronize surfaces the GPU fault (SIGSEGV to this process) + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_read_to<<<1, 1>>>(bounce, my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + float val = 0; + HIP_CHECK(hipMemcpy(&val, bounce, sizeof(float), hipMemcpyDeviceToHost)); + if (fabsf(val - 2.0f) > 0.1f) { + if (++fails <= 5) + printf("[rank %d] iter %d %s-scope: expected 2.0, got %g\n", rank, iter, + scope_name, val); + } + } + printf("[rank %d] %s-scope P2P atomics: %d/%d failures\n", rank, scope_name, fails, + cfg.n_iters); + fflush(stdout); + } else { + printf("[rank %d] P2P atomics skipped (pass --atomics to enable, expect crash)\n", + rank); + fflush(stdout); + } + + HIP_CHECK(hipFree(bounce)); + + printf("[rank %d] %s\n", rank, (read_ok && fails == 0) ? "PASS" : "FAIL"); + fflush(stdout); + + // -- Cleanup --------------------------------------------------------------- + barrier(sock); // sync before cleanup to avoid one rank freeing while other maps + + (void)hipMemUnmap(peer_va, alloc_size); + (void)hipMemAddressFree(peer_va, alloc_size); + (void)hipMemRelease(peer_handle); + + (void)hipMemUnmap(my_va, alloc_size); + (void)hipMemAddressFree(my_va, alloc_size); + (void)hipMemRelease(my_handle); + + return (read_ok && fails == 0) ? 
0 : 1; +} + +// ============================================================================ +// main -- self-exec trick (same rationale as p2p_atomics_hsa.cpp) +// ============================================================================ + +int main(int argc, char** argv) { + // -- Re-exec path: invoked as rank 1 -- + if (getenv("P2P_RANK")) { + int rank = atoi(getenv("P2P_RANK")); + int sock_fd = atoi(getenv("P2P_SOCK_FD")); + Config cfg = read_config_from_env(); + int rc = run_rank(rank, sock_fd, cfg); + close(sock_fd); + return rc; + } + + // -- Primary path: parse args, spawn rank 1, run rank 0 ------------------- + Config cfg = {}; + cfg.alloc_type = 0x1; // pinned + cfg.run_atomics = false; + cfg.agent_scope = true; // default scope when --atomics given + cfg.n_iters = 200; + bool n_iters_set = false; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--uncached") == 0) cfg.alloc_type = 0x40000000; + else if (strcmp(argv[i], "--pinned") == 0) cfg.alloc_type = 0x1; + else if (strcmp(argv[i], "--atomics") == 0) cfg.run_atomics = true; + else if (strcmp(argv[i], "--agent") == 0) cfg.agent_scope = true; + else if (strcmp(argv[i], "--sys") == 0) cfg.agent_scope = false; + else { int n = atoi(argv[i]); if (n > 0) { cfg.n_iters = n; n_iters_set = true; } } + } + if (cfg.run_atomics && !n_iters_set) cfg.n_iters = 20; + + int ndev = 0; + (void)hipGetDeviceCount(&ndev); + if (ndev < 2) { + fprintf(stderr, "This example requires at least 2 GPU devices (found %d).\n", ndev); + return 1; + } + + const char* alloc_name = (cfg.alloc_type == 0x40000000) ? "UNCACHED" : "PINNED"; + printf("p2p_atomics_hip: PATH 2 — HIP VMem (coarse-grained, alloc=%s)\n", alloc_name); + printf(" N_ITERS=%d GPUs=%d atomics=%s scope=%s\n\n", cfg.n_iters, ndev, + cfg.run_atomics ? "yes" : "no", cfg.agent_scope ? 
"agent" : "sys"); + + if (cfg.run_atomics) { + printf("WARNING: P2P atomics on coarse-grained memory cause GPU page faults (SIGSEGV).\n"); + printf("hipMemCreate ALWAYS uses the coarse-grained pool regardless of " + "hipMemAllocationType\n(incl. UNCACHED). " + "See p2p_atomics_hsa.cpp for the correct HSA fix.\n\n"); + fflush(stdout); + } + + int sv[2]; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) != 0) { perror("socketpair"); return 1; } + + // Pass config to rank 1 via env vars + char at_str[32], ra_str[4], ag_str[4], ni_str[32], sk_str[32]; + snprintf(at_str, sizeof(at_str), "0x%x", cfg.alloc_type); + snprintf(ra_str, sizeof(ra_str), "%d", (int)cfg.run_atomics); + snprintf(ag_str, sizeof(ag_str), "%d", (int)cfg.agent_scope); + snprintf(ni_str, sizeof(ni_str), "%d", cfg.n_iters); + snprintf(sk_str, sizeof(sk_str), "%d", sv[1]); + setenv("P2P_RANK", "1", 1); + setenv("P2P_SOCK_FD", sk_str, 1); + setenv("P2P_ALLOC_TYPE", at_str, 1); + setenv("P2P_ATOMICS", ra_str, 1); + setenv("P2P_AGENT_SCOPE", ag_str, 1); + setenv("P2P_NITERS", ni_str, 1); + + pid_t pid = fork(); + if (pid < 0) { perror("fork"); return 1; } + if (pid == 0) { + close(sv[0]); + execl("/proc/self/exe", argv[0], NULL); + perror("execl"); + _exit(127); + } + + unsetenv("P2P_RANK"); + unsetenv("P2P_SOCK_FD"); + close(sv[1]); + + int rc0 = run_rank(0, sv[0], cfg); + close(sv[0]); + + int status = 0; + waitpid(pid, &status, 0); + int rc1 = WIFEXITED(status) ? WEXITSTATUS(status) : 1; + + if (rc0 != 0 || rc1 != 0) + printf("\nOverall: FAIL (rank0=%d rank1=%d)\n", rc0, rc1); + else + printf("\nOverall: PASS\n"); + + return (rc0 != 0 || rc1 != 0) ? 1 : 0; +} diff --git a/csrc/standalone/p2p_atomics_hsa.cpp b/csrc/standalone/p2p_atomics_hsa.cpp new file mode 100644 index 00000000..8e74e929 --- /dev/null +++ b/csrc/standalone/p2p_atomics_hsa.cpp @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. 
+// +// p2p_atomics_hsa.cpp — Standalone HSA VMem P2P atomic test (PATH 3, CORRECT) +// +// Demonstrates the CORRECT approach for multi-GPU P2P atomic operations: +// hsa_amd_vmem_handle_create on the **fine-grained** GPU pool. +// +// Stack: +// hsa_amd_vmem_handle_create(fine_grained_pool, size) +// -> KFD: hsaKmtAllocMemory(CoarseGrain=0, NoAddress=1) +// hsa_amd_vmem_export_shareable_handle -> DMA-BUF fd -> SCM_RIGHTS +// hsa_amd_vmem_import_shareable_handle -> hsa_amd_vmem_map + set_access +// P2P atomic_add (agent scope + system scope) -> BOTH PASS +// +// Compare with p2p_atomics_hip.cpp which uses hipMemCreate and always allocates +// from the coarse-grained pool, causing P2P atomics to produce GPU page faults. +// +// Build: +// hipcc -o p2p_atomics_hsa p2p_atomics_hsa.cpp -lhsa-runtime64 +// +// Run: +// ./p2p_atomics_hsa [N_ITERS] +// N_ITERS defaults to 200. Requires at least 2 GPU devices. +// +// The program self-execs to spawn rank 1 as a fresh process (see main()). +// Rank 0 and rank 1 communicate via a Unix socketpair. No MPI or torchrun +// dependency. 
+ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================================================ +// Error-checking macros +// ============================================================================ + +#define HIP_CHECK(expr) \ + do { \ + hipError_t _e = (expr); \ + if (_e != hipSuccess) { \ + fprintf(stderr, "[rank %d] HIP error at %s:%d — %s\n", g_rank, __FILE__, \ + __LINE__, hipGetErrorString(_e)); \ + abort(); \ + } \ + } while (0) + +#define HSA_CHECK(expr) \ + do { \ + hsa_status_t _s = (expr); \ + if (_s != HSA_STATUS_SUCCESS) { \ + const char* msg = "(unknown)"; \ + hsa_status_string(_s, &msg); \ + fprintf(stderr, "[rank %d] HSA error at %s:%d — %s (0x%x)\n", g_rank, __FILE__, \ + __LINE__, msg, (unsigned)_s); \ + abort(); \ + } \ + } while (0) + +static int g_rank = -1; + +// ============================================================================ +// SCM_RIGHTS helpers: send/receive a file descriptor over a Unix socket +// ============================================================================ + +static void send_fd(int sock, int fd) { + char buf[1] = {0}; + struct iovec iov = {buf, 1}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); + ssize_t n = sendmsg(sock, &msg, 0); + assert(n == 1); +} + +static int recv_fd(int sock) { + char buf[1]; + struct iovec iov = {buf, 1}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct msghdr msg = {}; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + 
msg.msg_controllen = sizeof(cmsg_buf); + ssize_t n = recvmsg(sock, &msg, 0); + assert(n == 1); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); + assert(cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS); + int fd; + memcpy(&fd, CMSG_DATA(cmsg), sizeof(int)); + return fd; +} + +// Simple barrier: both sides write 1 byte then read 1 byte. +static void barrier(int sock) { + char c = 'b'; + write(sock, &c, 1); + read(sock, &c, 1); +} + +// ============================================================================ +// HSA agent / pool enumeration helpers +// ============================================================================ + +struct AgentList { + hsa_agent_t agents[16]; + int count; +}; + +static hsa_status_t agent_cb(hsa_agent_t agent, void* data) { + hsa_device_type_t type; + if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) == HSA_STATUS_SUCCESS && + type == HSA_DEVICE_TYPE_GPU) { + AgentList* list = (AgentList*)data; + if (list->count < 16) list->agents[list->count++] = agent; + } + return HSA_STATUS_SUCCESS; +} + +struct PoolSearch { + hsa_amd_memory_pool_t pool; + bool found; + size_t granularity; +}; + +static hsa_status_t pool_cb(hsa_amd_memory_pool_t pool, void* data) { + bool alloc_ok = false; + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc_ok); + if (!alloc_ok) return HSA_STATUS_SUCCESS; + + // HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2 + uint32_t flags = 0; + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); + if (!(flags & 2u)) return HSA_STATUS_SUCCESS; + + PoolSearch* ps = (PoolSearch*)data; + ps->pool = pool; + ps->found = true; + + size_t gran = 0; + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE, &gran); + if (gran == 0) + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &gran); + ps->granularity = gran; + + return (hsa_status_t)0x1; // 
HSA_STATUS_INFO_BREAK — stop iteration +} + +// ============================================================================ +// Device kernels +// ============================================================================ + +// Agent-scope atomic add (Triton scope="gpu", sem="acq_rel"). +// Fine-grained memory: PASS. Coarse-grained P2P: GPU page fault (SIGSEGV). +// Uses ACQ_REL ordering for cross-GPU coherency (RELAXED is insufficient). +__global__ void k_atomic_add_agent(float* ptr) { + __hip_atomic_fetch_add(ptr, 1.0f, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_AGENT); +} + +// System-scope atomic add (Triton scope="sys"). +// Works on both fine-grained and coarse-grained memory (slower path). +__global__ void k_atomic_add_sys(float* ptr) { + __hip_atomic_fetch_add(ptr, 1.0f, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SYSTEM); +} + +// Zero the value at ptr (used for initialization instead of hipMemset on HSA VMem VA) +__global__ void k_zero(float* ptr) { *ptr = 0.0f; } + +// Copy one float from src to dst with a system-scope fence before the read. +// The system fence ensures writes from remote GPUs (via P2P) are visible. 
+__global__ void k_copy(float* dst, const float* src) { + __threadfence_system(); // system-scope acquire fence: see all remote writes + *dst = *src; +} + +// ============================================================================ +// Per-rank logic (called in a fresh process — no pre-fork GPU/HSA state) +// ============================================================================ + +static int run_rank(int rank, int sock, int n_iters) { + g_rank = rank; + int gpu_id = rank; + + printf("[rank %d] starting on GPU %d\n", rank, gpu_id); + fflush(stdout); + + HIP_CHECK(hipSetDevice(gpu_id)); + HSA_CHECK(hsa_init()); + + AgentList al = {}; + HSA_CHECK(hsa_iterate_agents(agent_cb, &al)); + if (al.count < 2) { + fprintf(stderr, "[rank %d] need >=2 GPU agents, found %d\n", rank, al.count); + return 1; + } + hsa_agent_t my_agent = al.agents[rank]; + + PoolSearch ps = {}; + hsa_amd_agent_iterate_memory_pools(my_agent, pool_cb, &ps); + if (!ps.found) { + fprintf(stderr, "[rank %d] no fine-grained allocatable pool\n", rank); + return 1; + } + + size_t gran = ps.granularity ? ps.granularity : (2u << 20); + size_t alloc_size = gran; + + printf("[rank %d] fine-grained pool found; granularity = %zu bytes\n", rank, gran); + fflush(stdout); + + // -- Physical memory allocation — PATH 3 KEY STEP ------------------------- + // Use hsa_amd_vmem_handle_create with the FINE-GRAINED pool. + // KFD will mark this CoarseGrain=0, enabling P2P atomic coherency. 
+ hsa_amd_vmem_alloc_handle_t my_handle; + HSA_CHECK( + hsa_amd_vmem_handle_create(ps.pool, alloc_size, MEMORY_TYPE_NONE, 0, &my_handle)); + + // -- Reserve virtual address + map ---------------------------------------- + void* my_va = nullptr; + HSA_CHECK(hsa_amd_vmem_address_reserve(&my_va, alloc_size, 0, 0)); + HSA_CHECK(hsa_amd_vmem_map(my_va, alloc_size, 0, my_handle, 0)); + + // Grant RW access to all GPU agents (both GPUs need to touch this memory) + hsa_amd_memory_access_desc_t descs[16]; + int n_descs = al.count; + for (int i = 0; i < n_descs; i++) { + descs[i].permissions = HSA_ACCESS_PERMISSION_RW; + descs[i].agent_handle = al.agents[i]; + } + HSA_CHECK(hsa_amd_vmem_set_access(my_va, alloc_size, descs, n_descs)); + + // -- Export DMA-BUF ----------------------- + int my_fd = -1; + HSA_CHECK(hsa_amd_vmem_export_shareable_handle(&my_fd, my_handle, 0)); + + // -- Exchange DMA-BUF fds with peer (rank 0 sends first) ----------------- + int peer_fd = -1; + if (rank == 0) { + send_fd(sock, my_fd); + peer_fd = recv_fd(sock); + } else { + peer_fd = recv_fd(sock); + send_fd(sock, my_fd); + } + close(my_fd); + + // -- Import peer handle + map --------------------------------------------- + hsa_amd_vmem_alloc_handle_t peer_handle; + HSA_CHECK(hsa_amd_vmem_import_shareable_handle(peer_fd, &peer_handle)); + close(peer_fd); + + void* peer_va = nullptr; + HSA_CHECK(hsa_amd_vmem_address_reserve(&peer_va, alloc_size, 0, 0)); + HSA_CHECK(hsa_amd_vmem_map(peer_va, alloc_size, 0, peer_handle, 0)); + HSA_CHECK(hsa_amd_vmem_set_access(peer_va, alloc_size, descs, n_descs)); + + printf("[rank %d] my_va=%p peer_va=%p\n", rank, my_va, peer_va); + fflush(stdout); + + // Barrier: ensure both ranks have completed VA setup before starting the loop + barrier(sock); + + // -- P2P atomic test loop -------------------------------------------------- + // Note: hipMemset and hipMemcpy(D2H) may not work with HSA VMem VAs because + // HIP does not register these pointers in its internal 
tracking tables. + // We use kernel-based init (k_zero) and kernel-based read-out (k_copy to a + // hipMalloc'd bounce buffer) instead. + float* my_ptr = (float*)my_va; + float* peer_ptr = (float*)peer_va; + + // Bounce buffer: hipMalloc so hipMemcpy(D2H) is guaranteed to work + float* bounce; + HIP_CHECK(hipMalloc(&bounce, sizeof(float))); + + // sys-scope: reliable for any fine-grained P2P (drives PASS/FAIL) + int fails_sys = 0; + // agent-scope: works on fine-grained hardware but may be intermittent + // depending on fence ordering; failures are informational only. + int fails_agent = 0; + + for (int iter = 0; iter < n_iters; iter++) { + // -- System-scope P2P atomics (Triton scope="sys") --------------------- + k_zero<<<1, 1>>>(my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_atomic_add_sys<<<1, 1>>>(my_ptr); + k_atomic_add_sys<<<1, 1>>>(peer_ptr); // P2P: add to peer's memory + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_copy<<<1, 1>>>(bounce, my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + float val = 0; + HIP_CHECK(hipMemcpy(&val, bounce, sizeof(float), hipMemcpyDeviceToHost)); + if (fabsf(val - 2.0f) > 0.1f) { + if (++fails_sys <= 5) + printf("[rank %d] iter %d sys-scope: expected 2.0, got %g\n", rank, iter, val); + } + + // -- Agent-scope P2P atomics (Triton scope="gpu") ---------------------- + // Fine-grained memory supports this, but coherency is hardware-dependent. + // Intermittent failures are possible; they do not affect PASS/FAIL. 
+ k_zero<<<1, 1>>>(my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_atomic_add_agent<<<1, 1>>>(my_ptr); + k_atomic_add_agent<<<1, 1>>>(peer_ptr); // P2P + HIP_CHECK(hipDeviceSynchronize()); + barrier(sock); + + k_copy<<<1, 1>>>(bounce, my_ptr); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipMemcpy(&val, bounce, sizeof(float), hipMemcpyDeviceToHost)); + if (fabsf(val - 2.0f) > 0.1f) { + if (++fails_agent <= 5) + printf("[rank %d] iter %d agent-scope: expected 2.0, got %g (informational)\n", + rank, iter, val); + } + } + + HIP_CHECK(hipFree(bounce)); + + printf("[rank %d] sys-scope: %d/%d failures (PASS/FAIL) " + "agent-scope: %d/%d failures (informational)\n", + rank, fails_sys, n_iters, fails_agent, n_iters); + fflush(stdout); + + // PASS/FAIL is determined by sys-scope only (always reliable on fine-grained) + int ok = (fails_sys == 0); + printf("[rank %d] %s\n", rank, ok ? "PASS" : "FAIL"); + fflush(stdout); + + // -- Cleanup --------------------------------------------------------------- + hsa_amd_vmem_unmap(peer_va, alloc_size); + hsa_amd_vmem_address_free(peer_va, alloc_size); + hsa_amd_vmem_handle_release(peer_handle); + + hsa_amd_vmem_unmap(my_va, alloc_size); + hsa_amd_vmem_address_free(my_va, alloc_size); + hsa_amd_vmem_handle_release(my_handle); + + hsa_shut_down(); + return ok ? 0 : 1; +} + +// ============================================================================ +// main — self-exec trick for fresh-process ranks +// +// Problem: hsa_init() starts internal threads; fork() doesn't duplicate them, +// so HSA APIs fail in the child after fork(). Solution: the primary process +// sets up a Unix socketpair, then fork+exec's itself as rank 1 (with rank and +// socket fd passed via environment variables). Both parent (rank 0) and child +// (rank 1) therefore start their HSA/HIP initialization from a clean state. 
+// ============================================================================ + +int main(int argc, char** argv) { + int n_iters = 200; + if (argc >= 2) n_iters = atoi(argv[1]); + if (n_iters <= 0) n_iters = 200; + + // -- Re-exec path: invoked as 1 rank ---------- + const char* rank_env = getenv("P2P_RANK"); + const char* sock_env = getenv("P2P_SOCK_FD"); + if (rank_env && sock_env) { + int rank = atoi(rank_env); + int sock_fd = atoi(sock_env); + const char* ni = getenv("P2P_NITERS"); + if (ni) n_iters = atoi(ni); + int rc = run_rank(rank, sock_fd, n_iters); + close(sock_fd); + return rc; + } + + // -- Primary path: check GPU count, spawn rank 1, run rank 0 ------------- + int ndev = 0; + (void)hipGetDeviceCount(&ndev); + if (ndev < 2) { + fprintf(stderr, "This example requires at least 2 GPU devices (found %d).\n", ndev); + return 1; + } + + printf("p2p_atomics_hsa: PATH 3 — HSA fine-grained VMem\n"); + printf(" N_ITERS=%d GPUs=%d\n\n", n_iters, ndev); + fflush(stdout); + + int sv[2]; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) != 0) { + perror("socketpair"); + return 1; + } + + // Set env vars for the re-exec'd rank 1 process. + // sv[1] is NOT FD_CLOEXEC by default, so it survives execl(). 
+ char sock_str[32], niters_str[32]; + snprintf(sock_str, sizeof(sock_str), "%d", sv[1]); + snprintf(niters_str, sizeof(niters_str), "%d", n_iters); + setenv("P2P_RANK", "1", 1); + setenv("P2P_SOCK_FD", sock_str, 1); + setenv("P2P_NITERS", niters_str, 1); + + pid_t pid = fork(); + if (pid < 0) { perror("fork"); return 1; } + + if (pid == 0) { + close(sv[0]); + // exec self — child starts fresh (no inherited HSA/HIP state) + execl("/proc/self/exe", argv[0], NULL); + perror("execl"); + _exit(127); + } + + // Parent: unset rank-specific env, close rank-1 socket end, run rank 0 + unsetenv("P2P_RANK"); + unsetenv("P2P_SOCK_FD"); + unsetenv("P2P_NITERS"); + close(sv[1]); + + int rc0 = run_rank(0, sv[0], n_iters); + close(sv[0]); + + int status = 0; + waitpid(pid, &status, 0); + int rc1 = WIFEXITED(status) ? WEXITSTATUS(status) : 1; + + if (rc0 != 0 || rc1 != 0) + printf("\nOverall: FAIL (rank0=%d rank1=%d)\n", rc0, rc1); + else + printf("\nOverall: PASS\n"); + + return (rc0 != 0 || rc1 != 0) ? 1 : 0; +} diff --git a/iris/allocators/__init__.py b/iris/allocators/__init__.py index 460c53d3..65ec0ba4 100644 --- a/iris/allocators/__init__.py +++ b/iris/allocators/__init__.py @@ -8,5 +8,6 @@ from .base import BaseAllocator from .torch_allocator import TorchAllocator from .vmem_allocator import VMemAllocator +from .hsa_vmem_allocator import HsaVMemAllocator -__all__ = ["BaseAllocator", "TorchAllocator", "VMemAllocator"] +__all__ = ["BaseAllocator", "TorchAllocator", "VMemAllocator", "HsaVMemAllocator"] diff --git a/iris/allocators/hsa_vmem_allocator.py b/iris/allocators/hsa_vmem_allocator.py new file mode 100644 index 00000000..2fcb878a --- /dev/null +++ b/iris/allocators/hsa_vmem_allocator.py @@ -0,0 +1,397 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +""" +HSA Virtual Memory allocator for Iris symmetric heap — Path 3. 
"""
HSA Virtual Memory allocator for Iris symmetric heap — Path 3.

Three paths exist for allocating GPU memory that supports cross-GPU (P2P)
atomic operations in Iris:

Path 1 — hipExtMallocWithFlags(hipDeviceMallocFinegrained):
    Fine-grained (KFD CoarseGrain=0), P2P atomics work; no VA control.
    Used by VMemAllocator (iris/allocators/vmem_allocator.py).

Path 2 — HIP Virtual Memory (hipMemCreate + hipMemAddressReserve):
    Full VA control, but HIP/CLR hardcodes the COARSE-GRAINED pool for
    hipMemCreate, so P2P atomics (scope=cta/gpu) fail. Not used in Iris.

Path 3 — HSA Virtual Memory (this allocator):
    hsa_amd_vmem_address_reserve → hsa_amd_vmem_handle_create on the
    FINE-GRAINED pool → hsa_amd_vmem_map → hsa_amd_vmem_set_access.
    Fine-grained (CoarseGrain=0) AND full VA control — the key advantage
    over Path 2 is that hsa_amd_vmem_handle_create takes an EXPLICIT pool
    argument, so we can pass the fine-grained GPU local pool.

See iris/hip.py for the full architecture diagram.
"""

import math
import struct
import torch
from typing import Dict, Optional
from threading import Lock

from .base import BaseAllocator
from ..hip import (
    hsa_init,
    hsa_shut_down,
    hsa_get_gpu_agents,
    hsa_get_fine_grained_pool,
    hsa_get_pool_granularity,
    hsa_vmem_address_reserve,
    hsa_vmem_address_free,
    hsa_vmem_handle_create,
    hsa_vmem_handle_release,
    hsa_vmem_map,
    hsa_vmem_unmap,
    hsa_vmem_set_access,
    hsa_vmem_export_shareable_handle,
    hsa_vmem_import_shareable_handle,
    hsa_amd_vmem_alloc_handle_t,
    hsa_agent_t,
)
from ..fd_passing import send_fd, recv_fd, managed_fd


class _CUDAArrayInterface:
    """Expose a raw device pointer as a 1-D uint8 buffer via
    ``__cuda_array_interface__`` so ``torch.as_tensor`` can adopt it
    without copying. Hoisted to module level so ``allocate`` does not
    re-declare the class on every call."""

    def __init__(self, ptr: int, size_bytes: int):
        self.ptr = ptr
        self.size_bytes = size_bytes

    @property
    def __cuda_array_interface__(self):
        return {
            "shape": (self.size_bytes,),
            "typestr": "|u1",
            "data": (self.ptr, False),
            "version": 3,
        }


class HsaVMemAllocator(BaseAllocator):
    """
    HSA Virtual Memory allocator using fine-grained GPU local memory (Path 3).

    Combines the VA control of VMem with fine-grained memory for correct P2P
    atomic operations. Uses HSA APIs directly instead of HIP VMem APIs to
    explicitly choose the fine-grained GPU memory pool.

    Key advantage over HIP VMem (Path 2):
    - hsa_amd_vmem_handle_create takes an explicit pool argument
    - We pass the fine-grained pool → KFD allocates with CoarseGrain=0
    - P2P atomics (scope=cta/gpu/sys) work correctly

    Key advantage over Path 1 (malloc_fine_grained):
    - Full virtual address space control
    - Can reserve a large VA range and map individual segments on demand

    Args:
        heap_size: Total size of the heap in bytes
        device_id: GPU device ID
        rank: Current rank ID
        world_size: Total number of ranks
    """

    def __init__(
        self,
        heap_size: int,
        device_id: int,
        rank: int,
        world_size: int,
    ):
        super().__init__(heap_size, device_id, rank, world_size)
        self.device = torch.device(f"cuda:{device_id}")
        self.lock = Lock()

        # Initialize HSA runtime
        hsa_init()

        # Find this device's GPU agent and fine-grained pool
        all_agents = hsa_get_gpu_agents()
        if len(all_agents) <= device_id:
            hsa_shut_down()
            raise RuntimeError(f"Not enough GPU agents: device_id={device_id}, found {len(all_agents)} agents")
        self._agent: hsa_agent_t = all_agents[device_id]
        self._all_agents = all_agents
        self._fine_pool = hsa_get_fine_grained_pool(self._agent)

        # Two granularities:
        #   _pool_granularity: required alignment for hsa_vmem_handle_create (typically 2MB)
        #   granularity: per-tensor allocation alignment within the pre-mapped heap (4KB, same as HIP)
        # Individual tensor allocations do NOT need to be 2MB-aligned; only the physical
        # handle creation requires pool granularity alignment.
        self._pool_granularity = hsa_get_pool_granularity(self._fine_pool)
        self.granularity = 4096  # 4KB per-tensor allocation alignment (same as HIP VMem)

        # Align heap size to pool granularity (required by hsa_vmem_handle_create)
        self.aligned_heap_size = (heap_size + self._pool_granularity - 1) & ~(self._pool_granularity - 1)

        # Reserve the virtual address space upfront
        self.base_va = hsa_vmem_address_reserve(self.aligned_heap_size)

        # Allocate and map the entire heap as a single fine-grained physical block.
        # We could map on demand, but a single upfront allocation is simpler and
        # matches the behavior of VMemAllocator (Path 1).
        self._heap_handle: hsa_amd_vmem_alloc_handle_t = hsa_vmem_handle_create(self._fine_pool, self.aligned_heap_size)
        hsa_vmem_map(self.base_va, self.aligned_heap_size, self._heap_handle)

        # Set read/write access for all GPU agents in the system.
        # This allows all GPUs to access the memory for P2P atomics.
        hsa_vmem_set_access(self.base_va, self.aligned_heap_size, self._all_agents)

        self._peer_handles: Dict[int, hsa_amd_vmem_alloc_handle_t] = {}
        self._peer_vas: Dict[int, int] = {}
        # Peer mapped sizes (aligned to pool granularity) so close() can unmap
        # each peer range with the exact size it was mapped with.
        self._peer_sizes: Dict[int, int] = {}
        self.heap_bases_array = None
        self._closed = False

    def get_base_address(self) -> int:
        """Get the base virtual address of the heap."""
        return self.base_va

    def get_minimum_allocation_size(self) -> int:
        """Minimum allocation size (one granule for HSA VMem compatibility)."""
        return self.granularity

    def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024) -> torch.Tensor:
        """
        Allocate a tensor from the fine-grained HSA VMem heap using bump allocation.

        Args:
            num_elements: Number of elements to allocate
            dtype: PyTorch data type
            alignment: Alignment requirement in bytes

        Returns:
            PyTorch tensor wrapping the allocated memory

        Raises:
            RuntimeError: If the heap is out of space
        """
        with self.lock:
            element_size = torch.tensor([], dtype=dtype).element_size()
            size_in_bytes = num_elements * element_size
            aligned_size = math.ceil(size_in_bytes / alignment) * alignment

            # FIX: round the start offset up to this call's alignment. Earlier
            # allocations may have used a different alignment, so heap_offset is
            # not guaranteed to already satisfy the current request.
            start = math.ceil(self.heap_offset / alignment) * alignment

            if start + aligned_size > self.aligned_heap_size:
                raise RuntimeError(
                    f"HsaVMemAllocator: out of space for allocation of {aligned_size} bytes "
                    f"at offset {start} (heap size {self.aligned_heap_size})"
                )

            self.heap_offset = start + aligned_size
            target_va = self.base_va + start

            cuda_array = _CUDAArrayInterface(target_va, size_in_bytes)
            tensor_bytes = torch.as_tensor(cuda_array, device=self.device)
            full = tensor_bytes.view(dtype)
            if num_elements == 0:
                # FIX: narrow(0, 1, 0) raises on a zero-element tensor (start 1
                # exceeds dim size 0); return a plain empty slice instead.
                return full.narrow(0, 0, 0)
            return full.narrow(0, 0, num_elements)

    def get_shareable_handle(self) -> tuple:
        """
        Export the heap's physical memory as a shareable DMA-BUF handle.

        Returns:
            tuple: (fd, base_ptr, base_size) where fd is the DMA-BUF file
                descriptor and base_ptr/base_size describe the exported range

        Raises:
            RuntimeError: If export fails
        """
        fd = hsa_vmem_export_shareable_handle(self._heap_handle)
        return fd, self.base_va, self.aligned_heap_size

    def establish_peer_access(self, all_bases: Dict[int, int], connections: Optional[Dict] = None):
        """
        Establish HSA VMem access to peer memory for symmetric addressing.

        Exchanges DMA-BUF handles with peer ranks, imports peer handles, and
        maps them into a reserved VA range. The mapped_ptr for each peer is used
        as heap_bases[peer] for address translation in Triton kernels.

        Args:
            all_bases: Dictionary mapping rank -> base address
            connections: Optional peer connections (Unix sockets) for FD exchange
        """
        import numpy as np

        heap_bases_array = np.zeros(self.num_ranks, dtype=np.uint64)

        if connections is not None:
            # HsaVMemAllocator uses a fixed pre-allocated heap with stable physical
            # memory. Once peer access is established, the mappings remain valid
            # for the allocator's lifetime — skip re-creation if already set up.
            if self._peer_handles:
                # Already established — just repopulate heap_bases_array from existing VAs
                for peer, peer_va in self._peer_vas.items():
                    heap_bases_array[peer] = peer_va
                heap_bases_array[self.cur_rank] = all_bases[self.cur_rank]
                self.heap_bases_array = heap_bases_array
                return

            my_fd, my_base, my_size = self.get_shareable_handle()
            heap_base = self.get_base_address()
            my_metadata = struct.pack("QQQ", my_base, my_size, heap_base)

            with managed_fd(my_fd):
                for peer, sock in connections.items():
                    if peer == self.cur_rank:
                        continue

                    # Higher rank sends first to avoid deadlock
                    if self.cur_rank > peer:
                        send_fd(sock, my_fd, payload=my_metadata)
                        peer_handle_fd, peer_metadata = recv_fd(sock, payload_size=24)
                    else:
                        peer_handle_fd, peer_metadata = recv_fd(sock, payload_size=24)
                        send_fd(sock, my_fd, payload=my_metadata)

                    peer_base, peer_size, peer_heap_base = struct.unpack("QQQ", peer_metadata)

                    # Import peer's handle from DMA-BUF and map it to our VA space
                    with managed_fd(peer_handle_fd):
                        imported_handle = hsa_vmem_import_shareable_handle(peer_handle_fd)

                    # Reserve a new VA range for the peer's memory
                    peer_va = hsa_vmem_address_reserve(self.aligned_heap_size)
                    # Align to pool granularity (required by hsa_vmem_map)
                    peer_alloc_size = (peer_size + self._pool_granularity - 1) & ~(self._pool_granularity - 1)
                    hsa_vmem_map(peer_va, peer_alloc_size, imported_handle)
                    hsa_vmem_set_access(peer_va, peer_alloc_size, self._all_agents)

                    self._peer_handles[peer] = imported_handle
                    self._peer_vas[peer] = peer_va
                    # FIX: remember the peer's mapped size for symmetric unmap
                    self._peer_sizes[peer] = peer_alloc_size
                    heap_bases_array[peer] = peer_va

            heap_bases_array[self.cur_rank] = all_bases[self.cur_rank]
        else:
            heap_bases_array[self.cur_rank] = all_bases[self.cur_rank]

        self.heap_bases_array = heap_bases_array

    def get_device(self) -> torch.device:
        """Get the PyTorch device for this allocator."""
        return self.device

    def owns_tensor(self, tensor: torch.Tensor) -> bool:
        """
        Check if a tensor's memory belongs to this allocator's heap.

        Args:
            tensor: Tensor to check

        Returns:
            True if tensor is within this allocator's heap
        """
        if not tensor.is_cuda:
            return False
        if tensor.numel() == 0:
            return True
        ptr = tensor.data_ptr()
        return self.base_va <= ptr < self.base_va + self.aligned_heap_size

    def import_external_tensor(self, external_tensor: torch.Tensor) -> torch.Tensor:
        """
        Import an external PyTorch tensor into the symmetric heap (as_symmetric).

        Uses copy semantics: allocates space on the fine-grained symmetric heap
        and copies the data from the external tensor. The returned tensor is
        independent of the input tensor.

        Args:
            external_tensor: External PyTorch tensor to import (must be CUDA, contiguous)

        Returns:
            New tensor on the symmetric heap with a copy of the external tensor's data

        Raises:
            RuntimeError: If tensor is not a CUDA tensor or not contiguous
        """
        if not external_tensor.is_cuda:
            raise RuntimeError("Can only import CUDA tensors")
        if not external_tensor.is_contiguous():
            raise RuntimeError("Only contiguous tensors can be imported; call .contiguous() before as_symmetric()")
        num_elements = external_tensor.numel()
        dtype = external_tensor.dtype
        shape = external_tensor.shape
        heap_tensor = self.allocate(num_elements, dtype)
        heap_tensor = heap_tensor.reshape(shape).copy_(external_tensor)
        return heap_tensor

    def close(self):
        """Release all HSA VMem resources."""
        if self._closed:
            return
        self._closed = True

        # Release peer mappings
        for peer, peer_handle in self._peer_handles.items():
            peer_va = self._peer_vas.get(peer)
            # FIX: unmap with the size the peer range was actually mapped with
            # (may differ from our own heap size); the VA reservation itself
            # was made with aligned_heap_size, so free with that.
            peer_mapped_size = self._peer_sizes.get(peer, self.aligned_heap_size)
            if peer_va:
                try:
                    hsa_vmem_unmap(peer_va, peer_mapped_size)
                except Exception:
                    pass
                try:
                    hsa_vmem_address_free(peer_va, self.aligned_heap_size)
                except Exception:
                    pass
            try:
                hsa_vmem_handle_release(peer_handle)
            except Exception:
                pass
        self._peer_handles.clear()
        self._peer_vas.clear()
        self._peer_sizes.clear()

        # Release local heap
        if hasattr(self, "_heap_handle"):
            try:
                hsa_vmem_unmap(self.base_va, self.aligned_heap_size)
            except Exception:
                pass
            try:
                hsa_vmem_handle_release(self._heap_handle)
            except Exception:
                pass

        if hasattr(self, "base_va") and self.base_va:
            try:
                hsa_vmem_address_free(self.base_va, self.aligned_heap_size)
            except Exception:
                pass
            self.base_va = 0

        try:
            hsa_shut_down()
        except Exception:
            pass

    def __del__(self):
        """Cleanup HSA VMem resources on deletion."""
        self.close()
""" +import math +import struct import torch -import os -from typing import Dict +from typing import Any, Dict, Optional from threading import Lock from .base import BaseAllocator from ..hip import ( get_allocation_granularity, - get_address_range, + malloc_fine_grained, + hip_free, export_dmabuf_handle, - mem_import_from_shareable_handle, - mem_create, - mem_address_reserve, - mem_map, - mem_unmap, - mem_address_free, - mem_release, - mem_set_access, - hipMemAccessDesc, - hipMemLocationTypeDevice, - hipMemAccessFlagsProtReadWrite, + import_dmabuf_handle, + destroy_external_memory, ) +from ..fd_passing import send_fd, recv_fd, managed_fd class VMemAllocator(BaseAllocator): """ - Virtual Memory allocator using HIP's VMem APIs. + Fine-grained memory allocator for Iris symmetric heap. - Features: - - Reserve large virtual address (VA) space upfront - - Map physical memory on demand - - Support memory oversubscription - - Fine-grained control over allocations + Uses hipExtMallocWithFlags with hipDeviceMallocFinegrained for all physical + memory, which ensures correct P2P atomic operations (scope=cta/gpu) across GPUs. + + hipMemCreate (used in the previous VMem approach) creates coarse-grained memory + that causes intermittent failures for cross-GPU atomics. This allocator fixes + that by using fine-grained memory throughout. 
Args: heap_size: Total size of the heap in bytes - device: PyTorch device (e.g., "cuda:0") + device_id: GPU device ID rank: Current rank ID world_size: Total number of ranks - va_multiplier: VA space multiplier (reserve more VA than physical) """ def __init__( @@ -56,74 +52,34 @@ def __init__( device_id: int, rank: int, world_size: int, - va_multiplier: float = 1.0, ): super().__init__(heap_size, device_id, rank, world_size) - self.va_multiplier = va_multiplier self.device = torch.device(f"cuda:{device_id}") self.lock = Lock() + # Keep granularity for alignment and compatibility with existing tests self.granularity = get_allocation_granularity(self.device_id) self.aligned_heap_size = (heap_size + self.granularity - 1) & ~(self.granularity - 1) - self.va_size = self.aligned_heap_size - self.base_va = mem_address_reserve(self.va_size, self.granularity, 0) - - self.minimal_size = min(2 << 20, self.aligned_heap_size // 2) - if self.minimal_size < self.granularity: - self.minimal_size = self.granularity - - self.minimal_handle = mem_create(self.minimal_size, self.device_id) - mem_map(self.base_va, self.minimal_size, 0, self.minimal_handle) - - # ROCm: mem_set_access must be called cumulatively from base_va (see rocm-systems#2667) - self.access_descs = [] - for peer_device_id in range(world_size): - desc = hipMemAccessDesc() - desc.location.type = hipMemLocationTypeDevice - desc.location.id = peer_device_id - desc.flags = hipMemAccessFlagsProtReadWrite - self.access_descs.append(desc) - self.cumulative_mapped_size = self.minimal_size - mem_set_access(self.base_va, self.cumulative_mapped_size, self.access_descs) + # Allocate the entire heap upfront as a single fine-grained block. + # Fine-grained (hipExtMallocWithFlags / hipDeviceMallocFinegrained) memory + # is required for correct cross-GPU atomic operations. 
+ self._alloc_ptr = malloc_fine_grained(self.aligned_heap_size) + self.base_va = self._alloc_ptr.value - self.allocations: Dict[int, tuple] = {} - self.allocation_order = [] - self._track_allocation(0, self.minimal_size, False, self.minimal_handle, self.base_va) - self.current_offset = self.minimal_size - - self.world_size = world_size + self._peer_ext_mem_handles: Dict[int, Any] = {} + self.heap_bases_array = None def get_base_address(self) -> int: """Get the base address of the heap.""" return self.base_va - def _track_allocation(self, offset: int, size: int, is_imported: bool, handle, va: int): - """Track a new allocation for cleanup and segmented export.""" - self.allocations[offset] = (size, is_imported, handle, va) - self.allocation_order.append((offset, size)) - - def get_allocation_segments(self): - """ - Get list of allocation segments for segmented DMA-BUF export. - - Returns: - List of (offset, size, va) tuples for each allocation in order. - Each tuple describes one physically-backed segment that needs - to be exported/imported separately. - """ - segments = [] - for offset, size in self.allocation_order: - va = self.base_va + offset - segments.append((offset, size, va)) - return segments - def get_minimum_allocation_size(self) -> int: - """Minimum allocation size in bytes (one granule; hipMemCreate(0) is invalid).""" + """Minimum allocation size in bytes (one granule for alignment compatibility).""" return self.granularity def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024) -> torch.Tensor: """ - Allocate memory from the VMem heap. + Allocate a tensor from the fine-grained heap using bump allocation. 
Args: num_elements: Number of elements to allocate @@ -138,33 +94,21 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024) """ with self.lock: element_size = torch.tensor([], dtype=dtype).element_size() - size_bytes = num_elements * element_size - actual_size_bytes = max(size_bytes, self.get_minimum_allocation_size()) - aligned_size = (actual_size_bytes + self.granularity - 1) & ~(self.granularity - 1) - aligned_offset = (self.current_offset + alignment - 1) & ~(alignment - 1) + size_in_bytes = num_elements * element_size + aligned_size = math.ceil(size_in_bytes / alignment) * alignment - if aligned_offset + aligned_size > self.aligned_heap_size: + if self.heap_offset + aligned_size > self.aligned_heap_size: raise RuntimeError( - f"Out of VMem address space for allocation: " - f"need {aligned_size} bytes at offset {aligned_offset}, " + f"Out of VMem heap space for allocation: " + f"need {aligned_size} bytes at offset {self.heap_offset}, " f"but heap size is {self.aligned_heap_size}. 
" - f"Current offset: {self.current_offset}, " - f"available: {self.aligned_heap_size - aligned_offset} bytes" + f"available: {self.aligned_heap_size - self.heap_offset} bytes" ) - target_va = self.base_va + aligned_offset - handle = mem_create(aligned_size, self.device_id) - mem_map(target_va, aligned_size, 0, handle) + start = self.heap_offset + self.heap_offset += aligned_size - new_cumulative_size = aligned_offset + aligned_size - if new_cumulative_size > self.cumulative_mapped_size: - self.cumulative_mapped_size = new_cumulative_size - mem_set_access(self.base_va, self.cumulative_mapped_size, self.access_descs) - - self._track_allocation(aligned_offset, aligned_size, False, handle, target_va) - self.current_offset = aligned_offset + aligned_size - - interface_size = (aligned_size // element_size) * element_size + target_va = self.base_va + start class CUDAArrayInterface: def __init__(self, ptr, size_bytes, device): @@ -181,14 +125,76 @@ def __cuda_array_interface__(self): "version": 3, } - cuda_array = CUDAArrayInterface(target_va, interface_size, self.device) + cuda_array = CUDAArrayInterface(target_va, size_in_bytes, self.device) tensor_bytes = torch.as_tensor(cuda_array, device=self.device) full = tensor_bytes.view(dtype) if num_elements == 0: - tensor = full.narrow(0, 1, 0) - else: - tensor = full.narrow(0, 0, num_elements) - return tensor + return full.narrow(0, 1, 0) + return full.narrow(0, 0, num_elements) + + def get_shareable_handle(self) -> tuple: + """ + Get a shareable DMA-BUF handle for the heap. + + Returns: + tuple: (fd, base_ptr, base_size) from export_dmabuf_handle + """ + return export_dmabuf_handle(self.base_va, self.aligned_heap_size) + + def establish_peer_access(self, all_bases: Dict[int, int], connections: Optional[Dict] = None): + """ + Establish fine-grained access to peer memory for symmetric addressing. 
+ + Uses hipImportExternalMemory (import_dmabuf_handle) which preserves the + fine-grained memory type, ensuring correct cross-GPU atomic operations. + + Args: + all_bases: Dictionary mapping rank -> base address + connections: Optional peer connections for handle exchange + """ + import numpy as np + + heap_bases_array = np.zeros(self.num_ranks, dtype=np.uint64) + + if connections is not None: + for handle in self._peer_ext_mem_handles.values(): + try: + destroy_external_memory(handle) + except Exception: + pass + self._peer_ext_mem_handles.clear() + + my_fd, my_base, my_size = self.get_shareable_handle() + heap_base = self.get_base_address() + my_metadata = struct.pack("QQQ", my_base, my_size, heap_base) + + with managed_fd(my_fd): + for peer, sock in connections.items(): + if peer == self.cur_rank: + continue + + # Higher rank sends first to avoid deadlock + if self.cur_rank > peer: + send_fd(sock, my_fd, payload=my_metadata) + peer_handle, peer_metadata = recv_fd(sock, payload_size=24) + else: + peer_handle, peer_metadata = recv_fd(sock, payload_size=24) + send_fd(sock, my_fd, payload=my_metadata) + + peer_base, peer_size, peer_heap_base = struct.unpack("QQQ", peer_metadata) + + with managed_fd(peer_handle): + mapped_ptr, ext_mem_handle = import_dmabuf_handle( + peer_handle, peer_size, peer_heap_base, peer_base + ) + heap_bases_array[peer] = mapped_ptr + self._peer_ext_mem_handles[peer] = ext_mem_handle + + heap_bases_array[self.cur_rank] = all_bases[self.cur_rank] + else: + heap_bases_array[self.cur_rank] = all_bases[self.cur_rank] + + self.heap_bases_array = heap_bases_array def get_device(self) -> torch.device: """ @@ -221,103 +227,53 @@ def import_external_tensor(self, external_tensor: torch.Tensor) -> torch.Tensor: """ Import an external PyTorch tensor into the symmetric heap (as_symmetric). - This creates a view into the symmetric heap that shares physical memory - with the external tensor, handling PyTorch caching allocator offsets. 
+ Allocates space on the fine-grained symmetric heap and copies the data + from the external tensor. The returned tensor resides on the symmetric heap + and can be used in RMA operations across ranks. + + Note: Unlike the previous VMem implementation, the returned tensor does not + share physical memory with the original. Modifications to one are not visible + in the other. This is the same semantics as TorchAllocator. Args: - external_tensor: External PyTorch tensor to import + external_tensor: External PyTorch tensor to import (must be CUDA, contiguous) Returns: - New tensor view in symmetric heap that shares memory with external tensor + New tensor on the symmetric heap with a copy of the external tensor's data Raises: - RuntimeError: If import fails or tensor is not contiguous + RuntimeError: If tensor is not a CUDA tensor or not contiguous """ - - with self.lock: - if not external_tensor.is_cuda: - raise RuntimeError("Can only import CUDA tensors") - if not external_tensor.is_contiguous(): - raise RuntimeError("Only contiguous tensors can be imported; call .contiguous() before as_symmetric()") - - external_ptr = external_tensor.data_ptr() - alloc_base, alloc_size = get_address_range(external_ptr) - offset_in_alloc = external_ptr - alloc_base - aligned_size = (alloc_size + self.granularity - 1) & ~(self.granularity - 1) - aligned_offset = (self.current_offset + self.granularity - 1) & ~(self.granularity - 1) - - if aligned_offset + aligned_size > self.aligned_heap_size: - raise RuntimeError( - f"Out of VMem address space for import: " - f"need {aligned_size} bytes at offset {aligned_offset}, " - f"but heap size is {self.aligned_heap_size}. 
" - f"Current offset: {self.current_offset}, " - f"available: {self.aligned_heap_size - aligned_offset} bytes" - ) - - dmabuf_fd, export_base, export_size = export_dmabuf_handle(alloc_base, alloc_size) - aligned_export_size = (export_size + self.granularity - 1) & ~(self.granularity - 1) - target_va = self.base_va + aligned_offset - imported_handle = mem_import_from_shareable_handle(dmabuf_fd) - os.close(dmabuf_fd) - - mem_map(target_va, aligned_export_size, 0, imported_handle) - - new_cumulative_size = aligned_offset + aligned_export_size - if new_cumulative_size > self.cumulative_mapped_size: - self.cumulative_mapped_size = new_cumulative_size - mem_set_access(self.base_va, self.cumulative_mapped_size, self.access_descs) - - tensor_va = target_va + offset_in_alloc - self._track_allocation(aligned_offset, aligned_export_size, True, imported_handle, target_va) - self.current_offset = aligned_offset + aligned_export_size - - tensor_size = external_tensor.numel() * external_tensor.element_size() - - class CUDAArrayInterface: - def __init__(self, ptr, size_bytes, device): - self.ptr = ptr - self.size_bytes = size_bytes - self.device = device - - @property - def __cuda_array_interface__(self): - return { - "shape": (self.size_bytes,), - "typestr": "|u1", - "data": (self.ptr, False), - "version": 3, - } - - cuda_array = CUDAArrayInterface(tensor_va, tensor_size, self.device) - tensor_bytes = torch.as_tensor(cuda_array, device=self.device) - imported_tensor = tensor_bytes.view(external_tensor.dtype).reshape(external_tensor.shape) - - return imported_tensor + if not external_tensor.is_cuda: + raise RuntimeError("Can only import CUDA tensors") + if not external_tensor.is_contiguous(): + raise RuntimeError("Only contiguous tensors can be imported; call .contiguous() before as_symmetric()") + num_elements = external_tensor.numel() + dtype = external_tensor.dtype + shape = external_tensor.shape + heap_tensor = self.allocate(num_elements, dtype) + heap_tensor = 
heap_tensor.reshape(shape).copy_(external_tensor) + return heap_tensor def close(self): - """Explicitly release VMem resources.""" + """Explicitly release fine-grained memory resources.""" if hasattr(self, "_closed") and self._closed: return - with self.lock: - for offset, alloc_info in self.allocations.items(): - if len(alloc_info) == 4: - size, is_imported, handle, va = alloc_info - - if handle is not None: - aligned_size = (size + self.granularity - 1) & ~(self.granularity - 1) - mem_unmap(va, aligned_size) - mem_release(handle) - - self.allocations.clear() + for handle in self._peer_ext_mem_handles.values(): + try: + destroy_external_memory(handle) + except Exception: + pass + self._peer_ext_mem_handles.clear() - if hasattr(self, "base_va") and self.base_va: - mem_address_free(self.base_va, self.va_size) - self.base_va = 0 + if hasattr(self, "_alloc_ptr") and self._alloc_ptr is not None: + hip_free(self._alloc_ptr) + self._alloc_ptr = None + self.base_va = 0 - self._closed = True + self._closed = True def __del__(self): - """Cleanup VMem resources on deletion.""" + """Cleanup fine-grained memory resources on deletion.""" self.close() diff --git a/iris/hip.py b/iris/hip.py index e6dc598d..6be6bd7e 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -477,12 +477,98 @@ def get_address_range(ptr): return base_ptr.value, size.value +# ============================================================================ +# GPU Memory Paths for P2P Atomic Operations — Architecture Overview +# ============================================================================ +# +# For correct cross-GPU (peer-to-peer, P2P) atomic operations in Triton/AMDGPU, +# the physical memory backing the symmetric heap must be **fine-grained**. 
+# This module supports three distinct paths, each with different trade-offs: +# +# ┌─────────────────────────────────────────────────────────────────────────┐ +# │ PATH 1: hipExtMallocWithFlags (Fine-Grained malloc) │ +# │ │ +# │ API: hipExtMallocWithFlags(ptr, size, hipDeviceMallocFinegrained) │ +# │ HSA: hsa_amd_memory_pool_allocate on fine-grained device pool │ +# │ KFD: hsaKmtAllocMemory with CoarseGrain=0 │ +# │ │ +# │ + Fine-grained → P2P atomics (scope=cta/gpu/sys) work correctly │ +# │ + Simple single call, no VA management needed │ +# │ - Physical address is KFD-assigned; no control over virtual layout │ +# │ - Heap is a single contiguous region; can't interleave with imports │ +# │ │ +# │ Used by: VMemAllocator (current), TorchAllocator │ +# └─────────────────────────────────────────────────────────────────────────┘ +# +# ┌─────────────────────────────────────────────────────────────────────────┐ +# │ PATH 2: HIP Virtual Memory APIs (hipMemCreate + hipMemAddressReserve) │ +# │ │ +# │ APIs: hipMemAddressReserve → hipMemCreate → hipMemMap → │ +# │ hipMemSetAccess │ +# │ HIP→CLR: amd::SvmBuffer::malloc with ROCCLR_MEM_PHYMEM │ +# │ CLR→HSA: hsa_amd_vmem_handle_create on COARSE-GRAINED device pool │ +# │ KFD: hsaKmtAllocMemory with CoarseGrain=1, NoAddress=1 │ +# │ │ +# │ + Full virtual address space control (reserve large VA, map segments) │ +# │ + Can map and unmap independently; supports oversubscription │ +# │ - ALWAYS coarse-grained → P2P atomics (scope=cta/gpu) FAIL │ +# │ (HIP hardcodes the coarse-grained GPU pool for hipMemCreate) │ +# │ │ +# │ Used by: Legacy VMemAllocator (removed; replaced by Path 1 or 3) │ +# └─────────────────────────────────────────────────────────────────────────┘ +# +# ┌─────────────────────────────────────────────────────────────────────────┐ +# │ PATH 3: HSA Virtual Memory APIs (direct, fine-grained pool) │ +# │ │ +# │ APIs: hsa_amd_vmem_address_reserve → hsa_amd_vmem_handle_create on │ +# │ FINE-GRAINED device pool → 
hsa_amd_vmem_map → │ +# │ hsa_amd_vmem_set_access │ +# │ KFD: hsaKmtAllocMemory with CoarseGrain=0 (from fine-grained pool), │ +# │ NoAddress=1 (physical-only handle) │ +# │ │ +# │ + Fine-grained → P2P atomics (scope=cta/gpu/sys) work correctly │ +# │ + Full virtual address space control (same as Path 2) │ +# │ + Can map segments with different physical backing independently │ +# │ - Requires enumerating HSA agents and pools at init time │ +# │ - More complex setup (iterate agents, find fine-grained GPU pool) │ +# │ │ +# │ The key difference from Path 2: we call hsa_amd_vmem_handle_create │ +# │ with the fine-grained GPU pool (global_flags & FINE_GRAINED ≠ 0) │ +# │ instead of letting HIP choose the coarse-grained pool. │ +# │ │ +# │ Used by: HsaVMemAllocator (iris/allocators/hsa_vmem_allocator.py) │ +# └─────────────────────────────────────────────────────────────────────────┘ +# +# Stack diagram (HIP eventually calls HSA, which calls KFD): +# +# hipExtMallocWithFlags │ hipMemCreate │ hsa_amd_vmem_handle_create +# (Path 1) │ (Path 2) │ (Path 3, direct) +# │ │ │ │ │ +# ▼ │ ▼ │ ▼ +# hsa_amd_memory_pool_ │ SvmBuffer::malloc │ hsa_amd_vmem_handle_create +# allocate (fine pool) │ ROCCLR_MEM_PHYMEM │ (caller chooses pool type!) +# │ │ │ │ │ +# │ │ ▼ │ │ +# │ │ hsa_amd_vmem_ │ │ +# │ │ handle_create │ │ +# │ │ (coarse pool!) 
│ │ +# │ │ │ │ │ +# └────────────┼───────┘ │ │ +# │ └──────────────┘ +# │ HSA Runtime (libhsa-runtime64.so) +# ▼ +# KFD Driver (hsaKmtAllocMemory) +# │ +# ▼ +# AMDGPU DRM Driver (amdgpu_cs_ioctl) +# # ============================================================================ # HIP Virtual Memory (VMem) Management APIs # ============================================================================ # Constants for VMem APIs hipMemAllocationTypePinned = 0x1 +hipMemAllocationTypeUncached = 0x40000000 # AMD ROCm extension; may not be supported by hipMemCreate hipMemHandleTypePosixFileDescriptor = 0x1 hipMemLocationTypeDevice = 0x1 hipMemAllocationGranularityRecommended = 0x1 @@ -563,13 +649,17 @@ def get_allocation_granularity(device_id): return granularity.value -def mem_create(size, device_id): +def mem_create(size, device_id, alloc_type=None): """ Create a physical memory allocation. Args: size: Size in bytes (should be aligned to granularity) device_id: Device ID + alloc_type: hipMemAllocationType constant (default: hipMemAllocationTypePinned). + Pass hipMemAllocationTypeUncached (0x40000000) to request uncached + (fine-grained) physical memory — this is an AMD ROCm extension and may + return hipErrorNotSupported on some driver versions. 
Returns: hipMemGenericAllocationHandle_t handle @@ -580,8 +670,11 @@ def mem_create(size, device_id): if not _is_amd_backend: raise RuntimeError("VMem only supported on AMD/HIP backend") + if alloc_type is None: + alloc_type = hipMemAllocationTypePinned + prop = hipMemAllocationProp() - prop.type = hipMemAllocationTypePinned + prop.type = alloc_type prop.location.type = hipMemLocationTypeDevice prop.location.id = device_id prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor @@ -837,3 +930,556 @@ def mem_set_access(ptr, size, desc_or_list): gpu_runtime.hipMemSetAccess.restype = ctypes.c_int gpu_try(gpu_runtime.hipMemSetAccess(ctypes.c_void_p(ptr), size, desc_array, count)) + + +# ============================================================================ +# HSA Virtual Memory (VMem) Management APIs — Path 3 (fine-grained VMem) +# +# These APIs provide direct access to the HSA runtime, bypassing HIP/CLR. +# The key advantage over HIP VMem (Path 2) is that HSA allows choosing +# any memory pool for hsa_amd_vmem_handle_create, including fine-grained +# GPU local pools, which is required for correct P2P atomic operations. +# +# HIP's hipMemCreate internally always uses the coarse-grained GPU pool, +# making it impossible to get fine-grained VMem via the HIP API layer. +# By going directly to HSA, we can enumerate GPU memory pools and select +# the fine-grained pool (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED). 
+# ============================================================================ + +# Lazy-loaded HSA runtime handle +_hsa_runtime = None + + +def _get_hsa_runtime(): + """Load and return the HSA runtime library handle.""" + global _hsa_runtime + if _hsa_runtime is None: + _hsa_runtime = ctypes.cdll.LoadLibrary("libhsa-runtime64.so") + return _hsa_runtime + + +def _hsa_try(status, fn_name="HSA"): + """Check HSA status code and raise RuntimeError on failure.""" + if status != 0: # HSA_STATUS_SUCCESS = 0 + rt = _get_hsa_runtime() + rt.hsa_status_string.restype = ctypes.c_int + rt.hsa_status_string.argtypes = [ctypes.c_uint32, ctypes.POINTER(ctypes.c_char_p)] + msg_ptr = ctypes.c_char_p() + rt.hsa_status_string(ctypes.c_uint32(status), ctypes.byref(msg_ptr)) + msg = msg_ptr.value.decode("utf-8") if msg_ptr.value else f"error code {status:#x}" + raise RuntimeError(f"HSA {fn_name} error: {msg}") + + +# HSA status constants +HSA_STATUS_SUCCESS = 0 +HSA_STATUS_INFO_BREAK = 0x1 # Non-error: stop iteration + +# HSA device type constants +HSA_DEVICE_TYPE_GPU = 1 + +# HSA agent info enum values +HSA_AGENT_INFO_DEVICE = 17 # hsa_device_type_t + +# HSA memory pool info enum values +HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1 # uint32_t bitmask +HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5 # bool +HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6 # size_t +HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE = 18 # size_t + +# HSA memory pool global flags (bitmask) +HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2 +HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4 +HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8 + +# HSA memory type for vmem handle creation +HSA_AMD_MEMORY_TYPE_NONE = 0 +HSA_AMD_MEMORY_TYPE_PINNED = 1 + +# HSA access permissions +HSA_ACCESS_PERMISSION_RW = 3 + +# HSA VMem address reserve flags +HSA_AMD_VMEM_ADDRESS_NO_REGISTER = 1 << 0 + + +class hsa_agent_t(ctypes.Structure): + """Opaque HSA agent handle (uint64_t).""" + + _fields_ = 
[("handle", ctypes.c_uint64)] + + +class hsa_amd_memory_pool_t(ctypes.Structure): + """Opaque HSA memory pool handle (uint64_t).""" + + _fields_ = [("handle", ctypes.c_uint64)] + + +class hsa_amd_vmem_alloc_handle_t(ctypes.Structure): + """Opaque HSA VMem allocation handle (uint64_t).""" + + _fields_ = [("handle", ctypes.c_uint64)] + + +class hsa_amd_memory_access_desc_t(ctypes.Structure): + """HSA memory access descriptor for hsa_amd_vmem_set_access.""" + + _fields_ = [ + ("permissions", ctypes.c_uint32), # hsa_access_permission_t + ("agent_handle", hsa_agent_t), + ] + + +def hsa_init(): + """ + Initialize the HSA runtime. + + Must be called before any other HSA functions. The HSA runtime maintains a + reference count — each hsa_init() must be paired with hsa_shut_down(). + + Raises: + RuntimeError: If initialization fails + """ + rt = _get_hsa_runtime() + rt.hsa_init.argtypes = [] + rt.hsa_init.restype = ctypes.c_uint32 + _hsa_try(rt.hsa_init(), "hsa_init") + + +def hsa_shut_down(): + """ + Shut down the HSA runtime. + + Decrements the reference count. Runtime is fully released when count hits 0. + + Raises: + RuntimeError: If shutdown fails + """ + rt = _get_hsa_runtime() + rt.hsa_shut_down.argtypes = [] + rt.hsa_shut_down.restype = ctypes.c_uint32 + _hsa_try(rt.hsa_shut_down(), "hsa_shut_down") + + +def hsa_get_gpu_agents(): + """ + Enumerate all GPU agents in the system. + + Returns: + List of hsa_agent_t handles for all GPU agents (one per GPU device). 
+ + Raises: + RuntimeError: If agent iteration fails + """ + rt = _get_hsa_runtime() + + gpu_agents = [] + + # Callback signature: hsa_status_t callback(hsa_agent_t agent, void* data) + CALLBACK_TYPE = ctypes.CFUNCTYPE(ctypes.c_uint32, hsa_agent_t, ctypes.c_void_p) + + def agent_callback(agent, data): + # Query device type + rt.hsa_agent_get_info.argtypes = [hsa_agent_t, ctypes.c_uint32, ctypes.c_void_p] + rt.hsa_agent_get_info.restype = ctypes.c_uint32 + device_type = ctypes.c_uint32(0) + status = rt.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, ctypes.byref(device_type)) + if status == HSA_STATUS_SUCCESS and device_type.value == HSA_DEVICE_TYPE_GPU: + gpu_agents.append(hsa_agent_t(handle=agent.handle)) + return HSA_STATUS_SUCCESS # Continue iteration + + cb = CALLBACK_TYPE(agent_callback) + rt.hsa_iterate_agents.argtypes = [CALLBACK_TYPE, ctypes.c_void_p] + rt.hsa_iterate_agents.restype = ctypes.c_uint32 + status = rt.hsa_iterate_agents(cb, None) + if status != HSA_STATUS_SUCCESS: + _hsa_try(status, "hsa_iterate_agents") + + return gpu_agents + + +def hsa_get_fine_grained_pool(agent: hsa_agent_t) -> hsa_amd_memory_pool_t: + """ + Find the fine-grained GPU local memory pool for the given agent. + + Path 3 requires using the fine-grained device pool (one where + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED is set in the global flags). + This is the pool that hsa_amd_vmem_handle_create should use to create + fine-grained physical memory handles for correct P2P atomic operations. 
+ + Args: + agent: HSA agent handle for the target GPU + + Returns: + hsa_amd_memory_pool_t handle for the fine-grained pool + + Raises: + RuntimeError: If no fine-grained allocatable pool is found + """ + rt = _get_hsa_runtime() + + found_pool = [None] + + POOL_CALLBACK = ctypes.CFUNCTYPE(ctypes.c_uint32, hsa_amd_memory_pool_t, ctypes.c_void_p) + + def pool_callback(pool, data): + rt.hsa_amd_memory_pool_get_info.argtypes = [ + hsa_amd_memory_pool_t, + ctypes.c_uint32, + ctypes.c_void_p, + ] + rt.hsa_amd_memory_pool_get_info.restype = ctypes.c_uint32 + + # Check if allocation is allowed + alloc_allowed = ctypes.c_bool(False) + status = rt.hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, ctypes.byref(alloc_allowed) + ) + if status != HSA_STATUS_SUCCESS or not alloc_allowed.value: + return HSA_STATUS_SUCCESS + + # Check global flags for fine-grained + global_flags = ctypes.c_uint32(0) + status = rt.hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, ctypes.byref(global_flags) + ) + if status != HSA_STATUS_SUCCESS: + return HSA_STATUS_SUCCESS + + if global_flags.value & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED: + found_pool[0] = hsa_amd_memory_pool_t(handle=pool.handle) + return HSA_STATUS_INFO_BREAK # Stop iteration + + return HSA_STATUS_SUCCESS + + cb = POOL_CALLBACK(pool_callback) + rt.hsa_amd_agent_iterate_memory_pools.argtypes = [hsa_agent_t, POOL_CALLBACK, ctypes.c_void_p] + rt.hsa_amd_agent_iterate_memory_pools.restype = ctypes.c_uint32 + status = rt.hsa_amd_agent_iterate_memory_pools(agent, cb, None) + if status not in (HSA_STATUS_SUCCESS, HSA_STATUS_INFO_BREAK): + _hsa_try(status, "hsa_amd_agent_iterate_memory_pools") + + if found_pool[0] is None: + raise RuntimeError("No fine-grained allocatable GPU memory pool found for agent") + + return found_pool[0] + + +def hsa_get_pool_granularity(pool: hsa_amd_memory_pool_t) -> int: + """ + Get the recommended allocation granularity for an HSA memory 
pool. + + Args: + pool: HSA memory pool handle + + Returns: + Recommended allocation granularity in bytes + """ + rt = _get_hsa_runtime() + rt.hsa_amd_memory_pool_get_info.argtypes = [ + hsa_amd_memory_pool_t, + ctypes.c_uint32, + ctypes.c_void_p, + ] + rt.hsa_amd_memory_pool_get_info.restype = ctypes.c_uint32 + + granule = ctypes.c_size_t(0) + _hsa_try( + rt.hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_REC_GRANULE, ctypes.byref(granule) + ), + "hsa_amd_memory_pool_get_info(REC_GRANULE)", + ) + if granule.value == 0: + # Fall back to minimum granule + _hsa_try( + rt.hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, ctypes.byref(granule) + ), + "hsa_amd_memory_pool_get_info(GRANULE)", + ) + return granule.value + + +def hsa_vmem_address_reserve(size: int, align: int = 0) -> int: + """ + Reserve a virtual address range (Path 3). + + Equivalent to HIP's hipMemAddressReserve, but called directly via HSA. + + Args: + size: Size in bytes of the virtual address range to reserve + align: Optional alignment hint (0 = use default) + + Returns: + Integer base address of the reserved virtual address range + + Raises: + RuntimeError: If reservation fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_address_reserve.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), # void** va + ctypes.c_size_t, # size_t size + ctypes.c_uint64, # uint64_t address (hint, 0 = any) + ctypes.c_uint64, # uint64_t flags + ] + rt.hsa_amd_vmem_address_reserve.restype = ctypes.c_uint32 + + va = ctypes.c_void_p() + _hsa_try( + rt.hsa_amd_vmem_address_reserve(ctypes.byref(va), size, 0, 0), + "hsa_amd_vmem_address_reserve", + ) + return va.value + + +def hsa_vmem_address_free(va: int, size: int): + """ + Free a previously reserved virtual address range (Path 3). 
+ + Args: + va: Base address returned by hsa_vmem_address_reserve + size: Size in bytes (must match reservation size) + + Raises: + RuntimeError: If free fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_address_free.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + rt.hsa_amd_vmem_address_free.restype = ctypes.c_uint32 + _hsa_try( + rt.hsa_amd_vmem_address_free(ctypes.c_void_p(va), size), + "hsa_amd_vmem_address_free", + ) + + +def hsa_vmem_handle_create( + pool: hsa_amd_memory_pool_t, + size: int, + memory_type: int = HSA_AMD_MEMORY_TYPE_NONE, +) -> hsa_amd_vmem_alloc_handle_t: + """ + Create a physical memory handle from an HSA pool (Path 3). + + This is the KEY difference from HIP's hipMemCreate (Path 2): + - hipMemCreate ALWAYS uses the coarse-grained GPU pool + - This function takes an EXPLICIT pool, so we can use the fine-grained pool + + By passing the fine-grained GPU pool (from hsa_get_fine_grained_pool), + the KFD driver allocates with CoarseGrain=0, enabling correct P2P atomics. 
+ + Args: + pool: HSA memory pool (use fine-grained pool from hsa_get_fine_grained_pool) + size: Size in bytes (must be granularity-aligned) + memory_type: HSA_AMD_MEMORY_TYPE_NONE (default) or HSA_AMD_MEMORY_TYPE_PINNED + + Returns: + hsa_amd_vmem_alloc_handle_t handle for the physical allocation + + Raises: + RuntimeError: If handle creation fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_handle_create.argtypes = [ + hsa_amd_memory_pool_t, + ctypes.c_size_t, + ctypes.c_uint32, # hsa_amd_memory_type_t + ctypes.c_uint64, # flags (currently unused, must be 0) + ctypes.POINTER(hsa_amd_vmem_alloc_handle_t), + ] + rt.hsa_amd_vmem_handle_create.restype = ctypes.c_uint32 + + handle = hsa_amd_vmem_alloc_handle_t() + _hsa_try( + rt.hsa_amd_vmem_handle_create(pool, size, memory_type, 0, ctypes.byref(handle)), + "hsa_amd_vmem_handle_create", + ) + return handle + + +def hsa_vmem_handle_release(handle: hsa_amd_vmem_alloc_handle_t): + """ + Release a physical memory handle created by hsa_vmem_handle_create (Path 3). + + Args: + handle: Handle returned by hsa_vmem_handle_create + + Raises: + RuntimeError: If release fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_handle_release.argtypes = [hsa_amd_vmem_alloc_handle_t] + rt.hsa_amd_vmem_handle_release.restype = ctypes.c_uint32 + _hsa_try(rt.hsa_amd_vmem_handle_release(handle), "hsa_amd_vmem_handle_release") + + +def hsa_vmem_map(va: int, size: int, handle: hsa_amd_vmem_alloc_handle_t, offset: int = 0): + """ + Map a physical memory handle to a virtual address range (Path 3). + + Maps the physical memory backing @p handle to [@p va, @p va + @p size). + After mapping, hsa_vmem_set_access must be called to make it accessible. 
+ + Args: + va: Virtual address (from hsa_vmem_address_reserve) + size: Size in bytes to map + handle: Physical memory handle (from hsa_vmem_handle_create) + offset: Offset within the handle's physical allocation (default 0) + + Raises: + RuntimeError: If mapping fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_map.argtypes = [ + ctypes.c_void_p, # void* va + ctypes.c_size_t, # size_t size + ctypes.c_size_t, # size_t in_offset + hsa_amd_vmem_alloc_handle_t, # memory handle + ctypes.c_uint64, # flags (must be 0) + ] + rt.hsa_amd_vmem_map.restype = ctypes.c_uint32 + _hsa_try( + rt.hsa_amd_vmem_map(ctypes.c_void_p(va), size, offset, handle, 0), + "hsa_amd_vmem_map", + ) + + +def hsa_vmem_unmap(va: int, size: int): + """ + Unmap a previously mapped virtual address range (Path 3). + + Args: + va: Virtual address that was mapped + size: Size in bytes that was mapped + + Raises: + RuntimeError: If unmapping fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_unmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + rt.hsa_amd_vmem_unmap.restype = ctypes.c_uint32 + _hsa_try(rt.hsa_amd_vmem_unmap(ctypes.c_void_p(va), size), "hsa_amd_vmem_unmap") + + +def hsa_vmem_set_access(va: int, size: int, agents_or_descs): + """ + Set access permissions for a mapped virtual address range (Path 3). + + Makes the mapped memory accessible to the specified agents. + This must be called after hsa_vmem_map before the memory can be accessed. + + Note: Like hipMemSetAccess, this must be called cumulatively from the + base address for the full mapped range (see ROCm issue #2667). 
+ + Args: + va: Virtual address (base of mapped range) + size: Size in bytes + agents_or_descs: Single hsa_agent_t, list of hsa_agent_t, or list of + hsa_amd_memory_access_desc_t with permissions per agent + + Raises: + RuntimeError: If setting access fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_set_access.argtypes = [ + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.POINTER(hsa_amd_memory_access_desc_t), + ctypes.c_size_t, + ] + rt.hsa_amd_vmem_set_access.restype = ctypes.c_uint32 + + # Build descriptor list + if isinstance(agents_or_descs, hsa_amd_memory_access_desc_t): + descs = [agents_or_descs] + elif isinstance(agents_or_descs, hsa_agent_t): + desc = hsa_amd_memory_access_desc_t() + desc.permissions = HSA_ACCESS_PERMISSION_RW + desc.agent_handle = agents_or_descs + descs = [desc] + elif isinstance(agents_or_descs, list): + descs = [] + for item in agents_or_descs: + if isinstance(item, hsa_agent_t): + desc = hsa_amd_memory_access_desc_t() + desc.permissions = HSA_ACCESS_PERMISSION_RW + desc.agent_handle = item + descs.append(desc) + else: + descs.append(item) + else: + raise TypeError(f"Expected hsa_agent_t, hsa_amd_memory_access_desc_t, or list; got {type(agents_or_descs)}") + + desc_array = (hsa_amd_memory_access_desc_t * len(descs))(*descs) + _hsa_try( + rt.hsa_amd_vmem_set_access(ctypes.c_void_p(va), size, desc_array, len(descs)), + "hsa_amd_vmem_set_access", + ) + + +def hsa_vmem_export_shareable_handle(handle: hsa_amd_vmem_alloc_handle_t) -> int: + """ + Export an HSA VMem handle as a DMA-BUF file descriptor (Path 3). + + The exported fd can be passed to another process (via SCM_RIGHTS) and + imported there using hsa_vmem_import_shareable_handle. This is used + for multi-rank symmetric heap setup. 
+ + Args: + handle: Physical memory handle from hsa_vmem_handle_create + + Returns: + File descriptor (int) for the DMA-BUF shareable handle + + Raises: + RuntimeError: If export fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_export_shareable_handle.argtypes = [ + ctypes.POINTER(ctypes.c_int), # int* dmabuf_fd + hsa_amd_vmem_alloc_handle_t, # memory handle + ctypes.c_uint64, # flags (must be 0) + ] + rt.hsa_amd_vmem_export_shareable_handle.restype = ctypes.c_uint32 + + fd = ctypes.c_int(-1) + _hsa_try( + rt.hsa_amd_vmem_export_shareable_handle(ctypes.byref(fd), handle, 0), + "hsa_amd_vmem_export_shareable_handle", + ) + return fd.value + + +def hsa_vmem_import_shareable_handle(dmabuf_fd: int) -> hsa_amd_vmem_alloc_handle_t: + """ + Import an HSA VMem handle from a DMA-BUF file descriptor (Path 3). + + The returned handle can be used with hsa_vmem_map to map the peer's + physical memory into the local virtual address space. The imported handle + must be released with hsa_vmem_handle_release when done. + + Note: The DMA-BUF fd is consumed (closed) by this call. 
+ + Args: + dmabuf_fd: File descriptor from hsa_vmem_export_shareable_handle + (received via SCM_RIGHTS from the peer process) + + Returns: + hsa_amd_vmem_alloc_handle_t handle for the imported memory + + Raises: + RuntimeError: If import fails + """ + rt = _get_hsa_runtime() + rt.hsa_amd_vmem_import_shareable_handle.argtypes = [ + ctypes.c_int, # int dmabuf_fd + ctypes.POINTER(hsa_amd_vmem_alloc_handle_t), # handle output + ] + rt.hsa_amd_vmem_import_shareable_handle.restype = ctypes.c_uint32 + + handle = hsa_amd_vmem_alloc_handle_t() + _hsa_try( + rt.hsa_amd_vmem_import_shareable_handle(dmabuf_fd, ctypes.byref(handle)), + "hsa_amd_vmem_import_shareable_handle", + ) + return handle diff --git a/iris/symmetric_heap.py b/iris/symmetric_heap.py index 0572ac9d..3bd04e1c 100644 --- a/iris/symmetric_heap.py +++ b/iris/symmetric_heap.py @@ -12,7 +12,7 @@ import torch import os -from iris.allocators import TorchAllocator, VMemAllocator +from iris.allocators import TorchAllocator, VMemAllocator, HsaVMemAllocator from iris.fd_passing import setup_fd_infrastructure from iris._distributed_helpers import distributed_allgather @@ -24,7 +24,7 @@ class SymmetricHeap: Manages distributed memory with symmetric addressing across ranks, handling all allocator coordination and memory sharing internally. - Supports multiple allocator backends: 'torch' (default) and 'vmem'. + Supports multiple allocator backends: 'torch' (default), 'vmem', and 'hsa_vmem'. 
""" def __init__( @@ -43,7 +43,7 @@ def __init__( device_id: GPU device ID cur_rank: Current process rank num_ranks: Total number of ranks - allocator_type: Type of allocator ("torch" or "vmem"); default "torch" + allocator_type: Type of allocator ("torch", "vmem", or "hsa_vmem"); default "torch" Raises: ValueError: If allocator_type is not supported @@ -58,8 +58,10 @@ def __init__( self.allocator = TorchAllocator(heap_size, device_id, cur_rank, num_ranks) elif allocator_type == "vmem": self.allocator = VMemAllocator(heap_size, device_id, cur_rank, num_ranks) + elif allocator_type == "hsa_vmem": + self.allocator = HsaVMemAllocator(heap_size, device_id, cur_rank, num_ranks) else: - raise ValueError(f"Unknown allocator type: {allocator_type}. Supported: 'torch', 'vmem'") + raise ValueError(f"Unknown allocator type: {allocator_type}. Supported: 'torch', 'vmem', 'hsa_vmem'") self.fd_conns = setup_fd_infrastructure(cur_rank, num_ranks) device = self.allocator.get_device() @@ -256,9 +258,8 @@ def as_symmetric(self, external_tensor: torch.Tensor) -> torch.Tensor: """ Place an external PyTorch tensor on the symmetric heap. - With the torch allocator: allocates on the heap and copies the data; - the returned tensor is independent of the input. With the vmem - allocator: imports the memory so both tensors share the same storage. + With the torch, vmem, and hsa_vmem allocators: allocates on the heap and + copies the data; the returned tensor is independent of the input. Args: external_tensor: External PyTorch tensor (must be CUDA, contiguous) diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py index 8cf2f7f4..eaeaca11 100644 --- a/tests/unittests/test_atomic_add_triton.py +++ b/tests/unittests/test_atomic_add_triton.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+import gc + import torch import triton import triton.language as tl @@ -108,6 +110,59 @@ def test_atomic_add_api(dtype, sem, scope, BLOCK_SIZE): # Explicitly delete the shmem instance to trigger cleanup del shmem # Force garbage collection to ensure IPC handles are cleaned up - import gc + gc.collect() + +@pytest.mark.parametrize( + "sem", + [ + "acquire", + "release", + "acq_rel", + ], +) +@pytest.mark.parametrize( + "scope", + [ + "cta", + "gpu", + "sys", + ], +) +def test_atomic_add_vmem(sem, scope): + """ + Test P2P atomic operations with VMemAllocator. + + VMemAllocator previously used hipMemCreate (coarse-grained memory) which + caused intermittent failures for cross-GPU atomics with scope=cta/gpu. + This test verifies that the fix (using hipExtMallocWithFlags fine-grained + memory) eliminates these failures. + """ + shmem = iris.iris(1 << 20, allocator_type="vmem") + num_ranks = shmem.get_num_ranks() + heap_bases = shmem.get_heap_bases() + cur_rank = shmem.get_rank() + + results = shmem.zeros(1, dtype=torch.float32) + + shmem.barrier() + + grid = lambda meta: (1,) + atomic_add_kernel[grid](results, sem, scope, cur_rank, num_ranks, 1, heap_bases) + shmem.barrier() + + # Every rank atomically adds 1 to every other rank's results[0]. + # Expected: num_ranks additions per element. + expected = torch.ones(1, dtype=torch.float32, device="cuda") * num_ranks + + try: + torch.testing.assert_close(results, expected, rtol=0, atol=0) + except AssertionError as e: + print(e) + print("Expected:", expected) + print("Actual:", results) + raise + finally: + shmem.barrier() + del shmem gc.collect() diff --git a/tests/unittests/test_hsa_vmem_reproducible.py b/tests/unittests/test_hsa_vmem_reproducible.py new file mode 100644 index 00000000..d73e36fb --- /dev/null +++ b/tests/unittests/test_hsa_vmem_reproducible.py @@ -0,0 +1,704 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. 
+ +""" +Minimal P2P atomic reproducible — comparing three GPU memory allocation paths. + +## Background: Three paths for P2P-atomic-safe memory on AMD GPUs + +For correct cross-GPU (P2P) atomic operations in Triton (scope=cta/gpu/sys), +the physical memory must be **fine-grained**. Three paths exist; only Paths 1 +and 3 produce fine-grained memory. + +``` +Path 1 — hipExtMallocWithFlags(hipDeviceMallocFinegrained) + Stack: HIP → hsa_amd_memory_pool_allocate (fine-grained GPU pool) + → KFD: hsaKmtAllocMemory(CoarseGrain=0) + P2P: hipExternalMemoryHandleTypeOpaqueFd (dma-buf) → hipImportExternalMemory + Result: fine-grained, KNOWN GOOD ✓ + +Path 2 — hipMemCreate + hipMemAddressReserve + hipMemMap + Stack: HIP → CLR SvmBuffer::malloc(ROCCLR_MEM_PHYMEM) + → hsa_amd_vmem_handle_create (coarse-grained GPU pool, hardcoded) + → KFD: hsaKmtAllocMemory(CoarseGrain=1, NoAddress=1) + P2P: hipMemImportFromShareableHandle + hipMemMap + Result: ALWAYS coarse-grained — P2P atomics (scope=cta/gpu) fail ✗ + +Path 3 — hsa_amd_vmem_handle_create on fine-grained pool (direct HSA) + Stack: hsa_amd_vmem_handle_create(fine_grained_pool, ...) ← caller chooses pool + → KFD: hsaKmtAllocMemory(CoarseGrain=0, NoAddress=1) + P2P: hsa_amd_vmem_export_shareable_handle → hsa_amd_vmem_import_shareable_handle + → hsa_amd_vmem_map + hsa_amd_vmem_set_access + Result: fine-grained physical memory — P2P atomics always pass ✓ +``` + +The key difference between Paths 2 and 3: `hipMemCreate` in HIP/CLR hardcodes the +coarse-grained GPU pool. `hsa_amd_vmem_handle_create` takes an explicit pool +argument, so we can pass the fine-grained pool instead. + +## What this file tests (and why) + +The nine `test_hsa_vmem_p2p_atomics[*]` tests confirm the fix at the repro level: +each allocates memory via `hsa_amd_vmem_handle_create` on the **fine-grained** pool, +exports the DMA-BUF handle to the peer, imports it, maps it, and runs 200 P2P +atomic_add iterations. All scope×sem combinations produce zero failures. 
+ +## Why there is no automated test for Path 2 (HIP VMem) failure + +HIP VMem (`hipMemCreate`) always allocates from the **coarse-grained** GPU pool +because HIP/CLR's `SvmBuffer::malloc(ROCCLR_MEM_PHYMEM)` hardcodes that pool. +Cross-GPU atomics on coarse-grained memory are not just "wrong" — on AMD GPUs they +trigger GPU page faults that send SIGSEGV to the process. This makes any automated +test of HIP VMem P2P atomics inherently fatal to the test process, so we cannot +include such a test in the regular test suite. + +The `_HIPVMemP2P` fixture class and `_run_p2p_atomics_hip` helper below document the +setup in detail and can be used for manual/ad-hoc investigation. The module-level +docstring above explains the complete stack trace for all three paths. + +No iris machinery (no SymmetricHeap, no bump allocator, no refresh_peer_access). +Just raw API calls + torchrun for process management. +""" + +import os +import struct + +import pytest +import torch +import torch.distributed as dist +import triton +import triton.language as tl + +from iris.hip import ( + hsa_init, + hsa_get_gpu_agents, + hsa_get_fine_grained_pool, + hsa_get_pool_granularity, + hsa_vmem_address_reserve, + hsa_vmem_address_free, + hsa_vmem_handle_create, + hsa_vmem_handle_release, + hsa_vmem_map, + hsa_vmem_unmap, + hsa_vmem_set_access, + hsa_vmem_export_shareable_handle, + hsa_vmem_import_shareable_handle, + # Path 2: HIP VMem (coarse-grained — used to show the bug) + get_allocation_granularity, + mem_create, + mem_export_to_shareable_handle, + mem_import_from_shareable_handle, + mem_address_reserve, + mem_map, + mem_unmap, + mem_address_free, + mem_release, + mem_set_access, + hipMemAccessDesc, + hipMemLocationTypeDevice, + hipMemAccessFlagsProtReadWrite, + hipMemAllocationTypePinned, + hipMemAllocationTypeUncached, +) +from iris.fd_passing import setup_fd_infrastructure, send_fd, recv_fd + +# --------------------------------------------------------------------------- +# Session-level HSA 
# initialisation
#
# The HSA runtime reference-counts hsa_init()/hsa_shut_down().  Pairing the
# two inside every test is correct in isolation, but the runtime may not have
# torn down its internal threads before the next test calls hsa_init() again,
# which then fails with OUT_OF_RESOURCES.  Initialising exactly once per
# process (and letting process exit reclaim the runtime) avoids that.
# ---------------------------------------------------------------------------
hsa_init()

# The float32 atomics in this module only ever hold whole numbers
# (0.0, 1.0, 2.0, ...), so any deviation beyond 0.01 is a real atomicity
# failure rather than rounding noise.
_ATOMIC_EXACT_TOL = 0.01
# The P2P counter must land exactly on world_size; 0.5 cleanly separates
# "exact integer" from "lost at least one increment".
_ATOMIC_COUNT_TOL = 0.5

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _rank():
    """Distributed rank of this process (0 when torch.distributed is down)."""
    if dist.is_initialized():
        return dist.get_rank()
    return 0


def _world_size():
    """Number of participating ranks (1 when torch.distributed is down)."""
    if dist.is_initialized():
        return dist.get_world_size()
    return 1


def _local_rank():
    """Local rank from the LOCAL_RANK env var (set by torchrun); 0 otherwise."""
    return int(os.environ.get("LOCAL_RANK", 0))


def _tensor_at(va: int, n_floats: int, device: torch.device) -> torch.Tensor:
    """Create a float32 tensor backed by GPU memory at *va* (no copy)."""

    class _RawGpuBuffer:
        """Wraps a raw device pointer behind __cuda_array_interface__."""

        def __init__(self, ptr, n):
            self._ptr = ptr
            self._n = n

        @property
        def __cuda_array_interface__(self):
            # Advertise the region as raw bytes; the caller reinterprets it
            # as float32 below (4 bytes per element).
            return {
                "shape": (self._n * 4,),
                "typestr": "|u1",
                "data": (self._ptr, False),
                "version": 3,
            }

    raw = torch.as_tensor(_RawGpuBuffer(va, n_floats), device=device)
    return raw.view(torch.float32)


# ---------------------------------------------------------------------------
# Triton kernel
# ---------------------------------------------------------------------------


@triton.jit
def _atomic_add_one(ptr, scope: tl.constexpr, sem: tl.constexpr):
    """Atomically add 1.0 to ptr[0]."""
    tl.atomic_add(ptr, 1.0, scope=scope, sem=sem)
class _HsaVMemP2P:
    """
    Minimal per-test HSA VMem (Path 3) P2P setup.

    Per rank:
      - one fine-grained physical allocation per rank
        (hsa_vmem_handle_create on the fine-grained pool)
      - DMA-BUF export/import with peers over Unix sockets
      - peer memory mapped into this rank's virtual address space
      - all resources released on close()
    """

    def __init__(self, alloc_size: int):
        self.rank = _rank()
        self.world_size = _world_size()
        self.local_rank = _local_rank()
        self.device = torch.device(f"cuda:{self.local_rank}")
        torch.cuda.set_device(self.local_rank)

        agents = hsa_get_gpu_agents()
        assert len(agents) > self.local_rank, f"Expected >{self.local_rank} GPU agents"
        self.agents = agents
        self.agent = agents[self.local_rank]

        pool = hsa_get_fine_grained_pool(self.agent)
        gran = hsa_get_pool_granularity(pool)
        # Align alloc_size to pool granularity (required by hsa_vmem_handle_create)
        # NOTE(review): this rounding assumes gran is a power of two — TODO confirm
        self.size = (alloc_size + gran - 1) & ~(gran - 1)

        # Allocate fine-grained physical memory (Path 3 key step)
        self.handle = hsa_vmem_handle_create(pool, self.size)

        # Reserve virtual address space and map
        self.va = hsa_vmem_address_reserve(self.size)
        hsa_vmem_map(self.va, self.size, self.handle)
        hsa_vmem_set_access(self.va, self.size, agents)

        # Exchange DMA-BUF handles with all peers.
        # Metadata shipped alongside each fd: (base VA, size) as two uint64.
        fd = hsa_vmem_export_shareable_handle(self.handle)
        my_meta = struct.pack("QQ", self.va, self.size)

        fc = setup_fd_infrastructure(self.rank, self.world_size)
        self.peer_vas: dict = {}  # peer rank -> local VA of the imported mapping
        self._peer_handles: list = []  # (va, size, handle) triples for close()
        self._sockets: list = []  # Stored for cleanup in close() to prevent FD leaks

        if fc:
            for peer, sock in fc.items():
                if peer == self.rank:
                    continue
                self._sockets.append(sock)
                peer_va = hsa_vmem_address_reserve(self.size)
                # Higher rank sends first, lower rank receives first, so the
                # blocking send/recv pair on each socket cannot deadlock.
                if self.rank > peer:
                    send_fd(sock, fd, payload=my_meta)
                    peer_fd, pmeta = recv_fd(sock, payload_size=16)
                else:
                    peer_fd, pmeta = recv_fd(sock, payload_size=16)
                    send_fd(sock, fd, payload=my_meta)

                peer_base, peer_size = struct.unpack("QQ", pmeta)
                peer_handle = hsa_vmem_import_shareable_handle(peer_fd)
                # The imported handle now owns the reference; the fd can go.
                os.close(peer_fd)

                peer_alloc = (peer_size + gran - 1) & ~(gran - 1)
                hsa_vmem_map(peer_va, peer_alloc, peer_handle)
                hsa_vmem_set_access(peer_va, peer_alloc, agents)

                self._peer_handles.append((peer_va, peer_alloc, peer_handle))
                self.peer_vas[peer] = peer_va

        # Our exported fd has been delivered to every peer; release it.
        os.close(fd)

    def local_tensor(self, offset_bytes: int = 0) -> torch.Tensor:
        """Float32 tensor at va+offset_bytes (local fine-grained memory)."""
        return _tensor_at(self.va + offset_bytes, 1, self.device)

    def peer_tensor(self, peer: int, offset_bytes: int = 0) -> torch.Tensor:
        """Float32 tensor at peer_va+offset_bytes (imported peer memory)."""
        return _tensor_at(self.peer_vas[peer] + offset_bytes, 1, self.device)

    def close(self):
        """Release sockets, imported peer mappings, then local VMem.

        Every step is best-effort (exceptions swallowed) so that one failed
        release does not leak the remaining resources.
        """
        # Close peer sockets first so both ranks can proceed past any pending recv
        for sock in self._sockets:
            try:
                sock.close()
            except Exception:
                pass
        self._sockets.clear()
        for pva, psize, ph in self._peer_handles:
            # Unmap before freeing the VA range and releasing the handle.
            try:
                hsa_vmem_unmap(pva, psize)
            except Exception:
                pass
            try:
                hsa_vmem_address_free(pva, psize)
            except Exception:
                pass
            try:
                hsa_vmem_handle_release(ph)
            except Exception:
                pass
        self._peer_handles.clear()
        try:
            hsa_vmem_unmap(self.va, self.size)
        except Exception:
            pass
        try:
            hsa_vmem_handle_release(self.handle)
        except Exception:
            pass
        try:
            hsa_vmem_address_free(self.va, self.size)
        except Exception:
            pass

    def __del__(self):
        # Safety net: release resources if the test forgot to call close().
        self.close()
class _HIPVMemP2P:
    """
    Minimal Path 2 (HIP VMem) P2P setup using hipMemCreate.

    hipMemCreate internally calls hsa_amd_vmem_handle_create but hardcodes the
    coarse-grained GPU pool in HIP/CLR (SvmBuffer::malloc with ROCCLR_MEM_PHYMEM).
    This means the physical memory is ALWAYS CoarseGrain=1, and P2P atomics at
    scope=cta or scope=gpu will silently return wrong results.

    The structure mirrors _HsaVMemP2P so the P2P atomic loop is identical —
    only the allocation and exchange APIs differ, making the comparison clean.

    Args:
        alloc_size: Allocation size in bytes (will be rounded up to granularity)
        alloc_type: hipMemAllocationType constant. Default is hipMemAllocationTypePinned
            (the only value currently supported by hipMemCreate; coarse-grained).
            Pass hipMemAllocationTypeUncached (0x40000000) to test whether the ROCm
            driver honours an uncached/fine-grained request — if hipMemCreate rejects
            this value it raises RuntimeError and the caller should catch it.
    """

    def __init__(self, alloc_size: int, alloc_type: int = hipMemAllocationTypePinned):
        self.rank = _rank()
        self.world_size = _world_size()
        self.local_rank = _local_rank()
        self.device = torch.device(f"cuda:{self.local_rank}")
        torch.cuda.set_device(self.local_rank)

        gran = get_allocation_granularity(self.local_rank)
        # Align alloc_size to HIP granularity
        # NOTE(review): this rounding assumes gran is a power of two — TODO confirm
        self.size = (alloc_size + gran - 1) & ~(gran - 1)

        # Path 2: hipMemCreate → CLR → hsa_amd_vmem_handle_create (coarse pool)
        self.handle = mem_create(self.size, self.local_rank, alloc_type=alloc_type)

        # Reserve virtual address space and map
        self.va = mem_address_reserve(self.size, gran)
        mem_map(self.va, self.size, 0, self.handle)

        # Set access for the local device
        access_desc = hipMemAccessDesc()
        access_desc.location.type = hipMemLocationTypeDevice
        access_desc.location.id = self.local_rank
        access_desc.flags = hipMemAccessFlagsProtReadWrite
        mem_set_access(self.va, self.size, access_desc)

        # Exchange handles with all peers via DMA-BUF.
        # Metadata shipped alongside each fd: (base VA, size) as two uint64.
        fd = mem_export_to_shareable_handle(self.handle)
        my_meta = struct.pack("QQ", self.va, self.size)

        fc = setup_fd_infrastructure(self.rank, self.world_size)
        self.peer_vas: dict = {}  # peer rank -> local VA of the imported mapping
        self._peer_resources: list = []  # (va, size, handle) triples for close()
        self._sockets: list = []  # Stored for cleanup in close() to prevent FD leaks

        if fc:
            for peer, sock in fc.items():
                if peer == self.rank:
                    continue
                self._sockets.append(sock)
                peer_va = mem_address_reserve(self.size, gran)
                # Higher rank sends first, lower rank receives first, so the
                # blocking send/recv pair on each socket cannot deadlock.
                if self.rank > peer:
                    send_fd(sock, fd, payload=my_meta)
                    peer_fd, pmeta = recv_fd(sock, payload_size=16)
                else:
                    peer_fd, pmeta = recv_fd(sock, payload_size=16)
                    send_fd(sock, fd, payload=my_meta)

                peer_base, peer_size = struct.unpack("QQ", pmeta)
                peer_handle = mem_import_from_shareable_handle(peer_fd)
                # The imported handle now owns the reference; the fd can go.
                os.close(peer_fd)

                peer_alloc = (peer_size + gran - 1) & ~(gran - 1)
                mem_map(peer_va, peer_alloc, 0, peer_handle)

                # Set access for the local device on the imported range
                peer_access = hipMemAccessDesc()
                peer_access.location.type = hipMemLocationTypeDevice
                peer_access.location.id = self.local_rank
                peer_access.flags = hipMemAccessFlagsProtReadWrite
                mem_set_access(peer_va, peer_alloc, peer_access)

                self._peer_resources.append((peer_va, peer_alloc, peer_handle))
                self.peer_vas[peer] = peer_va

        # Our exported fd has been delivered to every peer; release it.
        os.close(fd)

    def local_tensor(self, offset_bytes: int = 0) -> torch.Tensor:
        """Float32 tensor at va+offset_bytes (local coarse-grained HIP VMem)."""
        return _tensor_at(self.va + offset_bytes, 1, self.device)

    def peer_tensor(self, peer: int, offset_bytes: int = 0) -> torch.Tensor:
        """Float32 tensor at peer_va+offset_bytes (imported peer HIP VMem)."""
        return _tensor_at(self.peer_vas[peer] + offset_bytes, 1, self.device)

    def close(self):
        """Release sockets, imported peer mappings, then local VMem.

        Every step is best-effort (exceptions swallowed) so that one failed
        release does not leak the remaining resources.
        """
        # Close peer sockets first so both ranks can proceed past any pending recv
        for sock in self._sockets:
            try:
                sock.close()
            except Exception:
                pass
        self._sockets.clear()
        for pva, psize, ph in self._peer_resources:
            # Unmap before freeing the VA range and releasing the handle.
            try:
                mem_unmap(pva, psize)
            except Exception:
                pass
            try:
                mem_address_free(pva, psize)
            except Exception:
                pass
            try:
                mem_release(ph)
            except Exception:
                pass
        self._peer_resources.clear()
        try:
            mem_unmap(self.va, self.size)
        except Exception:
            pass
        try:
            mem_release(self.handle)
        except Exception:
            pass
        try:
            mem_address_free(self.va, self.size)
        except Exception:
            pass

    def __del__(self):
        # Safety net: release resources if the test forgot to call close().
        self.close()
def test_hsa_vmem_pool_discovery():
    """Verify hsa_get_fine_grained_pool finds a non-zero, allocatable pool."""
    agents = hsa_get_gpu_agents()
    assert len(agents) >= 1, "No GPU agents found"
    lr = _local_rank()
    assert len(agents) > lr
    pool = hsa_get_fine_grained_pool(agents[lr])
    assert pool.handle != 0
    gran = hsa_get_pool_granularity(pool)
    assert gran > 0


# ---------------------------------------------------------------------------
# Test: single-GPU alloc + map + local atomic (no P2P)
# ---------------------------------------------------------------------------


def test_hsa_vmem_single_gpu_alloc_and_atomic():
    """
    Allocate fine-grained VMem via Path 3, map it, run a local atomic.

    Validates the basic reserve→create→map→set_access→atomic pipeline
    without any cross-process communication.
    """
    local_rank = _local_rank()
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(local_rank)

    agents = hsa_get_gpu_agents()
    pool = hsa_get_fine_grained_pool(agents[local_rank])
    gran = hsa_get_pool_granularity(pool)

    va = hsa_vmem_address_reserve(gran)
    handle = hsa_vmem_handle_create(pool, gran)
    mapped = False
    try:
        hsa_vmem_map(va, gran, handle)
        mapped = True
        hsa_vmem_set_access(va, gran, agents)

        t = _tensor_at(va, 4, device)
        t.fill_(0.0)
        _atomic_add_one[(1,)](t, "sys", "acq_rel")
        torch.cuda.synchronize()
        assert abs(t[0].item() - 1.0) < _ATOMIC_EXACT_TOL, f"Local atomic failed: got {t[0].item()}"
    finally:
        # Tear down in reverse order of setup. The unmap used to sit inside
        # the try body before the assert, so a failing assert (or any raise
        # after the map) released the handle and freed the VA range while the
        # mapping was still active.
        if mapped:
            try:
                hsa_vmem_unmap(va, gran)
            except Exception:
                pass
        hsa_vmem_handle_release(handle)
        hsa_vmem_address_free(va, gran)
# ---------------------------------------------------------------------------
# P2P atomic drivers
#
# _p2p_atomic_loop is shared between Path 2 and Path 3 so the measured loop
# is byte-for-byte identical; only the allocation fixture differs.
#
# Path 2 helper (_run_p2p_atomics_hip) is for manual/ad-hoc investigation
# only: coarse-grained P2P atomics on AMD GPUs trigger GPU page faults that
# kill the process, so it is NOT called by any automated test.
# ---------------------------------------------------------------------------


def _p2p_atomic_loop(p2p, scope: str, sem: str, n_iters: int) -> int:
    """
    Run *n_iters* P2P atomic_add rounds on an already-constructed fixture.

    Per iteration:
      - Zero the local counter
      - Barrier (all ranks ready)
      - LOCAL atomic: this rank adds 1 to its own counter
      - REMOTE atomic: this rank adds 1 to every peer's counter
      - Barrier (all atomics done)
      - Read local counter; expect world_size

    Args:
        p2p: a _HsaVMemP2P or _HIPVMemP2P instance (identical interface)
        scope: Triton atomic scope ("cta", "gpu", "sys")
        sem: Triton atomic semantics ("acquire", "release", "acq_rel")
        n_iters: number of P2P atomic rounds

    Returns:
        Number of iterations whose final counter differed from world_size.
    """
    world_size = _world_size()
    failures = 0
    local_t = p2p.local_tensor()

    for _ in range(n_iters):
        local_t.fill_(0.0)
        dist.barrier()

        # All ranks add 1 to their own counter
        _atomic_add_one[(1,)](local_t, scope, sem)

        # All ranks add 1 to every other rank's counter
        for peer_va in p2p.peer_vas.values():
            peer_t = _tensor_at(peer_va, 1, p2p.device)
            _atomic_add_one[(1,)](peer_t, scope, sem)

        torch.cuda.synchronize()
        dist.barrier()

        got = local_t[0].item()
        if abs(got - float(world_size)) > _ATOMIC_COUNT_TOL:
            failures += 1

    return failures


def _run_p2p_atomics_hip(scope: str, sem: str, n_iters: int = 200, alloc_type: int = hipMemAllocationTypePinned) -> int:
    """
    Same P2P atomic loop as _run_p2p_atomics but using HIP VMem (Path 2).

    hipMemCreate allocates via CLR's SvmBuffer::malloc(ROCCLR_MEM_PHYMEM),
    which calls hsa_amd_vmem_handle_create with the coarse-grained GPU pool.
    The physical memory is therefore CoarseGrain=1 in the KFD driver, and
    P2P atomic operations below system scope are not guaranteed to be coherent.

    Args:
        scope: Triton atomic scope ("cta", "gpu", "sys")
        sem: Triton atomic semantics ("acquire", "release", "acq_rel")
        n_iters: Number of P2P atomic rounds
        alloc_type: hipMemAllocationType constant. Default is hipMemAllocationTypePinned
            (coarse-grained). Pass hipMemAllocationTypeUncached to test whether the
            ROCm driver produces fine-grained memory for that type.

    Returns:
        Number of failed iterations (counter != world_size).

    WARNING: On AMD GPUs, coarse-grained P2P atomics do not merely return wrong
    values — they can trigger GPU page faults (SIGSEGV) that kill the process.
    Do NOT call this function from automated tests. Use it only for manual
    validation or one-off debugging sessions.
    """
    p2p = _HIPVMemP2P(alloc_size=4 << 20, alloc_type=alloc_type)
    try:
        return _p2p_atomic_loop(p2p, scope, sem, n_iters)
    finally:
        p2p.close()


def _run_p2p_atomics(scope: str, sem: str, n_iters: int = 200) -> int:
    """
    Run *n_iters* P2P atomic_add rounds using HSA VMem (Path 3) and return failures.

    Setup (once, outside the loop):
      - Each rank allocates fine-grained VMem (Path 3)
      - DMA-BUF handles are exchanged between all pairs of ranks
      - Each rank imports the peer handle and maps it at a local VA

    Because the physical memory was created from the fine-grained pool,
    and the imported mapping should preserve that property, both
    scope=cta and scope=gpu atomics should be coherent across GPUs.
    A non-zero failure count indicates coarse-grained behaviour.

    Args:
        scope: Triton atomic scope ("cta", "gpu", "sys")
        sem: Triton atomic semantics ("acquire", "release", "acq_rel")
        n_iters: Number of P2P atomic rounds

    Returns:
        Number of failed iterations (counter != world_size).
    """
    p2p = _HsaVMemP2P(alloc_size=4 << 20)
    try:
        return _p2p_atomic_loop(p2p, scope, sem, n_iters)
    finally:
        p2p.close()


@pytest.mark.parametrize("scope", ["cta", "gpu", "sys"])
@pytest.mark.parametrize("sem", ["acquire", "release", "acq_rel"])
def test_hsa_vmem_p2p_atomics(scope, sem):
    """
    Path 3 (HSA VMem, fine-grained pool) — P2P atomic correctness sweep.

    Runs 200 iterations for each (scope, sem) combination and asserts zero
    failures. Any failure means the imported mapping is coarse-grained and
    does not support the requested atomic scope.

    For the Path 2 counterpart, see the manual-only helper
    _run_p2p_atomics_hip above: HIP VMem fails for scope=cta/gpu because
    hipMemCreate hardcodes the coarse-grained pool, so no automated test
    covers it.
    """
    failures = _run_p2p_atomics(scope, sem, n_iters=200)
    assert failures == 0, (
        f"HSA VMem (Path 3) P2P atomic failures: {failures}/200 "
        f"(scope={scope}, sem={sem}). "
        f"Non-zero indicates coarse-grained coherency on the imported mapping."
    )
# ---------------------------------------------------------------------------
# Test: HIP VMem with hipMemAllocationTypeUncached
#
# The question: does prop.type = hipMemAllocationTypeUncached (0x40000000) make
# hipMemCreate allocate fine-grained (uncached) physical memory instead of the
# default coarse-grained pinned memory? If so, P2P atomics should pass.
#
# The HIP header says:
#     hipMemAllocationTypeUncached = 0x40000000  // AMD ROCm extension
#
# In practice, hipMemCreate may reject this value with hipErrorNotSupported
# because the HIP/CLR implementation currently only handles
# hipMemAllocationTypePinned. This test is designed to:
#   1. Skip automatically if hipMemCreate returns an error (unsupported)
#   2. Pass if the allocation succeeds AND the local atomic produces the
#      expected value
#   3. Fail if allocation succeeds but the local atomic result is wrong
# ---------------------------------------------------------------------------


def test_hip_vmem_uncached_alloc_type():
    """
    Probe: does hipMemAllocationTypeUncached produce allocatable GPU memory?

    Tries hipMemCreate with prop.type = hipMemAllocationTypeUncached (0x40000000),
    an AMD ROCm extension enum value. The HIP header note says hipMemCreate
    "Currently must be specified as hipMemAllocationTypePinned", so this type
    may be rejected by the driver.

    This test verifies local allocation and local atomic correctness ONLY.
    P2P cross-rank access with uncached type is NOT tested here — on this hardware
    `hipMemAllocationTypeUncached` is accepted by hipMemCreate but still produces
    coarse-grained physical memory that causes GPU page faults (SIGSEGV) on
    cross-rank access, even at scope=sys. The HSA VMem (Path 3) approach
    remains the only confirmed way to get fine-grained P2P-atomic-safe memory.

    Outcomes:
        SKIPPED — hipMemCreate rejected hipMemAllocationTypeUncached
        PASSED  — allocation + local sys-scope atomic both work (but memory is
                  still coarse-grained for P2P; see module docstring for details)
        FAILED  — allocation succeeded but local atomic produced wrong result
    """
    local_rank = _local_rank()
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(local_rank)

    gran = get_allocation_granularity(local_rank)
    size = gran  # single granularity — smallest valid allocation

    # Try allocation; skip if the driver rejects the uncached type
    try:
        handle = mem_create(size, local_rank, alloc_type=hipMemAllocationTypeUncached)
    except RuntimeError as e:
        pytest.skip(f"hipMemCreate rejected hipMemAllocationTypeUncached (0x{hipMemAllocationTypeUncached:08x}): {e}")

    # Map and test local access. Cleanup is guarded: previously the VA was
    # reserved outside the try (a failed reserve leaked the handle) and the
    # unmap sat before the assert (a failed map or assert freed a still-mapped
    # VA range without unmapping it).
    va = None
    mapped = False
    try:
        va = mem_address_reserve(size, gran)
        mem_map(va, size, 0, handle)
        mapped = True

        access_desc = hipMemAccessDesc()
        access_desc.location.type = hipMemLocationTypeDevice
        access_desc.location.id = local_rank
        access_desc.flags = hipMemAccessFlagsProtReadWrite
        mem_set_access(va, size, access_desc)

        t = _tensor_at(va, 1, device)
        t.fill_(0.0)
        # Local atomic only — safe regardless of grain (no cross-rank access)
        _atomic_add_one[(1,)](t, "sys", "acq_rel")
        torch.cuda.synchronize()

        assert abs(t[0].item() - 1.0) < _ATOMIC_EXACT_TOL, (
            f"Local atomic on hipMemAllocationTypeUncached memory produced wrong result: "
            f"got {t[0].item()}, expected 1.0"
        )
    finally:
        # Tear down in reverse order of setup, best-effort.
        if mapped:
            try:
                mem_unmap(va, size)
            except Exception:
                pass
        mem_release(handle)
        if va is not None:
            mem_address_free(va, size)
a/tests/unittests/test_pytorch_import_mechanism.py b/tests/unittests/test_pytorch_import_mechanism.py index 332cd349..337f1613 100644 --- a/tests/unittests/test_pytorch_import_mechanism.py +++ b/tests/unittests/test_pytorch_import_mechanism.py @@ -130,7 +130,11 @@ def __cuda_array_interface__(self): def test_as_symmetric_basic(): - """Basic as_symmetric(): native VMem tensor and imported external tensor in same VA space.""" + """Basic as_symmetric(): native VMem tensor and imported external tensor in same VA space. + + VMemAllocator uses copy semantics: modifications to imported tensor do NOT + affect the original external tensor (same as TorchAllocator). + """ ctx = iris.iris(64 << 20, allocator_type="vmem") native_tensor = ctx.zeros(1000, dtype=torch.float32) @@ -154,10 +158,12 @@ def test_as_symmetric_basic(): assert torch.all(native_tensor == 42.0) assert torch.all(imported_tensor == 99.0) + # VMemAllocator uses copy semantics: modifying imported tensor does NOT affect external imported_tensor.fill_(123.0) torch.cuda.synchronize() assert torch.all(imported_tensor == 123.0) - assert torch.all(external_tensor == 123.0) + # Note: external_tensor retains its original value (copy semantics, not shared memory) + assert torch.all(external_tensor == 99.0) def test_as_symmetric_with_offset(): diff --git a/tests/unittests/test_vmem_allocator.py b/tests/unittests/test_vmem_allocator.py index b09a205d..4e57eed8 100644 --- a/tests/unittests/test_vmem_allocator.py +++ b/tests/unittests/test_vmem_allocator.py @@ -183,43 +183,42 @@ def test_vmem_import_external_tensor(): """ Test importing external PyTorch tensors via as_symmetric(). - This validates the critical lifetime contract: + This validates the import contract for VMemAllocator (copy semantics): 1. External tensors can be imported into the symmetric heap - 2. The imported tensor shares memory with the original (while ctx is alive) - 3. When ctx is destroyed, imported_tensor becomes invalid - 4. 
BUT the original tensor REMAINS VALID and fully usable + 2. The imported tensor has the same data as the original + 3. The imported tensor is on the symmetric heap (owns_tensor = True) + 4. The original tensor REMAINS VALID after ctx is destroyed - This is THE KEY CONTRACT: imported tensors die with ctx, originals survive. + Note: VMemAllocator uses copy semantics (like TorchAllocator). Modifications + to the imported tensor are NOT visible in the original tensor, and vice versa. """ import gc - # Create Iris context with large enough heap for PyTorch's 2MB allocations + # Create Iris context with vmem allocator ctx = iris.iris(4 << 20, allocator_type="vmem") # Create original PyTorch tensor on the correct device for this rank original_tensor = torch.randn(100, dtype=torch.float32, device=ctx.device) original_data = original_tensor.clone() - # Import the external tensor + # Import the external tensor (copy semantics) imported_tensor = ctx.as_symmetric(original_tensor) - # Verify imported tensor has same data - assert torch.allclose(imported_tensor, original_data), "Imported tensor should match original" + # Verify imported tensor has same data as original + assert torch.allclose(imported_tensor, original_data), "Imported tensor should match original data" - # Modify via imported tensor + # Verify imported tensor is on the symmetric heap + assert ctx.heap.allocator.owns_tensor(imported_tensor), "Imported tensor should be on symmetric heap" + + # Modify via imported tensor - should NOT affect original (copy semantics) imported_tensor.fill_(42.0) assert torch.all(imported_tensor == 42.0), "Imported tensor modifications should work" - # Original tensor should see the change (shared memory) - assert torch.all(original_tensor == 42.0), "Original tensor should see changes via shared memory" - - # Modify via original tensor + # Modify via original tensor - should NOT affect imported tensor (copy semantics) original_tensor.fill_(99.0) assert torch.all(original_tensor == 
99.0), "Original tensor modifications should work" - assert torch.all(imported_tensor == 99.0), "Imported tensor should see changes via shared memory" - # NOW THE CRITICAL PART: Destroy ctx - # This invalidates imported_tensor, but original_tensor should survive! + # Destroy ctx (imported_tensor becomes invalid, but original_tensor survives) del ctx, imported_tensor gc.collect() torch.cuda.synchronize() @@ -239,7 +238,8 @@ def test_vmem_import_external_tensor(): f"Rank {torch.distributed.get_rank() if torch.distributed.is_initialized() else 0}: " f"VMem import external tensor test passed!" ) - print(" ✓ Imported tensor shared memory with original (while ctx alive)") + print(" ✓ Imported tensor has same data as original") + print(" ✓ Imported tensor is on the symmetric heap") print(" ✓ Original tensor survived ctx destruction") print(" ✓ Original tensor still fully functional after ctx destroyed")