From bdb823c6f38028d60793a51674af49e08fa36345 Mon Sep 17 00:00:00 2001 From: ZaneHam Date: Wed, 11 Mar 2026 21:10:42 +1300 Subject: [PATCH 1/3] =?UTF-8?q?WIP:=20MI300X=20kernarg=20debug=20=E2=80=94?= =?UTF-8?q?=20CDNA=20fixes,=20SNAP,=20AMD=20pool=20(partial)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Graph coloring regalloc merge (PR#53) + 3 CDNA fixes (rw_ops ordering, VGPR floor, rw_ops kind mapping). Kernel dispatches without fault/hang on MI300X but all 26 kernarg params read as zero from GPU side. SNAP instrumentation added (--snap flag). kernarg_size clipping fixed. AMD memory pool API types added but not yet wired into dispatch — leading theory is hsa_memory_allocate gives memory SMEM can't read on MI300X, need hsa_amd_memory_pool_allocate instead. Debug AQL packet dump still present in bc_dispatch. Clean up after the pool alloc theory is tested. --- Makefile | 4 +- src/amdgpu/amdgpu.h | 17 + src/amdgpu/emit.c | 804 +++++++++++++++++++++++++++++++++++---- src/amdgpu/encode.c | 20 +- src/amdgpu/isel.c | 523 +++++++++++++++++++------ src/amdgpu/sched.c | 126 +++++- src/amdgpu/verify.c | 7 +- src/fe/parser.c | 67 +++- src/fe/parser.h | 6 + src/fe/sema.c | 8 +- src/ir/bir.h | 2 +- src/ir/bir_cfold.c | 2 +- src/ir/bir_dce.c | 2 +- src/ir/bir_lower.c | 112 +++++- src/ir/bir_mem2reg.c | 2 +- src/ir/bir_print.c | 4 + src/main.c | 4 + src/runtime/bc_runtime.c | 120 +++++- src/runtime/bc_runtime.h | 6 + tests/test_scopy.cu | 38 +- 20 files changed, 1595 insertions(+), 279 deletions(-) diff --git a/Makefile b/Makefile index 710e2c0..b401deb 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ LIBS = -lm SOURCES = src/main.c \ src/fe/bc_err.c src/fe/preproc.c src/fe/lexer.c src/fe/parser.c src/fe/sema.c \ src/ir/bir.c src/ir/bir_print.c src/ir/bir_lower.c src/ir/bir_mem2reg.c src/ir/bir_cfold.c src/ir/bir_dce.c \ - src/amdgpu/isel.c src/amdgpu/emit.c src/amdgpu/encode.c src/amdgpu/enc_tab.c src/amdgpu/sched.c src/amdgpu/verify.c \ + src/amdgpu/amd_rplan.c src/amdgpu/isel.c src/amdgpu/emit.c src/amdgpu/encode.c src/amdgpu/enc_tab.c src/amdgpu/sched.c src/amdgpu/verify.c \ src/tensix/isel.c src/tensix/emit.c src/tensix/coarsen.c src/tensix/datamov.c OBJECTS = $(SOURCES:.c=.o) TARGET = barracuda @@ -39,7 +39,7 @@ TSRC = tests/tmain.c tests/tsmoke.c tests/tcomp.c tests/tenc.c \ tests/tregalloc.c TOBJS = $(TSRC:.c=.o) COBJS = src/ir/bir.o src/ir/bir_print.o src/ir/bir_lower.o src/ir/bir_mem2reg.o src/ir/bir_cfold.o src/ir/bir_dce.o \ - src/amdgpu/encode.o src/amdgpu/enc_tab.o src/amdgpu/isel.o src/amdgpu/emit.o src/amdgpu/sched.o src/amdgpu/verify.o \ + src/amdgpu/amd_rplan.o src/amdgpu/encode.o src/amdgpu/enc_tab.o src/amdgpu/isel.o src/amdgpu/emit.o src/amdgpu/sched.o src/amdgpu/verify.o \ src/fe/bc_err.o src/fe/lexer.o src/fe/parser.o src/fe/preproc.o src/fe/sema.o \ src/runtime/bc_abend.o diff --git a/src/amdgpu/amdgpu.h b/src/amdgpu/amdgpu.h index 06c33a7..b4bfeee 100644 --- a/src/amdgpu/amdgpu.h +++ b/src/amdgpu/amdgpu.h @@ -364,10 +364,22 @@ typedef struct { uint16_t is_kernel; /* 1 for __global__ */ uint16_t wavefront_size; /* 32 */ uint16_t first_alloc_sgpr; /* first SGPR available to regalloc (after param pairs) */ + uint16_t bir_func; /* BIR func index (for rplan BIR scan) */ uint8_t needs_dispatch; /* 1 if kernel uses blockDim/gridDim (dispatch_ptr) */ uint8_t max_dim; /* highest dim used: 0=x, 1=xy, 2=xyz */ uint32_t launch_bounds_max; /* 0 = unconstrained. 
>0 = programmer's optimistic thread count */ uint32_t launch_bounds_min; /* 0 = not set */ + + /* Resource plan — stamped by amd_rplan(), read by isel + emit. + * Target decisions made once. No is_cdna() downstream. + * Like pre-flight checks: argue with the checklist, not the runway. */ + uint8_t exec_w; /* 0=B32 (Wave32), 1=B64 (Wave64) */ + uint8_t smem_hz; /* 1=SMEM→SALU hazard, promote to VALU */ + uint8_t scr_afs; /* 1=architected flat scratch (no prologue) */ + uint8_t rp_pad; + uint16_t imp_sgp; /* implicit system SGPRs (6 on CDNA, 0 on RDNA) */ + uint16_t sgp_min; /* min SGPR block count for KD */ + uint32_t r1_mode; /* RSRC1 static mode bits (IEEE, DX10, WGP, etc) */ } mfunc_t; /* ---- Kernel Descriptor (64 bytes, AMD spec) ---- */ @@ -402,6 +414,7 @@ typedef struct { /* Chip-specific ELF metadata (set once by main, read by emit) */ uint32_t elf_mach; /* e_flags value for this exact chip */ char chip_name[12]; /* "gfx1151" etc. */ + uint8_t snap_mode; /* 1 = photograph the suspects on entry */ minst_t minsts[AMD_MAX_MINSTS]; uint32_t num_minsts; @@ -424,6 +437,7 @@ typedef struct { uint32_t val_vreg[BIR_MAX_INSTS]; /* BIR inst index -> vreg */ uint8_t val_file[BIR_MAX_INSTS]; /* 0=scalar, 1=vector */ uint16_t val_sbase[BIR_MAX_INSTS]; /* SGPR pair base for pointers, 0xFFFF=none */ + int32_t val_scroff[BIR_MAX_INSTS]; /* scratch byte offset, -1=dynamic */ /* Instruction byte offsets (populated by encode_function) */ uint32_t inst_off[AMD_MAX_MINSTS]; @@ -446,6 +460,9 @@ typedef struct { /* ---- Public API ---- */ +/* Resource planning: stamp target decisions onto mfuncs (before isel) */ +void amd_rplan(amd_module_t *A); + /* Compile BIR module to AMDGCN machine IR (divergence + isel) */ int amdgpu_compile(const bir_module_t *bir, amd_module_t *amd); diff --git a/src/amdgpu/emit.c b/src/amdgpu/emit.c index aa1568f..bb761b0 100644 --- a/src/amdgpu/emit.c +++ b/src/amdgpu/emit.c @@ -136,6 +136,38 @@ typedef struct { uint8_t spilled; } live_interval_t; +/* Spill relay registers, reserved physical registers that shuttle + * values between scratch memory and the instruction stream. + * Like a postal depot: your letter (value) gets loaded from the + * warehouse (scratch) into the van (relay), delivered to the + * recipient (instruction), and the van goes back for more. + * + * Both RDNA and CDNA: v250-v252 relays, allocatable v0..v249. + * GFX942 "512 unified VGPRs" is a lie — 256 ArchVGPR + 256 AccVGPR. + * AccVGPRs only speak MFMA. Regular VOP/FLAT encoding is 8-bit, + * so v256+ silently wraps to v0+. We learned this the hard way. + * + * SGPR s99/s98: scalar relays. v_readfirstlane promotes VGPR to SGPR. + * Two relays because some instructions have two spilled SGPR sources. + * One relay? The second load clobbers the first. We learned this + * when k_eff came back 0.000 because every spilled comparison + * compared a value with itself. Whack. */ +#define RA_RELAY_V0 250 /* v250-v252 */ +#define RA_VGPR_CEIL 250 /* v0..v249 allocatable */ +#define RA_NUM_RELAY 3 +#define RA_RELAY_S 99 /* first SGPR relay */ +#define RA_RELAY_S2 98 /* second SGPR relay */ + +/* Spill slot map -- one scratch offset per evicted vreg. + * 512 spills should suffice for any kernel that isn't trying + * to simulate the entire observable universe in registers. 
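+ *
+ * Slot arithmetic (illustrative numbers): slot i lives at
+ * scratch_bytes + i*4, so a kernel with 16 bytes of alloca
+ * scratch parks spill slot 0 at byte 16, slot 1 at byte 20,
+ * and so on. One dword per slot.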
*/ +#define RA_MAX_SPILL 512 +static struct { + uint16_t vreg; + uint16_t off; /* byte offset in scratch memory */ +} ra_spills[RA_MAX_SPILL]; +static uint32_t ra_nspill; + /* Static storage for regalloc (~4 MB) */ static struct { live_interval_t intervals[AMD_MAX_VREGS]; @@ -176,7 +208,9 @@ static uint16_t operand_vreg(const moperand_t *op) return 0xFFFF; } -static void compute_live_intervals(const amd_module_t *A, const mfunc_t *F) +static uint32_t coalesce(amd_module_t *A, const mfunc_t *F); + +static void compute_live_intervals(amd_module_t *A, const mfunc_t *F) { RA.num_intervals = 0; @@ -221,6 +255,183 @@ static void compute_live_intervals(const amd_module_t *A, const mfunc_t *F) } } + /* ---- Coalesce on raw intervals ---- + * Do this BEFORE the back-edge and exec-mask extensions. + * The extensions inflate intervals conservatively, which makes + * nearly every copy pair interfere. On raw [first_def, last_use] + * intervals, the copy point is where src dies and dst is born — + * no overlap. Coalesce first, extend the merged intervals after. + * Order matters: Chaitin before conservatism. */ + coalesce(A, F); + + /* ---- Loop back-edge extension ---- + * The linear scan above computes intervals as [first_def, last_use], + * blissfully ignorant of control flow. A value used inside a loop + * body has its register freed after the last use — but the loop + * iterates, and the next pass finds the register holding someone + * else's laundry. MEMORY_APERTURE_VIOLATION ensues. + * + * Fix: scan for back edges (branch from block B to earlier block H). + * Any interval alive inside the loop [H.first_inst, B.last_inst] + * must extend to B.last_inst. Iterate to fixpoint for nested loops. + */ + { + /* Collect back edges: (header_inst, tail_inst) pairs */ + struct { uint32_t hdr; uint32_t tail; } bedge[64]; + uint32_t n_be = 0; + + for (uint32_t bi = 0; bi < F->num_blocks && n_be < 64; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + if (MB->num_insts == 0) continue; + uint32_t last = MB->first_inst + MB->num_insts - 1; + const minst_t *mi = &A->minsts[last]; + /* Check all operands for label targets */ + uint8_t total = mi->num_defs + mi->num_uses; + if (total > MINST_MAX_OPS) total = MINST_MAX_OPS; + for (uint8_t k = 0; k < total; k++) { + if (mi->operands[k].kind == MOP_LABEL) { + uint32_t tgt = (uint32_t)mi->operands[k].imm; + if (tgt < F->first_block + bi && tgt >= F->first_block) { + const mblock_t *H = &A->mblocks[tgt]; + bedge[n_be].hdr = H->first_inst; + bedge[n_be].tail = last; + n_be++; + } + } + } + } + + /* Extend intervals across back edges — iterate to fixpoint */ + int changed = 1; + int guard = 32; + while (changed && guard-- > 0) { + changed = 0; + for (uint32_t e = 0; e < n_be; e++) { + uint32_t hdr = bedge[e].hdr; + uint32_t tail = bedge[e].tail; + for (uint32_t v = 0; v < A->vreg_count && v < AMD_MAX_VREGS; v++) { + if (RA.intervals[v].start == 0xFFFFFFFF) continue; + /* Only extend values defined AT or BEFORE the loop + * header — these are loop-invariant or PHI results + * that must survive the back edge. Values defined + * INSIDE the loop body get re-materialised each + * iteration and don't need extension. The previous + * condition (start <= tail) was catastrophically + * aggressive: every temporary in the loop got + * extended, register pressure went through the roof, + * the spill path panicked, and the compiler segfaulted + * into a pile of smoking silicon. 
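+ *
+ * Worked example (made-up indices): header at inst 10, back
+ * edge at inst 30. A pointer defined at inst 5 and last used
+ * at inst 12 is live across the header, so it stretches to 30.
+ * A temporary born at inst 15 starts after the header, gets
+ * re-materialised next iteration, and keeps its short interval.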
*/ + if (RA.intervals[v].start <= hdr && + RA.intervals[v].end >= hdr && + RA.intervals[v].end < tail) { + RA.intervals[v].end = tail; + changed = 1; + } + } + } + } + } + + /* ---- Exec mask region extension ---- + * Same principle as above, different villain. Values alive + * across a saveexec→restore pair must survive the entire + * masked region. Without this, the linear scan sees the + * last use inside the mask, frees the register, and some + * RNG temporary moves in. Then exec restores, the formula + * reads the RNG's leftovers, and k_eff goes to zero. + * The hardware does not care about your feelings. */ + { + /* Pair saveexec/restore using a bounded nesting stack. + * Structured control flow means they nest properly. */ + struct { uint32_t save; uint32_t rest; } eregion[64]; + uint32_t n_er = 0; + uint32_t estack[32]; + uint32_t esp = 0; + + for (uint32_t bi = 0; bi < F->num_blocks && n_er < 64; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + uint32_t mi_idx = MB->first_inst + ii; + const minst_t *mi = &A->minsts[mi_idx]; + + /* saveexec → push */ + if (mi->op == AMD_S_AND_SAVEEXEC_B64 || + mi->op == AMD_S_AND_SAVEEXEC_B32) { + if (esp < 32) + estack[esp++] = mi_idx; + continue; + } + + /* exec restore (OR/XOR to EXEC) → pop and record */ + if ((mi->op == AMD_S_OR_B64 || mi->op == AMD_S_OR_B32 || + mi->op == AMD_S_XOR_B64 || mi->op == AMD_S_XOR_B32) && + mi->num_defs > 0 && + mi->operands[0].kind == MOP_SPECIAL && + mi->operands[0].imm == AMD_SPEC_EXEC) { + if (esp > 0 && n_er < 64) { + eregion[n_er].save = estack[--esp]; + eregion[n_er].rest = mi_idx; + n_er++; + } + } + } + } + + /* Extend intervals that straddle a masked region. + * If you were alive before the saveexec and your last + * use is inside the mask, you need to survive until + * the restore — even if nobody mentions you in between. */ + for (uint32_t e = 0; e < n_er; e++) { + uint32_t sav = eregion[e].save; + uint32_t rst = eregion[e].rest; + for (uint32_t v = 0; v < A->vreg_count && v < AMD_MAX_VREGS; v++) { + if (RA.intervals[v].start == 0xFFFFFFFF) continue; + if (RA.intervals[v].start <= sav && + RA.intervals[v].end >= sav && + RA.intervals[v].end < rst) { + RA.intervals[v].end = rst; + } + } + } + } + + /* ---- Prologue SGPR extension ---- + * Function parameters and system SGPRs are defined in the entry + * block and semantically live for the entire kernel — they're + * inputs, not temporaries. The back-edge and exec-mask extensions + * usually get this right, but not always: a scalar param used + * once deep in a nested loop can fall through the cracks when + * diamond patterns eat the extension stack. + * + * Fix: any SGPR defined in the first block whose value escapes + * to a later block gets pinned to the function's last instruction. + * This costs a handful of SGPRs worth of pressure. Cheap + * insurance against the allocator recycling k_eff's register + * to store someone else's loop counter. 
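+ *
+ * Concretely (hypothetical numbers): a scalar param defined at
+ * inst 3 in block 0 whose only use is at inst 40 would normally
+ * free its SGPR right there. If the function runs to inst 200,
+ * the pin stretches the interval to 200 so nobody moves in
+ * behind it.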
*/ + { + uint32_t last_inst = 0; + for (uint32_t bi = 0; bi < F->num_blocks; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + if (MB->num_insts > 0) { + uint32_t ei = MB->first_inst + MB->num_insts - 1; + if (ei > last_inst) last_inst = ei; + } + } + if (F->num_blocks > 0 && last_inst > 0) { + const mblock_t *MB0 = &A->mblocks[F->first_block]; + uint32_t blk0_end = MB0->first_inst + MB0->num_insts; + for (uint32_t v = 0; v < A->vreg_count && v < AMD_MAX_VREGS; v++) { + if (RA.intervals[v].start == 0xFFFFFFFF) continue; + if (RA.intervals[v].file != 0) continue; /* SGPRs only */ + if (RA.intervals[v].start < blk0_end && + RA.intervals[v].end >= blk0_end && + RA.intervals[v].end < last_inst) { + RA.intervals[v].end = last_inst; + } + } + } + } + /* Collect valid intervals */ RA.num_intervals = 0; for (uint32_t v = 0; v < A->vreg_count && v < AMD_MAX_VREGS; v++) { @@ -233,6 +444,96 @@ static void compute_live_intervals(const amd_module_t *A, const mfunc_t *F) qsort(RA.sorted, RA.num_intervals, sizeof(uint32_t), interval_cmp_start); } +/* ---- Chaitin Coalescing (IBM Research, 1981) ---- + * Two vregs joined by a copy whose live ranges don't overlap can + * share one register. The copy evaporates. Chaitin figured this + * out while building the PL.8 register allocator at Yorktown + * Heights; Dewar arrived at the same insight from the SPITBOL + * direction. We're 45 years late to the party but the drinks + * are still good. + * + * This eliminates PHI copies that phi_elim inserted, which is + * the main source of register pressure inflation. A kernel + * that needs 350 vregs often has 150 that are just copy aliases + * of each other. After coalescing, the linear scan sees the + * truth: maybe 200 unique live values, which fit in 250 VGPRs + * without spilling. */ +static uint32_t coalesce(amd_module_t *A, const mfunc_t *F) +{ + int changed = 1; + int guard = 16; + uint32_t n_coal = 0; + uint32_t n_intf = 0, n_cand = 0; + + while (changed && guard-- > 0) { + changed = 0; + for (uint32_t bi = 0; bi < F->num_blocks; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + uint32_t mi_idx = MB->first_inst + ii; + minst_t *mi = &A->minsts[mi_idx]; + + if (mi->op != AMD_PSEUDO_COPY) continue; + if (mi->num_defs != 1 || mi->num_uses != 1) continue; + + uint16_t dst = operand_vreg(&mi->operands[0]); + uint16_t src = operand_vreg(&mi->operands[1]); + if (dst == 0xFFFF || src == 0xFFFF) continue; + if (dst == src) continue; + + /* Same register file — can't merge SGPR with VGPR */ + if (RA.intervals[dst].file != RA.intervals[src].file) + continue; + + /* Both must be live (not already killed by prior merge) */ + if (RA.intervals[dst].start == 0xFFFFFFFF || + RA.intervals[src].start == 0xFFFFFFFF) continue; + + n_cand++; + + /* Interference: strict overlap of closed intervals. 
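+ * Example with made-up points: src [10,42] and dst [42,80]
+ * touch only at the copy itself (42 < 42 is false), so they
+ * merge. src [10,50] and dst [42,80] genuinely overlap
+ * (10 < 80 and 42 < 50), so the copy stays.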
*/ + uint32_t s1 = RA.intervals[src].start; + uint32_t e1 = RA.intervals[src].end; + uint32_t s2 = RA.intervals[dst].start; + uint32_t e2 = RA.intervals[dst].end; + + if (s1 < e2 && s2 < e1) { n_intf++; continue; } + + /* Coalesce: src absorbs dst */ + if (s2 < s1) RA.intervals[src].start = s2; + if (e2 > e1) RA.intervals[src].end = e2; + RA.intervals[dst].start = 0xFFFFFFFF; + RA.intervals[dst].end = 0; + + /* Rename dst→src across the entire function */ + for (uint32_t rb = 0; rb < F->num_blocks; rb++) { + const mblock_t *RB = &A->mblocks[F->first_block + rb]; + for (uint32_t rj = 0; rj < RB->num_insts; rj++) { + minst_t *mj = &A->minsts[RB->first_inst + rj]; + uint8_t tot = mj->num_defs + mj->num_uses; + if (tot > MINST_MAX_OPS) tot = MINST_MAX_OPS; + for (uint8_t k = 0; k < tot; k++) { + if (operand_vreg(&mj->operands[k]) == dst) + mj->operands[k].reg_num = src; + } + } + } + + /* Kill the copy — its work is done */ + mi->op = AMD_S_NOP; + mi->num_defs = 0; + mi->num_uses = 0; + + n_coal++; + changed = 1; + } + } + } + if (n_coal > 0) + fprintf(stderr, " coalesce: %u/%u copies merged\n", n_coal, n_cand); + return n_coal; +} + static void expire_old(uint32_t point) { /* Remove intervals that have ended before this point */ @@ -268,12 +569,30 @@ static void rw_ops(amd_module_t *A, const mfunc_t *F) for (uint8_t k = 0; k < total; k++) { moperand_t *op = &mi->operands[k]; - if (op->kind == MOP_VREG_S) { - op->kind = MOP_SGPR; - op->reg_num = A->reg_map[op->reg_num]; - } else if (op->kind == MOP_VREG_V) { - op->kind = MOP_VGPR; - op->reg_num = A->reg_map[op->reg_num]; + if (op->kind == MOP_VREG_S || op->kind == MOP_VREG_V) { + uint16_t vr = op->reg_num; + /* Use reg_file[] as the authority — operand kind + * can disagree after SMEM→VALU hazard promotion + * (vreg promoted to VGPR file but still referenced + * as MOP_VREG_S at some use sites). */ + if (A->reg_file[vr]) + op->kind = MOP_VGPR; + else + op->kind = MOP_SGPR; + uint16_t phys = A->reg_map[vr]; + if (phys == 0xFFFF) { + /* Spilled vreg: encode slot index with bit 15 + * so the spill resolution pass can find it. + * Without this the 0xFFFF passes straight through + * and the verifier rightfully complains. */ + for (uint32_t si = 0; si < ra_nspill; si++) { + if (ra_spills[si].vreg == vr) { + phys = (uint16_t)(0x8000u | si); + break; + } + } + } + op->reg_num = phys; } } @@ -314,35 +633,6 @@ static void dce_copy(amd_module_t *A, const mfunc_t *F) mi->num_uses = 0; continue; } - - if ((mi->op == AMD_V_MOV_B32 || mi->op == AMD_S_MOV_B32 || - mi->op == AMD_PSEUDO_COPY) && - mi->num_defs == 1 && ii + 1 < MB->num_insts) { - const minst_t *next = &A->minsts[MB->first_inst + ii + 1]; - uint16_t dst_reg = mi->operands[0].reg_num; - uint8_t dst_kind = mi->operands[0].kind; - - int next_uses = 0; - for (uint8_t u = next->num_defs; - u < next->num_defs + next->num_uses && u < MINST_MAX_OPS; u++) { - if (next->operands[u].kind == dst_kind && - next->operands[u].reg_num == dst_reg) - next_uses = 1; - } - - int next_defs = 0; - for (uint8_t d = 0; d < next->num_defs && d < MINST_MAX_OPS; d++) { - if (next->operands[d].kind == dst_kind && - next->operands[d].reg_num == dst_reg) - next_defs = 1; - } - - if (next_defs && !next_uses) { - mi->op = AMD_PSEUDO_DEF; - mi->num_defs = 0; - mi->num_uses = 0; - } - } } } } @@ -387,16 +677,19 @@ static void ra_lin(amd_module_t *A, uint32_t mf_idx) uint16_t sgpr_start = F->is_kernel ? 
F->first_alloc_sgpr : 0; if (sgpr_start < AMD_KERN_MIN_RESERVED && F->is_kernel) sgpr_start = AMD_KERN_MIN_RESERVED; - for (uint16_t r = AMD_MAX_SGPRS; r-- > sgpr_start; ) + for (uint16_t r = AMD_MAX_SGPRS; r-- > sgpr_start; ) { + if (r == RA_RELAY_S || r == RA_RELAY_S2) continue; /* reserved for spill relays */ RA.sgpr_free[RA.num_sgpr_free++] = (uint8_t)r; + } - /* CDNA flat scratch reserves v250:v251 as addr staging pair */ - int cdna_scr = (A->target <= AMD_TARGET_GFX942 && F->scratch_bytes > 0); - for (uint16_t r = AMD_MAX_VGPRS; r-- > 0; ) { - if (cdna_scr && (r == AMD_VGPR_SCR_LO || r == AMD_VGPR_SCR_HI)) - continue; + /* VGPR pool: v0..v249 on both targets. v250-v252 reserved + * for spill relays. GFX942's AccVGPRs are MFMA-only — the + * 8-bit encoding fields in VOP/FLAT literally can't see them. + * We tried. The hardware was unimpressed. */ + for (uint16_t r = RA_VGPR_CEIL; r-- > 0; ) RA.vgpr_free[RA.num_vgpr_free++] = (uint8_t)r; - } + + ra_nspill = 0; compute_live_intervals(A, F); @@ -434,28 +727,60 @@ static void ra_lin(amd_module_t *A, uint32_t mf_idx) } } if (farthest > iv->end && RA.num_active > 0) { - /* Spill the farthest, give its reg to us */ + /* Evict the farthest — commandeer its register. + * The evicted vreg gets a scratch slot and will be + * loaded/stored via relay VGPRs at every use/def. + * Expensive, but better than two values sharing + * one register like flatmates sharing one toothbrush. */ uint32_t sv = RA.active[farthest_idx]; phys = RA.intervals[sv].phys; RA.intervals[sv].spilled = 1; RA.intervals[sv].phys = 0xFFFF; - /* Remove from active */ + A->reg_map[sv] = 0xFFFF; + if (ra_nspill < RA_MAX_SPILL) { + ra_spills[ra_nspill].vreg = (uint16_t)sv; + ra_spills[ra_nspill].off = (uint16_t)(F->scratch_bytes + + ra_nspill * 4u); + ra_nspill++; + } RA.active[farthest_idx] = RA.active[--RA.num_active]; } else { - /* Spill ourselves */ + /* Spill ourselves — no register, straight to scratch. + * The compiler equivalent of being told the hotel is + * full and you'll be sleeping in the car park. */ iv->spilled = 1; - phys = 0; /* fallback */ + phys = 0xFFFF; + if (ra_nspill < RA_MAX_SPILL) { + ra_spills[ra_nspill].vreg = (uint16_t)v; + ra_spills[ra_nspill].off = (uint16_t)(F->scratch_bytes + + ra_nspill * 4u); + ra_nspill++; + } } } iv->phys = phys; A->reg_map[v] = phys; - /* Add to active */ - if (RA.num_active < AMD_MAX_VREGS) + /* Add to active — but NOT self-spilled intervals. Their + * phys=0 "fallback" is a lie, and expire_old would free + * register 0 back to the pool when it shouldn't. That + * corrupts the free list and eventually overflows the + * vgpr_free[256] buffer. Ask me how I know. */ + if (!iv->spilled && RA.num_active < AMD_MAX_VREGS) RA.active[RA.num_active++] = v; } + if (ra_nspill > 0) { + uint32_t vs = 0, ss = 0; + for (uint32_t si = 0; si < ra_nspill; si++) { + if (RA.intervals[ra_spills[si].vreg].file == 0) ss++; + else vs++; + } + fprintf(stderr, " regalloc: %u spills (%uV %uS)\n", + ra_nspill, vs, ss); + } + /* Record usage for kernel descriptor. * Regalloc only tracks its own assigned SGPRs, but kernels also * use system SGPRs (kernarg, TGID) and param pair SGPRs. @@ -465,13 +790,327 @@ static void ra_lin(amd_module_t *A, uint32_t mf_idx) F->num_sgprs = F->first_alloc_sgpr; F->num_vgprs = RA.max_vgpr; - /* CDNA flat scratch: v250:v251 are physical, not regalloc'd */ - if (cdna_scr && F->num_vgprs < AMD_VGPR_SCR_HI + 1) - F->num_vgprs = AMD_VGPR_SCR_HI + 1; - + /* Match old regalloc_function order exactly: + * 1. 
min SGPR/VGPR + launch_bounds + * 2. rw_ops (virtual→physical) + * 3. DCE (self-copy only) + * 4. spill resolution */ fin_regs(A, F); + + /* Rewrite virtual→physical BEFORE spill resolution. + * Spill resolution scans for 0x8000-encoded reg_nums to find + * spilled operands, and reads physical SGPRs from scratch ops + * to find scr_sgpr. Without this, everything is still virtual + * and the spill code sees nothing to resolve. */ rw_ops(A, F); + + /* Dead copy elimination BEFORE spill resolution — matches old + * regalloc_function order. Kill self-copies so spill resolution + * doesn't waste time inserting load/store plumbing around no-ops. */ dce_copy(A, F); + + /* ---- Spill Resolution ---- + * When the register file runs dry, evicted values get parked in + * scratch memory. Now we walk the instruction stream and insert + * the actual load/store plumbing -- scratch_load before every use, + * scratch_store after every def. The relay VGPRs (v250-v252) are + * the middlemen: values hop from scratch to relay to instruction + * and back. + * + * This is the register allocator's overflow car park at the + * airport: slow, far from the terminal, but at least your car + * doesn't get towed. Without this, two values share one register + * and corrupt each other silently. We know because k_eff read + * 0.000 for three days while n_mat quietly aliased to zero. */ + if (ra_nspill > 0) { + /* Find the scratch frame-pointer SGPR by scanning for an + * existing scratch op. If we're spilling, we have scratch. */ + uint16_t scr_sgpr = 0; + for (uint32_t bi = 0; bi < F->num_blocks && scr_sgpr == 0; bi++) { + mblock_t *MB = &A->mblocks[F->first_block + bi]; + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + minst_t *mi = &A->minsts[MB->first_inst + ii]; + if (mi->op == AMD_SCRATCH_LOAD_DWORD || + mi->op == AMD_SCRATCH_STORE_DWORD) { + scr_sgpr = mi->operands[1].reg_num; + break; + } + } + } + + /* Account for spill area in scratch allocation */ + F->scratch_bytes += ra_nspill * 4u; + + /* Hoist scratch FP init to the very first instruction. + * The scheduler may have moved kernarg loads ahead of the + * original s_mov_b32 scrfp,0 — and spill ops around those + * early instructions would fire before the FP is set, + * sending scratch accesses to whatever address the CP + * left in the register. We learned this the hard way: + * TEA pointed to a host address, which is the GPU's + * polite way of saying "that's not yours." */ + if (scr_sgpr > 0) { + mblock_t *B0 = &A->mblocks[F->first_block]; + uint32_t pos = B0->first_inst; + if (A->num_minsts + 1 < AMD_MAX_MINSTS) { + uint32_t tail = A->num_minsts - pos; + memmove(&A->minsts[pos + 1], &A->minsts[pos], + tail * sizeof(minst_t)); + A->num_minsts++; + B0->num_insts++; + for (uint32_t lb = F->first_block + 1; + lb < F->first_block + F->num_blocks; lb++) + A->mblocks[lb].first_inst++; + minst_t *fp = &A->minsts[pos]; + memset(fp, 0, sizeof(minst_t)); + fp->op = AMD_S_MOV_B32; + fp->num_defs = 1; + fp->num_uses = 1; + fp->operands[0].kind = MOP_SGPR; + fp->operands[0].reg_num = scr_sgpr; + fp->operands[1].kind = MOP_IMM; + fp->operands[1].imm = 0; + } + } + + /* Process blocks backwards -- insertions shift later addresses, + * so going backwards means we never revisit shifted territory. 
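+ * Toy illustration: inserting 3 slots at inst 20 shifts insts
+ * 21+ up by 3. Walking 30, 25, 20 means every shift lands on
+ * ground already covered; walking 20, 25, 30 would mean fixing
+ * up both loop indices after every memmove.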
*/ + for (int bi = (int)F->num_blocks - 1; bi >= 0; bi--) { + mblock_t *MB = &A->mblocks[F->first_block + (uint32_t)bi]; + + for (int ii = (int)MB->num_insts - 1; ii >= 0; ii--) { + uint32_t mi_idx = MB->first_inst + (uint32_t)ii; + minst_t *mi = &A->minsts[mi_idx]; + if (mi->op == AMD_PSEUDO_DEF) continue; + + uint8_t total = mi->num_defs + mi->num_uses; + if (total > MINST_MAX_OPS) total = MINST_MAX_OPS; + + /* Detect spilled operands (bit 15 set by rewrite). + * VGPR source = 2 insns (load + wait). + * SGPR source = 3 insns (load + wait + readfirstlane). + * VGPR dest = 1 insn (store). + * SGPR dest = 2 insns (v_mov + store). */ + uint32_t n_insert = 0; + int has_spill = 0; + for (uint8_t k = 0; k < total; k++) { + if (!(mi->operands[k].reg_num & 0x8000u)) continue; + if (mi->operands[k].kind != MOP_VGPR && + mi->operands[k].kind != MOP_SGPR) continue; + has_spill = 1; + int is_sgpr = (mi->operands[k].kind == MOP_SGPR); + if (k < mi->num_defs) + n_insert += is_sgpr ? 2u : 1u; + else + n_insert += is_sgpr ? 3u : 2u; + } + if (!has_spill) continue; + + /* Account for the trailing s_waitcnt after dest stores. + * n_post gets +1 later (line ~955), but n_insert feeds + * the memmove — miss this and the waitcnt writes one + * slot past the gap, overwriting the next instruction. + * The off-by-one that ate six hours of debugging. */ + { + int has_dest_spill = 0; + for (uint8_t k2 = 0; k2 < mi->num_defs; k2++) { + if ((mi->operands[k2].reg_num & 0x8000u) && + (mi->operands[k2].kind == MOP_VGPR || + mi->operands[k2].kind == MOP_SGPR)) { + has_dest_spill = 1; + break; + } + } + if (has_dest_spill) n_insert += 1; + } + + uint32_t tail_start = mi_idx + 1; + uint32_t tail_len = A->num_minsts - tail_start; + if (A->num_minsts + n_insert >= AMD_MAX_MINSTS) continue; + + memmove(&A->minsts[tail_start + n_insert], + &A->minsts[tail_start], + tail_len * sizeof(minst_t)); + A->num_minsts += n_insert; + MB->num_insts += n_insert; + + for (uint32_t later = F->first_block + (uint32_t)bi + 1; + later < F->first_block + F->num_blocks; later++) + A->mblocks[later].first_inst += n_insert; + + mi = &A->minsts[mi_idx]; + + /* Assign relay VGPRs and remember who was scalar. + * Scratch is VMEM — all loads land in VGPRs first. + * Scalar operands then hop across the VGPR/SGPR border + * via v_readfirstlane (sources) or v_mov (dests). + * Two bus changes and a customs declaration, but at + * least nothing gets silently aliased to zero. */ + uint16_t relay[MINST_MAX_OPS]; + uint16_t soff[MINST_MAX_OPS]; + uint8_t was_sgpr[MINST_MAX_OPS]; + uint16_t sgpr_relay[MINST_MAX_OPS]; /* which SGPR relay */ + uint8_t rn = 0; + uint8_t sgpr_rn = 0; /* tracks SGPR relay assignment */ + uint32_t n_pre = 0, n_post = 0; + for (uint8_t k = 0; k < total; k++) { + relay[k] = 0xFFFF; + soff[k] = 0; + was_sgpr[k] = 0; + sgpr_relay[k] = RA_RELAY_S; + if ((mi->operands[k].kind == MOP_VGPR || + mi->operands[k].kind == MOP_SGPR) && + (mi->operands[k].reg_num & 0x8000u)) { + uint16_t si = mi->operands[k].reg_num & 0x7FFFu; + if (si < ra_nspill) soff[k] = ra_spills[si].off; + was_sgpr[k] = (mi->operands[k].kind == MOP_SGPR); + relay[k] = (uint16_t)(RA_RELAY_V0 + + (rn % RA_NUM_RELAY)); + rn++; + if (rn > RA_NUM_RELAY) + fprintf(stderr, " WARN: %u spilled ops in one insn (op %u)\n", rn, mi->op); + if (k < mi->num_defs) { + /* Dest: instruction writes SGPR relay or + * relay VGPR, then we store to scratch. */ + if (was_sgpr[k]) { + sgpr_relay[k] = (sgpr_rn % 2) ? 
RA_RELAY_S2 : RA_RELAY_S; + sgpr_rn++; + mi->operands[k].kind = MOP_SGPR; + mi->operands[k].reg_num = sgpr_relay[k]; + n_post += 2; /* v_mov + store */ + } else { + mi->operands[k].kind = MOP_VGPR; + mi->operands[k].reg_num = relay[k]; + n_post += 1; /* store */ + } + } else { + /* Source: load from scratch into relay VGPR, + * then readfirstlane if the instruction + * expects an SGPR. */ + if (was_sgpr[k]) { + sgpr_relay[k] = (sgpr_rn % 2) ? RA_RELAY_S2 : RA_RELAY_S; + sgpr_rn++; + mi->operands[k].kind = MOP_SGPR; + mi->operands[k].reg_num = sgpr_relay[k]; + n_pre += 3; /* load + wait + rfl */ + } else { + mi->operands[k].kind = MOP_VGPR; + mi->operands[k].reg_num = relay[k]; + n_pre += 2; /* load + wait */ + } + } + } + } + n_post++; /* waitcnt after stores */ + + /* Slide instruction right to make room for source loads. + * Layout: [loads+waits+rfl] [instruction] [mov+stores] */ + uint32_t inst_pos = mi_idx + n_pre; + if (n_pre > 0) { + A->minsts[inst_pos] = A->minsts[mi_idx]; + mi = &A->minsts[inst_pos]; + } + + /* ---- Source loads ---- + * Each spilled source: scratch_load → s_waitcnt → (rfl). + * The scratch→VGPR→SGPR journey: data climbs out of the + * scratch cellar through the VGPR ground floor, then + * takes the lift to the SGPR penthouse. v_readfirstlane + * is the lift — it copies lane 0 from the VGPR to an + * SGPR, which is exactly what you want for a uniform + * scalar value that got evicted during a register + * shortage. All lanes have the same value anyway. */ + uint32_t lp = mi_idx; + for (uint8_t k = mi->num_defs; k < total; k++) { + if (relay[k] == 0xFFFF) continue; + + minst_t *ld = &A->minsts[lp++]; + memset(ld, 0, sizeof(minst_t)); + ld->op = AMD_SCRATCH_LOAD_DWORD; + ld->num_defs = 1; + ld->num_uses = 2; + ld->operands[0].kind = MOP_VGPR; + ld->operands[0].reg_num = relay[k]; + ld->operands[1].kind = MOP_SGPR; + ld->operands[1].reg_num = scr_sgpr; + ld->operands[2].kind = MOP_IMM; + ld->operands[2].imm = (int32_t)soff[k]; + + minst_t *wt = &A->minsts[lp++]; + memset(wt, 0, sizeof(minst_t)); + wt->op = AMD_S_WAITCNT; + wt->flags = AMD_WAIT_VMCNT0; + + /* SGPR source: one more hop. readfirstlane ferries + * lane 0 from the VGPR relay to the SGPR relay. + * Two SGPR relays (s99/s98) prevent the second load + * from clobbering the first. */ + if (was_sgpr[k]) { + minst_t *rf = &A->minsts[lp++]; + memset(rf, 0, sizeof(minst_t)); + rf->op = AMD_V_READFIRSTLANE_B32; + rf->num_defs = 1; + rf->num_uses = 1; + rf->operands[0].kind = MOP_SGPR; + rf->operands[0].reg_num = sgpr_relay[k]; + rf->operands[1].kind = MOP_VGPR; + rf->operands[1].reg_num = relay[k]; + } + } + + /* ---- Dest stores ---- + * Each spilled dest: (v_mov for SGPR) → scratch_store. + * SGPR dests write s99, then v_mov copies s99 into a + * VGPR relay for the scratch_store. This is the + * reverse of the source journey — value descends from + * the SGPR penthouse back down to the scratch cellar. 
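+ *
+ * Rough shape for one spilled SGPR dest (regs and the slot
+ * offset are illustrative):
+ *   s_op       s99, ...        ; instruction writes SGPR relay
+ *   v_mov_b32  v250, s99       ; hop down to the VGPR relay
+ *   scratch_store_dword off, v250, sN offset:16
+ *   s_waitcnt  vmcnt(0)        ; fence before anyone reloads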
*/ + uint32_t sp = inst_pos + 1; + for (uint8_t k = 0; k < mi->num_defs; k++) { + if (relay[k] == 0xFFFF) continue; + + uint16_t store_vgpr = relay[k]; + + if (was_sgpr[k]) { + /* SGPR relay → VGPR relay so scratch_store can reach it */ + minst_t *mv = &A->minsts[sp++]; + memset(mv, 0, sizeof(minst_t)); + mv->op = AMD_V_MOV_B32; + mv->num_defs = 1; + mv->num_uses = 1; + mv->operands[0].kind = MOP_VGPR; + mv->operands[0].reg_num = relay[k]; + mv->operands[1].kind = MOP_SGPR; + mv->operands[1].reg_num = sgpr_relay[k]; + } + + minst_t *st = &A->minsts[sp++]; + memset(st, 0, sizeof(minst_t)); + st->op = AMD_SCRATCH_STORE_DWORD; + st->num_defs = 0; + st->num_uses = 3; + st->operands[0].kind = MOP_VGPR; + st->operands[0].reg_num = store_vgpr; + st->operands[1].kind = MOP_SGPR; + st->operands[1].reg_num = scr_sgpr; + st->operands[2].kind = MOP_IMM; + st->operands[2].imm = (int32_t)soff[k]; + } + + /* Fence: wait for all dest stores to land before any + * later instruction tries to reload from the same slot. + * Without this, scratch_load races scratch_store and + * the relay delivers last Tuesday's value. */ + if (n_post > 1) { + minst_t *wt2 = &A->minsts[sp++]; + memset(wt2, 0, sizeof(minst_t)); + wt2->op = AMD_S_WAITCNT; + wt2->flags = AMD_WAIT_VMCNT0; + } + } + } + } + + /* fin_regs already called before rw_ops (matching old order) */ } /* ---- Graph Coloring Register Allocation ---- */ @@ -1338,6 +1977,7 @@ static void ra_func(amd_module_t *A, uint32_t mf_idx) } else { ra_gc(A, mf_idx); } + } /* ---- Assembly Text Printer ---- */ @@ -1636,6 +2276,19 @@ static void mp_fixarray(uint8_t *buf, uint32_t *pos, uint8_t count) buf[(*pos)++] = (uint8_t)(0x90 | count); } +/* array16 format for counts > 15 (fixarray only handles 0-15) */ +static void mp_array(uint8_t *buf, uint32_t *pos, uint32_t count) +{ + if (count <= 15) { + mp_fixarray(buf, pos, (uint8_t)count); + } else { + if (*pos + 3 > MP_BUF_MAX) return; + buf[(*pos)++] = 0xDC; + buf[(*pos)++] = (uint8_t)(count >> 8); + buf[(*pos)++] = (uint8_t)(count); + } +} + static void mp_fixstr(uint8_t *buf, uint32_t *pos, const char *s) { uint8_t len = (uint8_t)strlen(s); @@ -1813,7 +2466,6 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) if (num_kernels >= 64) break; mfunc_t *F = &A->mfuncs[fi]; - int cdna = (A->target <= AMD_TARGET_GFX942); /* ---- KD → .rodata (64-byte aligned, CP microcode demands it) ---- */ while (rodata_len % 64 != 0 && rodata_len < sizeof(rodata)) @@ -1827,30 +2479,30 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) kd.kernarg_size = F->kernarg_bytes; kd.kernel_code_entry_byte_offset = 0; /* patched after layout */ - /* compute_pgm_rsrc1 — VGPR gran=8 (GFX90A+), SGPR gran=16 (GFX9). + /* compute_pgm_rsrc1 — VGPR gran=8 (GFX90A+). + * SGPR encoding gran=8 (ALL GFX9), alloc gran=16 from pool. * GFX10+ ignores the SGPR field entirely. * GFX9/CDNA: VCC, FLAT_SCRATCH, XNACK_MASK are carved from the - * RSRC1 SGPR allocation. MI300X needs SGPR_BLOCKS >= 2 (i.e. - * >= 48 physical) or kernels with 12+ user SGPRs get Error 700. - * The +6 from the ISA manual is necessary but not sufficient — - * 48 is the empirically proven floor. Don't fly with less. */ + * RSRC1 SGPR allocation, from the top down. The block count + * must cover user SGPRs + 6 (VCC=2, FLAT_SCRATCH=2, XNACK=2) + * or the CP silently aliases them with your live registers. + * That's how you get aperture faults from perfectly valid code. */ uint32_t vgpr_blocks = (F->num_vgprs > 0) ? 
(uint32_t)((F->num_vgprs + 7) / 8 - 1) : 0; - uint32_t sgpr_gran = cdna ? 16u : 8u; - uint32_t total_sgprs = F->num_sgprs; - if (cdna && total_sgprs < 33) total_sgprs = 33; + /* GFX9 SGPR encoding granularity is 8, NOT 16. The ALLOCATION + * granule from the physical pool is 16, but the RSRC1 field + * encodes in units of 8. LLVM getSGPREncodingGranule() = 8. + * Using 16 here causes the hardware to see fewer SGPRs than + * we actually write, aliasing user regs with VCC/FLAT_SCRATCH. */ + uint32_t sgpr_gran = 8u; + uint32_t total_sgprs = F->num_sgprs + F->imp_sgp; uint32_t sgpr_blocks = (total_sgprs > 0) ? (uint32_t)((total_sgprs + sgpr_gran - 1) / sgpr_gran - 1) : 0; + if (F->sgp_min && sgpr_blocks < F->sgp_min) + sgpr_blocks = F->sgp_min; kd.compute_pgm_rsrc1 = (vgpr_blocks & 0x3F) | ((sgpr_blocks & 0xF) << 6) | - (3u << 16) | /* FLOAT_DENORM_MODE_32 = preserve all */ - (3u << 18) | /* FLOAT_DENORM_MODE_16_64 = preserve all */ - (1u << 21) | /* ENABLE_DX10_CLAMP */ - (1u << 23); /* ENABLE_IEEE_MODE */ - if (!cdna) { - kd.compute_pgm_rsrc1 |= (1u << 26) | /* WGP_MODE (RDNA only) */ - (1u << 27); /* MEM_ORDERED (RDNA only) */ - } + F->r1_mode; /* compute_pgm_rsrc2 — [0] SCRATCH_EN, [5:1] USER_SGPR_COUNT, [7] TGID_X, [8] TGID_Y, [9] TGID_Z, [12:11] VGPR_WORKITEM_ID. @@ -1869,7 +2521,7 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) /* compute_pgm_rsrc3 — ACCUM_OFFSET for CDNA (GFX90A/GFX942). * Tells the HW where ArchVGPRs end and AccVGPRs begin. * GFX942 unified VGPRs: all are ArchVGPR, so offset = vgpr_blocks. */ - if (cdna) { + if (F->exec_w) { uint32_t ao_gran = (A->target == AMD_TARGET_GFX942) ? 8u : 4u; uint32_t accum_off = (F->num_vgprs > 0) ? (uint32_t)((F->num_vgprs + ao_gran - 1) / ao_gran - 1) : 0; @@ -1878,9 +2530,10 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) /* kernel_code_properties */ kd.kernel_code_properties = (1u << 3); /* ENABLE_SGPR_KERNARG_PTR */ - /* CDNA flat scratch via src_private_base — does NOT need - * ENABLE_PRIVATE_SEGMENT. Runtime/CP sets up FLAT_SCRATCH - * from SCRATCH_EN + private_segment_fixed_size. */ + /* Bit 0 = ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER — shifts SGPR + * layout on GFX9. SCRATCH_EN (RSRC2) + private_segment_fixed_size + * handle scratch allocation. Tested: bit 0 ON did not fix + * the y=1.0 scratch bug and may cause SGPR shift. */ if (rodata_len + 64 <= sizeof(rodata)) { memcpy(rodata + rodata_len, &kd, 64); @@ -2047,10 +2700,9 @@ int amdgpu_emit_elf(amd_module_t *A, const char *path) BF = &A->bir->funcs[bfi]; break; } uint32_t np = BF ? BF->num_params : 0; - if (np > 15) np = 15; /* 6 hidden args for block_count + group_size if needed */ uint32_t n_hidden = F->needs_dispatch ? 6 : 0; - mp_fixarray(mp_buf, &mp_pos, (uint8_t)(np + n_hidden)); + mp_array(mp_buf, &mp_pos, np + n_hidden); for (uint32_t pi = 0; pi < np; pi++) { int is_ptr = 0; uint32_t arg_sz = 8; /* default 8-byte aligned */ diff --git a/src/amdgpu/encode.c b/src/amdgpu/encode.c index 4951f95..838360c 100644 --- a/src/amdgpu/encode.c +++ b/src/amdgpu/encode.c @@ -230,7 +230,10 @@ static void encode_vop1(amd_module_t *A, const minst_t *mi, uint16_t hw_op) /* [31:25]=0111111 [24:17]=VDST [16:9]=OP [8:0]=SRC0 */ uint32_t literal = 0; int need_lit = 0; - uint8_t vdst = (mi->num_defs > 0 && mi->operands[0].kind == MOP_VGPR) ? + /* v_readfirstlane_b32 has an SGPR destination — accept both files */ + uint8_t vdst = (mi->num_defs > 0 && + (mi->operands[0].kind == MOP_VGPR || + mi->operands[0].kind == MOP_SGPR)) ? 
(uint8_t)mi->operands[0].reg_num : 0; uint16_t src0 = (mi->num_uses > 0) ? encode_vsrc(&mi->operands[mi->num_defs], &literal, &need_lit) : 0; @@ -348,13 +351,24 @@ static void encode_flat_global(amd_module_t *A, const minst_t *mi, uint16_t hw_o /* Default null SADDR (GFX11 default, adjusted per-target below) */ uint32_t saddr = 0x7C; - /* Walk use operands: 1st VGPR=addr, 2nd VGPR=data, SGPR=saddr, IMM=offset */ + /* Walk use operands: 1st VGPR=addr, 2nd VGPR=data, SGPR=saddr, IMM=offset. + * Scratch SADDR-only stores have one VGPR (data only, no VADDR) — + * detect this and put the single VGPR in data, leave addr=0. */ uint8_t use_base = mi->num_defs; int got_addr = 0; + int n_vgprs = 0; + int has_sgpr = 0; + for (uint8_t k = use_base; k < (uint8_t)(use_base + mi->num_uses) && k < MINST_MAX_OPS; k++) { + if (mi->operands[k].kind == MOP_VGPR) n_vgprs++; + if (mi->operands[k].kind == MOP_SGPR) has_sgpr = 1; + } + /* SADDR-only scratch store: single VGPR is data, addr=0 (off) */ + int saddr_only_store = (is_scratch && mi->num_defs == 0 && n_vgprs == 1 && has_sgpr); for (uint8_t k = use_base; k < (uint8_t)(use_base + mi->num_uses) && k < MINST_MAX_OPS; k++) { const moperand_t *op = &mi->operands[k]; if (op->kind == MOP_VGPR) { - if (!got_addr) { addr = (uint8_t)op->reg_num; got_addr = 1; } + if (saddr_only_store) { data = (uint8_t)op->reg_num; } + else if (!got_addr) { addr = (uint8_t)op->reg_num; got_addr = 1; } else { data = (uint8_t)op->reg_num; } } else if (op->kind == MOP_SGPR) { saddr = op->reg_num; diff --git a/src/amdgpu/isel.c b/src/amdgpu/isel.c index 6236930..6375ca0 100644 --- a/src/amdgpu/isel.c +++ b/src/amdgpu/isel.c @@ -25,7 +25,6 @@ static struct { /* Scratch offset tracking */ uint32_t scratch_offset; uint8_t has_scratch; /* BIR pre-scan: function uses alloca */ - uint16_t sgpr_priv; /* CDNA: SGPR pair for src_private_base */ /* LDS (shared memory) offset tracking */ uint32_t lds_offset; @@ -46,14 +45,14 @@ static struct { uint32_t saved_vreg; /* virtual SGPR holding saved EXEC */ uint32_t false_bir; /* BIR block for else path */ uint32_t merge_bir; /* BIR block for merge (post-dominator) */ + uint32_t cond_bir; /* BIR block where saveexec lives (loop header) */ int has_else; /* 1 = diamond (then + else), 0 = triangle */ + int in_then; /* 1 = then-region, 0 = else-region */ } div_stack[MAX_DIV_REGIONS]; uint32_t div_depth; - /* Branch suppression for divergent diamonds: then-block's BR to merge - gets suppressed so it falls through to the else-block. */ - uint32_t suppress_src; /* BIR block whose BR to suppress */ - uint32_t suppress_dst; /* target BIR block of the suppressed BR */ + /* Current machine block index (for fallthrough detection) */ + uint32_t current_mb; /* Saved thread IDs: v0/v1/v2 must be copied before param loads clobber them */ uint32_t saved_tid[3]; /* virtual VGPR holding saved threadIdx.x/y/z */ @@ -66,17 +65,26 @@ static struct { uint8_t needs_dispatch; /* this kernel uses dispatch_ptr */ uint8_t max_dim; /* highest dim needed (0=x, 1=xy, 2=xyz) */ + /* Current machine function (set after MF creation, read by isel_*) */ + mfunc_t *mf; + + /* Scratch frame pointer SGPR (set to 0 at entry, used as SADDR) */ + uint16_t sgpr_scrfp; + /* Hidden kernarg offset for __device__/__constant__ global pointers */ uint32_t hkrarg; + /* SNAP: MVS-style parameter dump for when printf isn't + * an option and staring at assembly isn't a lifestyle. 
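+ *
+ * Buffer layout, one dword per param: param i's low 32 bits land
+ * at snap_buffer + i*4. The host allocates the buffer, leaves its
+ * address at kernarg offset snap_koff, and reads the dwords back
+ * after the dispatch drains. 26 params = a 104-byte evidence bag.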
*/ + uint16_t snap_sgprs[64]; /* which SGPR holds each param */ + uint32_t snap_nparam; /* how many we're watching */ + uint32_t snap_koff; /* kernarg offset of the evidence bag */ + uint16_t snap_base; /* SGPR pair pointing to said bag */ + /* Block mapping: BIR block index -> machine block index */ uint32_t block_map[BIR_MAX_BLOCKS]; } S; -/* ---- Target Helpers ---- */ - -static int is_cdna(void) { return S.amd->target <= AMD_TARGET_GFX942; } - /* ---- Divergence Analysis ---- */ static int is_divergent(uint32_t bir_inst) @@ -121,38 +129,97 @@ static uint32_t get_op(const bir_inst_t *I, uint32_t idx) return BIR_VAL_NONE; } -/* Find the unconditional branch target of a BIR block, or 0xFFFFFFFF if none */ -static uint32_t bir_block_successor(uint32_t bir_bi) -{ - if (bir_bi >= S.bir->num_blocks) return 0xFFFFFFFF; - const bir_block_t *B = &S.bir->blocks[bir_bi]; - if (B->num_insts == 0) return 0xFFFFFFFF; - const bir_inst_t *last = &S.bir->insts[B->first_inst + B->num_insts - 1]; - if (last->op == BIR_BR) - return last->operands[0]; - return 0xFFFFFFFF; -} +/* ---- Block Linearisation ---- + * + * BIR creates blocks in declaration order: outer merge/else BEFORE inner + * then/else for nested control flow. Hardware needs true-block physically + * after divergent branch (cbranch_execz falls through to true-lanes). + * Walk CFG depth-first, suppressing then→merge edges in diamonds so the + * else-block lands next after the then-region. Russian dolls, but with + * worse documentation and more explicit lane masking. */ -/* Find the merge block for a divergent if-then or if-then-else. - Triangle: then→false (false IS the merge). - Diamond: then→merge AND false→merge (both converge). */ -static uint32_t find_merge_block(uint32_t true_bir, uint32_t false_bir) +typedef struct { uint32_t bi; uint32_t supp; } bord_t; +#define BLK_ORD_MAX 8192 +#define BLK_STK_MAX 512 + +static uint32_t s_blk_ord[BLK_ORD_MAX]; +static uint8_t s_blk_vis[BLK_ORD_MAX]; + +static uint32_t build_blk_ord(const bir_func_t *F, const bir_module_t *M) { - uint32_t true_succ = bir_block_successor(true_bir); - if (true_succ == false_bir) - return false_bir; /* triangle: then branches to false/merge */ - uint32_t false_succ = bir_block_successor(false_bir); - if (true_succ != 0xFFFFFFFF && true_succ == false_succ) - return true_succ; /* diamond: both converge */ - return false_bir; /* fallback: treat as triangle */ + uint32_t nb = F->num_blocks; + if (nb > BLK_ORD_MAX) nb = BLK_ORD_MAX; + memset(s_blk_vis, 0, nb); + + uint32_t n = 0; + bord_t stk[BLK_STK_MAX]; + uint32_t top = 0; + + stk[top++] = (bord_t){0, 0xFFFFFFFF}; + + while (top > 0 && n < nb) { + bord_t w = stk[--top]; + if (w.bi >= nb || s_blk_vis[w.bi]) continue; + s_blk_vis[w.bi] = 1; + s_blk_ord[n++] = w.bi; + + uint32_t bir_bi = F->first_block + w.bi; + if (bir_bi >= M->num_blocks) continue; + const bir_block_t *B = &M->blocks[bir_bi]; + if (B->num_insts == 0) continue; + + const bir_inst_t *last = &M->insts[B->first_inst + B->num_insts - 1]; + + if (last->op == BIR_BR) { + uint32_t tgt = last->operands[0]; + if (tgt >= F->first_block && tgt < F->first_block + nb) { + uint32_t rel = tgt - F->first_block; + if (rel != w.supp && top < BLK_STK_MAX) + stk[top++] = (bord_t){rel, w.supp}; + } + } else if (last->op == BIR_BR_COND && last->num_operands >= 4) { + uint32_t T = last->operands[1] - F->first_block; + uint32_t Fb = last->operands[2] - F->first_block; + uint32_t Mb = last->operands[3] - F->first_block; + int has_else = (Fb != Mb); + + /* Push in reverse order 
(LIFO): merge, else, then */ + if (top + 3 <= BLK_STK_MAX) { + stk[top++] = (bord_t){Mb, w.supp}; /* merge last */ + if (has_else) { + stk[top++] = (bord_t){Fb, w.supp}; /* else second */ + stk[top++] = (bord_t){T, Mb}; /* then first */ + } else { + stk[top++] = (bord_t){T, w.supp}; /* triangle */ + } + } + } else if (last->op == BIR_BR_COND) { + /* 3-operand fallback (shouldn't happen, but be safe) */ + uint32_t T = last->operands[1] - F->first_block; + uint32_t Fb = last->operands[2] - F->first_block; + if (top + 2 <= BLK_STK_MAX) { + stk[top++] = (bord_t){Fb, w.supp}; + stk[top++] = (bord_t){T, w.supp}; + } + } + /* BIR_RET, BIR_UNREACHABLE, BIR_SWITCH: no special ordering needed */ + } + + /* Unreachable blocks: append in BIR order so nothing vanishes */ + for (uint32_t bi = 0; bi < nb && n < nb; bi++) { + if (!s_blk_vis[bi]) s_blk_ord[n++] = bi; + } + return n; } /* * Forward dataflow. Seeds: THREAD_ID = divergent, BLOCK_ID/DIM/GRID_DIM = uniform, * constants = uniform, PARAMs = uniform. Propagate: any divergent input -> divergent output. - * PHI: divergent if any incoming value is divergent. + * PHI: divergent if any incoming value is divergent, OR if the PHI's block + * is a merge point of a divergent branch (different EXEC masks on each edge). * Iterate until fixpoint (bounded: each bit set at most once). */ + static void divergence_analysis(const bir_func_t *F) { const bir_module_t *M = S.bir; @@ -206,13 +273,35 @@ static void divergence_analysis(const bir_func_t *F) int any_div = 0; if (I->op == BIR_PHI) { - /* PHI: check value operands (every other one) */ + /* PHI: divergent if any incoming VALUE is divergent */ for (uint32_t k = 1; k < nops; k += 2) { if (val_is_divergent(get_op(I, k))) { any_div = 1; break; } } + /* Also divergent if incoming BLOCKS have divergent + * terminators. Example: `a && b` short-circuits to + * phi [cond: 0], [rhs: result] — both values uniform, + * but which one arrives depends on per-lane divergent + * control flow. Without this, the combined condition + * is treated as uniform and s_cbranch_scc1 replaces + * s_and_saveexec, killing the while loop. */ + if (!any_div) { + for (uint32_t k = 0; k < nops; k += 2) { + uint32_t src_blk = get_op(I, k); + if (src_blk >= BIR_MAX_BLOCKS) continue; + const bir_block_t *SB = &M->blocks[src_blk]; + if (SB->num_insts == 0) continue; + uint32_t term_idx = SB->first_inst + SB->num_insts - 1; + const bir_inst_t *term = &M->insts[term_idx]; + if (term->op == BIR_BR_COND && + val_is_divergent(term->operands[0])) { + any_div = 1; + break; + } + } + } } else if (I->op == BIR_LOAD) { /* Load from divergent address -> divergent */ if (nops > 0 && val_is_divergent(get_op(I, 0))) @@ -530,6 +619,50 @@ static moperand_t ensure_vgpr(moperand_t op) return mop_vreg_v((uint16_t)v); } +/* ---- SNAP: what MVS had in 1972 and CUDA still doesn't ---- + * Each kernel parameter's SGPR value gets written to a host-visible + * diagnostic buffer. When things go sideways you read the buffer + * instead of staring at assembly like it owes you money. */ +static void snap_emit(void) +{ + if (!S.amd->snap_mode || !S.is_kernel || S.snap_nparam == 0) return; + + /* Where shall we send the evidence? 
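+ * The host parks the buffer pointer at kernarg+snap_koff, so it
+ * loads like any other pointer param: s_load_dwordx2 into the
+ * next even-aligned SGPR pair. For a 26-param kernel with no
+ * hidden dispatch args that means kernarg+208 (26 * 8 bytes).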
*/ + uint16_t sb = S.next_param_sgpr; + if (sb & 1) sb++; + S.next_param_sgpr = sb + 2; + S.snap_base = sb; + + emit2(AMD_S_LOAD_DWORDX2, mop_sgpr(sb), + mop_sgpr(S.sgpr_kernarg), mop_imm((int32_t)S.snap_koff)); + emit_wait_smem(); + + /* Photograph each suspect and file it in the buffer */ + uint32_t np = S.snap_nparam; + if (np > 64) np = 64; + + for (uint32_t i = 0; i < np; i++) { + uint32_t voff = new_vreg(1); + uint32_t vdat = new_vreg(1); + emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)voff), + mop_imm((int32_t)(i * 4))); + emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)vdat), + mop_sgpr(S.snap_sgprs[i])); + + moperand_t ops[MINST_MAX_OPS]; + ops[0] = mop_vreg_v((uint16_t)voff); + ops[1] = mop_vreg_v((uint16_t)vdat); + ops[2] = mop_sgpr(sb); + emit_minst(AMD_GLOBAL_STORE_DWORD, 0, 3, ops, 0); + } + + /* Wait for the photographs to develop before proceeding */ + emit0_0(AMD_S_WAITCNT, AMD_WAIT_VMCNT0); + + printf(" snap: %u params instrumented, buffer at kernarg+%u\n", + np, S.snap_koff); +} + /* Get the BIR type width in bits. Default 32 for pointers, etc. */ static int bir_type_width(uint32_t tidx) { @@ -672,7 +805,7 @@ static void isel_arith(uint32_t idx, const bir_inst_t *I, int div) /* CDNA s_add/s_sub hazard: yields 0 when both operands come * fresh from SMEM loads. Promote to VALU even when both are * scalar vregs — the pipe hasn't settled yet. */ - if (!vprom && is_cdna() && + if (!vprom && S.mf->smem_hz && (I->op == BIR_ADD || I->op == BIR_SUB) && src0.kind == MOP_VREG_S && src1.kind == MOP_VREG_S) vprom = 1; @@ -1156,18 +1289,26 @@ static void isel_load(uint32_t idx, const bir_inst_t *I, int div) break; } case BIR_AS_PRIVATE: { - moperand_t vaddr = ensure_vgpr(resolve_val(I->operands[0], div)); - if (is_cdna()) { - /* Fence preceding flat_stores before reading back */ - emit_wait_vm(); - /* v250 = FLAT_SCRATCH_LO + alloca_offset (wave scratch base) */ - emit2(AMD_V_ADD_U32, mop_vgpr(AMD_VGPR_SCR_LO), - mop_sgpr(S.sgpr_priv), vaddr); - emit2(AMD_FLAT_LOAD_DWORD, mop_vreg_v((uint16_t)vr), - mop_vgpr(AMD_VGPR_SCR_LO), mop_imm(0)); + /* Check for constant scratch offset (alloca + constant GEP) */ + int32_t scr_off = -1; + if (I->operands[0] != BIR_VAL_NONE && !BIR_VAL_IS_CONST(I->operands[0])) { + uint32_t si = BIR_VAL_INDEX(I->operands[0]); + if (si < BIR_MAX_INSTS) scr_off = S.amd->val_scroff[si]; + } + moperand_t ops[MINST_MAX_OPS]; + ops[0] = mop_vreg_v((uint16_t)vr); + if (scr_off >= 0 && scr_off < 4096) { + /* SADDR-only: scratch_load_dword vdst, off, sN offset:K */ + ops[1] = mop_sgpr(S.sgpr_scrfp); + ops[2] = mop_imm(scr_off); + emit_minst(AMD_SCRATCH_LOAD_DWORD, 1, 2, ops, 0); } else { - emit2(AMD_SCRATCH_LOAD_DWORD, mop_vreg_v((uint16_t)vr), - vaddr, mop_imm(0)); + /* SVS fallback: VADDR + SADDR */ + moperand_t vaddr = ensure_vgpr(resolve_val(I->operands[0], div)); + ops[1] = vaddr; + ops[2] = mop_sgpr(S.sgpr_scrfp); + ops[3] = mop_imm(0); + emit_minst(AMD_SCRATCH_LOAD_DWORD, 1, 3, ops, 0); } emit_wait_vm(); break; @@ -1211,19 +1352,27 @@ static void isel_store(const bir_inst_t *I, int div) break; } case BIR_AS_PRIVATE: { - moperand_t vaddr = ensure_vgpr(resolve_val(I->operands[1], div)); + /* Check for constant scratch offset */ + int32_t scr_off = -1; + if (I->operands[1] != BIR_VAL_NONE && !BIR_VAL_IS_CONST(I->operands[1])) { + uint32_t si = BIR_VAL_INDEX(I->operands[1]); + if (si < BIR_MAX_INSTS) scr_off = S.amd->val_scroff[si]; + } moperand_t ops[MINST_MAX_OPS]; - if (is_cdna()) { - /* v250 = FLAT_SCRATCH_LO + alloca_offset (wave scratch base) */ - emit2(AMD_V_ADD_U32, 
mop_vgpr(AMD_VGPR_SCR_LO), - mop_sgpr(S.sgpr_priv), vaddr); - ops[0] = mop_vgpr(AMD_VGPR_SCR_LO); - ops[1] = ensure_vgpr(val); - ops[2] = mop_imm(0); - emit_minst(AMD_FLAT_STORE_DWORD, 0, 3, ops, 0); - } else { - ops[0] = vaddr; ops[1] = ensure_vgpr(val); ops[2] = mop_imm(0); + if (scr_off >= 0 && scr_off < 4096) { + /* SADDR-only: scratch_store_dword off, vdata, sN offset:K */ + ops[0] = ensure_vgpr(val); + ops[1] = mop_sgpr(S.sgpr_scrfp); + ops[2] = mop_imm(scr_off); emit_minst(AMD_SCRATCH_STORE_DWORD, 0, 3, ops, 0); + } else { + /* SVS fallback: VADDR + SADDR */ + moperand_t vaddr = ensure_vgpr(resolve_val(I->operands[1], div)); + ops[0] = vaddr; + ops[1] = ensure_vgpr(val); + ops[2] = mop_sgpr(S.sgpr_scrfp); + ops[3] = mop_imm(0); + emit_minst(AMD_SCRATCH_STORE_DWORD, 0, 4, ops, 0); } break; } @@ -1251,10 +1400,30 @@ static void isel_gep(uint32_t idx, const bir_inst_t *I, int div) } if (sbase != 0xFFFF) { - /* saddr path: propagate SGPR pair, compute 32-bit VGPR offset */ + /* saddr path: propagate SGPR pair, compute 32-bit VGPR offset. + * + * Param base offsets are always 0 — re-materialise a fresh + * zero each time instead of referencing the original VGPR. + * The linear-scan regalloc doesn't extend live ranges across + * loop back edges, so the param VGPR can be clobbered inside + * a loop body and the next iteration reads garbage. Fresh + * vreg, short live range, no surprises. */ S.amd->val_sbase[idx] = sbase; - moperand_t base_off = ensure_vgpr(resolve_val(base_val, 1)); + moperand_t base_off; + int is_param = 0; + if (!BIR_VAL_IS_CONST(base_val) && base_val != BIR_VAL_NONE) { + uint32_t bi = BIR_VAL_INDEX(base_val); + if (bi < S.bir->num_insts && S.bir->insts[bi].op == BIR_PARAM) + is_param = 1; + } + if (is_param) { + uint32_t fresh = new_vreg(1); + emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)fresh), mop_imm(0)); + base_off = mop_vreg_v((uint16_t)fresh); + } else { + base_off = ensure_vgpr(resolve_val(base_val, 1)); + } moperand_t acc = base_off; for (uint32_t k = 1; k < nops; k++) { @@ -1297,7 +1466,7 @@ static void isel_gep(uint32_t idx, const bir_inst_t *I, int div) acc = mop_vreg_v((uint16_t)tmp); } emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)vr), acc); - } else if (is_cdna()) { + } else if (S.mf->smem_hz) { /* CDNA GEP: the MI300X s_add_i32 zero-result errata strikes * again. Two SMEM-sourced operands → s_add returns 0, your * pointer goes to la-la land, and the GPU segfaults with the @@ -1334,6 +1503,28 @@ static void isel_gep(uint32_t idx, const bir_inst_t *I, int div) } emit1(AMD_S_MOV_B32, mop_vreg_s((uint16_t)vr), acc); } + + /* Propagate constant scratch offset through GEP. + * If base is an alloca (or prior GEP) with known offset and all + * indices are constants, compute the new offset for SADDR-only mode. 
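+ *
+ * Example (made-up offsets): alloca at scratch offset 16, GEP
+ * with constant index 3 over 4-byte elements gives 16 + 3*4 = 28,
+ * and the eventual load folds it straight in as
+ *   scratch_load_dword v, off, sN offset:28
+ * with zero address math in VGPRs. One dynamic index anywhere in
+ * the chain and we fall back to the VADDR+SADDR form.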
*/ + if (idx < BIR_MAX_INSTS && !BIR_VAL_IS_CONST(base_val) && base_val != BIR_VAL_NONE) { + uint32_t bi = BIR_VAL_INDEX(base_val); + if (bi < BIR_MAX_INSTS && S.amd->val_scroff[bi] >= 0) { + int32_t off = S.amd->val_scroff[bi]; + int all_const = 1; + for (uint32_t k = 1; k < nops && all_const; k++) { + uint32_t opval = get_op(I, k); + if (BIR_VAL_IS_CONST(opval)) { + int32_t cv = (int32_t)S.bir->consts[BIR_VAL_INDEX(opval)].d.ival; + off += cv * (int32_t)elem_sz; + } else { + all_const = 0; + } + } + if (all_const) + S.amd->val_scroff[idx] = off; + } + } } static void isel_alloca(uint32_t idx, const bir_inst_t *I) @@ -1342,7 +1533,11 @@ static void isel_alloca(uint32_t idx, const bir_inst_t *I) uint32_t align = 1u << I->subop; S.scratch_offset = (S.scratch_offset + align - 1) & ~(align - 1); - /* Allocate a vreg holding the scratch offset */ + /* Record constant scratch offset for immediate folding */ + if (idx < BIR_MAX_INSTS) + S.amd->val_scroff[idx] = (int32_t)S.scratch_offset; + + /* Allocate a vreg holding the scratch offset (for dynamic GEPs) */ uint32_t vr = map_bir_val(idx, 1); /* v_mov_b32 vr, scratch_offset */ emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)vr), mop_imm((int32_t)S.scratch_offset)); @@ -1393,15 +1588,54 @@ static void isel_branch(const bir_inst_t *I) uint32_t target_bir = I->operands[0]; if (target_bir >= BIR_MAX_BLOCKS) return; - /* Suppress then→merge branches in divergent diamonds. - The then-block falls through to the else-block instead. */ - if (S.suppress_src == S.current_bir_block && - S.suppress_dst == target_bir) { - S.suppress_src = 0xFFFFFFFF; - S.suppress_dst = 0xFFFFFFFF; - return; + /* Suppress then→merge branches in active divergent diamonds. + * In a diamond, the then-path falls through to the else-block + * instead of jumping to merge. Works for nested structures too — + * inner merge→outer merge gets suppressed at any depth. + * Like a shift handover: you don't leave until the next crew arrives. */ + for (uint32_t di = 0; di < S.div_depth; di++) { + if (S.div_stack[di].has_else && S.div_stack[di].in_then && + S.div_stack[di].merge_bir == target_bir) { + return; /* suppress: fall through to else instead */ + } } - emit0_1(AMD_S_BRANCH, mop_label(S.block_map[target_bir])); + + /* Back-edge to divergent loop header: restore EXEC before re-entry. + * Without this, saveexec AND's the already-narrowed EXEC on every + * iteration. Lanes that die during physics stay dead forever — the + * mask is a one-way valve that only removes threads. + * + * A while(a && b) compiles to multiple BIR blocks (while.cond → + * land.rhs → land.end), each potentially pushing a div_stack entry. + * The back-edge targets while.cond, so we must restore ALL entries + * pushed since while.cond — innermost first, like unwinding a stack. + * The GPU equivalent of "everyone back to starting positions." */ + { + uint32_t target_mb = S.block_map[target_bir]; + if (target_mb <= S.current_mb) { + /* Back-edge detected. Restore every div_stack entry whose + * saveexec was pushed in a block at or after the target. */ + for (uint32_t di = S.div_depth; di > 0; di--) { + uint32_t cb = S.div_stack[di - 1].cond_bir; + uint32_t cb_mb = S.block_map[cb]; + if (cb_mb >= target_mb) { + uint16_t sv = (uint16_t)S.div_stack[di - 1].saved_vreg; + moperand_t sv_op = S.mf->exec_w ? + mop_sgpr(sv) : mop_vreg_s(sv); + emit2(S.mf->exec_w ? 
AMD_S_OR_B64 : AMD_S_OR_B32, + mop_special(AMD_SPEC_EXEC), + mop_special(AMD_SPEC_EXEC), sv_op); + } + } + } + } + + /* Fallthrough: if target is the next physical block, skip the branch */ + uint32_t target_mb = S.block_map[target_bir]; + if (target_mb == S.current_mb + 1) + return; + + emit0_1(AMD_S_BRANCH, mop_label(target_mb)); } static void isel_br_cond(const bir_inst_t *I, int cond_div) @@ -1422,7 +1656,7 @@ static void isel_br_cond(const bir_inst_t *I, int cond_div) emit0_2(AMD_V_CMP_NE_U32, mop_imm(0), vcond); uint32_t saved; - if (is_cdna()) { + if (S.mf->exec_w) { /* Wave64: reuse SGPR pairs by depth — dead after merge */ if (S.esave_base == 0xFFFF) { S.esave_base = S.next_param_sgpr; @@ -1442,24 +1676,20 @@ static void isel_br_cond(const bir_inst_t *I, int cond_div) emit0_1(AMD_S_CBRANCH_EXECZ, mop_label(false_mb)); /* Fall through to true block (next in layout) */ - /* Find the merge point and record the divergent region */ - uint32_t merge_bir = find_merge_block(true_bir, false_bir); + /* Merge point: operand[3] if available, else fall back to false */ + uint32_t merge_bir = (I->num_operands >= 4) ? + I->operands[3] : false_bir; int has_else = (merge_bir != false_bir); if (S.div_depth < MAX_DIV_REGIONS) { S.div_stack[S.div_depth].saved_vreg = saved; S.div_stack[S.div_depth].false_bir = false_bir; S.div_stack[S.div_depth].merge_bir = merge_bir; + S.div_stack[S.div_depth].cond_bir = S.current_bir_block; S.div_stack[S.div_depth].has_else = has_else; + S.div_stack[S.div_depth].in_then = 1; S.div_depth++; } - - /* For diamonds, suppress then-block's branch to merge - so it falls through to the else-block */ - if (has_else) { - S.suppress_src = true_bir; - S.suppress_dst = merge_bir; - } } else { /* Uniform branch: compare and branch on SCC */ moperand_t cond = resolve_val(I->operands[0], 0); @@ -1579,6 +1809,10 @@ static void isel_param(uint32_t idx, const bir_inst_t *I) if (base_sgpr & 1) base_sgpr++; S.next_param_sgpr = base_sgpr + 2; + /* SNAP: note which drawer this param lives in */ + if (S.amd->snap_mode && S.snap_nparam < 64) + S.snap_sgprs[S.snap_nparam++] = base_sgpr; + emit2(AMD_S_LOAD_DWORDX2, mop_sgpr(base_sgpr), mop_sgpr(S.sgpr_kernarg), mop_imm((int32_t)offset)); emit_wait_smem(); @@ -1590,10 +1824,26 @@ static void isel_param(uint32_t idx, const bir_inst_t *I) S.amd->reg_file[vr] = 1; emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)vr), mop_imm(0)); } else { - uint32_t vr = map_bir_val(idx, 0); - emit2(AMD_S_LOAD_DWORD, mop_vreg_s((uint16_t)vr), + /* GFX942 workaround: s_load_dword mixed with a burst of + s_load_dwordx2 causes MEMORY_APERTURE_VIOLATION. + Promote scalar params to dwordx2 via physical SGPR pair, + then s_mov the low half into a vreg for normal SSA flow. */ + uint16_t base_sgpr = S.next_param_sgpr; + if (base_sgpr + 1 >= AMD_MAX_SGPRS) return; + if (base_sgpr & 1) base_sgpr++; + S.next_param_sgpr = base_sgpr + 2; + + /* SNAP: record which SGPR pair holds this param */ + if (S.amd->snap_mode && S.snap_nparam < 64) + S.snap_sgprs[S.snap_nparam++] = base_sgpr; + + emit2(AMD_S_LOAD_DWORDX2, mop_sgpr(base_sgpr), mop_sgpr(S.sgpr_kernarg), mop_imm((int32_t)offset)); emit_wait_smem(); + + uint32_t vr = map_bir_val(idx, 0); + emit1(AMD_S_MOV_B32, mop_vreg_s((uint16_t)vr), + mop_sgpr(base_sgpr)); } } else { /* Device function: params in v0, v1, ... 
*/ @@ -1603,6 +1853,10 @@ static void isel_param(uint32_t idx, const bir_inst_t *I) if (param_idx < 32) emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)vr), mop_vgpr((uint16_t)param_idx)); } + + /* SNAP: last param loaded — take the photograph */ + if (S.amd->snap_mode && S.is_kernel && param_idx + 1 == S.num_params) + snap_emit(); } static void isel_thread_model(uint32_t idx, const bir_inst_t *I) @@ -1668,12 +1922,13 @@ static void isel_barrier(void) static void isel_atomic(uint32_t idx, const bir_inst_t *I, int div) { /* ops[0] = address, ops[1] = value (ops[2] = compare for CAS) */ - moperand_t addr = ensure_vgpr(resolve_val(I->operands[0], div)); uint32_t nops = get_num_ops(I); uint32_t ptr_type = 0; + uint16_t sbase = 0xFFFF; if (I->operands[0] != BIR_VAL_NONE && !BIR_VAL_IS_CONST(I->operands[0])) { uint32_t si = BIR_VAL_INDEX(I->operands[0]); if (si < S.bir->num_insts) ptr_type = S.bir->insts[si].type; + if (si < BIR_MAX_INSTS) sbase = S.amd->val_sbase[si]; } int as = get_addrspace(ptr_type); @@ -1682,6 +1937,7 @@ static void isel_atomic(uint32_t idx, const bir_inst_t *I, int div) if (as == BIR_AS_SHARED) { /* DS atomics */ + moperand_t addr = ensure_vgpr(resolve_val(I->operands[0], div)); moperand_t val = (nops > 1) ? ensure_vgpr(resolve_val(I->operands[1], 1)) : mop_imm(0); uint16_t ds_op; switch (I->op) { @@ -1715,11 +1971,19 @@ static void isel_atomic(uint32_t idx, const bir_inst_t *I, int div) } if (I->op == BIR_ATOMIC_CAS && nops > 2) { moperand_t cmp = ensure_vgpr(resolve_val(I->operands[2], 1)); + moperand_t ca = ensure_vgpr(resolve_val(I->operands[0], div)); + moperand_t ops[MINST_MAX_OPS]; + ops[0] = dst; ops[1] = ca; ops[2] = val; ops[3] = cmp; + emit_minst(glb_op, 1, 3, ops, AMD_FLAG_GLC); + } else if (sbase != 0xFFFF) { + /* saddr form: vOffset, vData, s[base:base+1] */ + moperand_t voff = ensure_vgpr(resolve_val(I->operands[0], 1)); moperand_t ops[MINST_MAX_OPS]; - ops[0] = dst; ops[1] = addr; ops[2] = val; ops[3] = cmp; + ops[0] = dst; ops[1] = voff; ops[2] = val; ops[3] = mop_sgpr(sbase); emit_minst(glb_op, 1, 3, ops, AMD_FLAG_GLC); } else { - emit2f(glb_op, dst, addr, val, AMD_FLAG_GLC); + moperand_t va = ensure_vgpr(resolve_val(I->operands[0], div)); + emit2f(glb_op, dst, va, val, AMD_FLAG_GLC); } emit_wait_vm(); } @@ -2003,8 +2267,7 @@ static void isel_function(uint32_t fi) S.lds_offset = 0; S.hkrarg = F->num_params * 8; S.div_depth = 0; - S.suppress_src = 0xFFFFFFFF; - S.suppress_dst = 0xFFFFFFFF; + S.current_mb = 0; S.current_bir_block = 0; /* Scan BIR to determine hidden kernarg needs */ @@ -2025,10 +2288,22 @@ static void isel_function(uint32_t fi) if (S.is_kernel && S.needs_dispatch) S.hkrarg = (uint32_t)(F->num_params * 8) + 24u; + /* SNAP: reserve 8 bytes for the diagnostic buffer pointer. + * The host leaves a forwarding address here. We write each + * param's value to it on entry, like a witness statement. */ + S.snap_nparam = 0; + S.snap_koff = 0; + S.snap_base = 0; + if (A->snap_mode && S.is_kernel) { + S.snap_koff = S.hkrarg; + S.hkrarg += 8; + } + /* Pointer params get physical SGPR pairs starting after reserved regs */ S.next_param_sgpr = S.kern_reserved; if (S.next_param_sgpr & 1) S.next_param_sgpr++; /* align to even */ S.esave_base = 0xFFFF; + S.sgpr_scrfp = 0; /* set in prologue if scratch used */ S.max_ddep = 0; /* Skip host-only functions */ @@ -2044,7 +2319,7 @@ static void isel_function(uint32_t fi) MF->name = F->name; MF->first_block = A->num_mblocks; MF->is_kernel = S.is_kernel; - MF->wavefront_size = is_cdna() ? 
AMD_WAVE64 : AMD_WAVE_SIZE; + MF->bir_func = (uint16_t)fi; MF->lds_bytes = 0; MF->scratch_bytes = 0; MF->kernarg_bytes = F->num_params * 8; @@ -2053,17 +2328,40 @@ static void isel_function(uint32_t fi) MF->needs_dispatch = S.needs_dispatch; MF->max_dim = S.max_dim; - /* Pre-create machine blocks and build block map */ - for (uint32_t bi = 0; bi < F->num_blocks; bi++) { - uint32_t bir_bi = F->first_block + bi; - uint32_t mb_idx = A->num_mblocks + bi; + /* Stamp resource plan — target decisions made once, right here. + * Downstream reads MF fields, never interrogates the target enum. */ + { + int cdna = (A->target <= AMD_TARGET_GFX942); + MF->exec_w = cdna ? 1 : 0; + MF->smem_hz = cdna ? 1 : 0; + MF->scr_afs = cdna ? 1 : 0; + MF->rp_pad = 0; + MF->imp_sgp = cdna ? 6 : 0; + MF->sgp_min = cdna ? 2 : 0; + MF->wavefront_size = cdna ? AMD_WAVE64 : AMD_WAVE_SIZE; + MF->r1_mode = (3u << 16) | (3u << 18) | (1u << 21) | (1u << 23); + if (!cdna) + MF->r1_mode |= (1u << 26) | (1u << 27); + } + S.mf = MF; + + /* Build execution-order block list. Nested diamonds must be fully + * contained within their parent's then/else regions, otherwise the + * hardware falls through into someone else's merge block. */ + uint32_t n_exec = build_blk_ord(F, M); + + /* Pre-create machine blocks in execution order */ + for (uint32_t i = 0; i < n_exec; i++) { + uint32_t bir_bi = F->first_block + s_blk_ord[i]; + uint32_t mb_idx = A->num_mblocks + i; if (mb_idx >= AMD_MAX_MBLOCKS) break; S.block_map[bir_bi] = mb_idx; A->mblocks[mb_idx].bir_block = bir_bi; } - /* Select instructions block by block */ - for (uint32_t bi = 0; bi < F->num_blocks; bi++) { + /* Select instructions in execution order */ + for (uint32_t i = 0; i < n_exec; i++) { + uint32_t bi = s_blk_ord[i]; uint32_t bir_bi = F->first_block + bi; const bir_block_t *B = &M->blocks[bir_bi]; uint32_t mb_idx = A->num_mblocks; @@ -2073,6 +2371,7 @@ static void isel_function(uint32_t fi) MB->first_inst = A->num_minsts; MB->bir_block = bir_bi; S.current_bir_block = bir_bi; + S.current_mb = mb_idx; /* EXEC mask restore for divergent regions. At else-block start: flip to false lanes. @@ -2080,10 +2379,12 @@ static void isel_function(uint32_t fi) Like changing shifts — the work never stops, the crew just rotates. */ for (uint32_t di = 0; di < S.div_depth; di++) { if (S.div_stack[di].has_else && S.div_stack[di].false_bir == bir_bi) { - /* Else block: xor EXEC to get false lanes */ + /* Else block: transition from then-region to else-region */ + S.div_stack[di].in_then = 0; + /* xor EXEC to get false lanes */ uint16_t sv = (uint16_t)S.div_stack[di].saved_vreg; - moperand_t sv_op = is_cdna() ? mop_sgpr(sv) : mop_vreg_s(sv); - emit2(is_cdna() ? AMD_S_XOR_B64 : AMD_S_XOR_B32, + moperand_t sv_op = S.mf->exec_w ? mop_sgpr(sv) : mop_vreg_s(sv); + emit2(S.mf->exec_w ? AMD_S_XOR_B64 : AMD_S_XOR_B32, mop_special(AMD_SPEC_EXEC), mop_special(AMD_SPEC_EXEC), sv_op); /* If all false lanes are off, skip else body */ @@ -2095,8 +2396,8 @@ static void isel_function(uint32_t fi) if (S.div_stack[di - 1].merge_bir == bir_bi) { /* Merge block: restore all lanes */ uint16_t sv = (uint16_t)S.div_stack[di - 1].saved_vreg; - moperand_t sv_op = is_cdna() ? mop_sgpr(sv) : mop_vreg_s(sv); - emit2(is_cdna() ? AMD_S_OR_B64 : AMD_S_OR_B32, + moperand_t sv_op = S.mf->exec_w ? mop_sgpr(sv) : mop_vreg_s(sv); + emit2(S.mf->exec_w ? 
AMD_S_OR_B64 : AMD_S_OR_B32, mop_special(AMD_SPEC_EXEC), mop_special(AMD_SPEC_EXEC), sv_op); /* Pop the region */ @@ -2113,20 +2414,14 @@ static void isel_function(uint32_t fi) emit1(AMD_V_MOV_B32, mop_vreg_v((uint16_t)S.saved_tid[d]), mop_vgpr((uint16_t)d)); } - /* CDNA flat scratch: read src_private_base hi word into v251. - scratch_store doesn't work on GFX942 — HIP uses flat_store - with the private aperture tag instead. Cheers, AMD. */ - if (is_cdna() && S.has_scratch) { - uint16_t sp = S.next_param_sgpr; - if (sp & 1) sp++; - S.sgpr_priv = sp; - S.next_param_sgpr = sp + 2; - /* s_mov_b64 s[sp:sp+1], src_private_base */ - emit1(AMD_S_MOV_B64, mop_sgpr(sp), - mop_special(AMD_SPEC_PRIV_BASE)); - /* v_mov_b32 v251, s[sp+1] (aperture tag) */ - emit1(AMD_V_MOV_B32, mop_vgpr(AMD_VGPR_SCR_HI), - mop_sgpr((uint16_t)(sp + 1))); + + /* Scratch frame pointer: SADDR for scratch_load/store. + * GFX942 architected flat scratch: CP sets FLAT_SCRATCH. + * We just need an SGPR=0 as the frame base offset. */ + if (S.has_scratch) { + S.sgpr_scrfp = S.next_param_sgpr++; + emit1(AMD_S_MOV_B32, mop_sgpr(S.sgpr_scrfp), + mop_imm(0)); } } @@ -2282,7 +2577,7 @@ static void isel_function(uint32_t fi) MF->kernarg_bytes = S.hkrarg; MF->lds_bytes = (uint16_t)S.lds_offset; MF->first_alloc_sgpr = S.next_param_sgpr; - if (is_cdna() && S.esave_base != 0xFFFF) { + if (S.mf->exec_w && S.esave_base != 0xFFFF) { uint16_t etop = (uint16_t)(S.esave_base + S.max_ddep * 2); if (etop > MF->first_alloc_sgpr) MF->first_alloc_sgpr = etop; @@ -2311,11 +2606,15 @@ int amdgpu_compile(const bir_module_t *bir, amd_module_t *amd) memset(amd->reg_map, 0, sizeof(amd->reg_map)); memset(amd->reg_file, 0, sizeof(amd->reg_file)); memset(amd->val_sbase, 0xFF, sizeof(amd->val_sbase)); + memset(amd->val_scroff, 0xFF, sizeof(amd->val_scroff)); /* -1 = dynamic */ /* Process each function */ for (uint32_t fi = 0; fi < bir->num_funcs; fi++) { isel_function(fi); } + /* Resource plan: scan BIR, print kernel summaries */ + amd_rplan(amd); + return BC_OK; } diff --git a/src/amdgpu/sched.c b/src/amdgpu/sched.c index cc85aaf..0d50a79 100644 --- a/src/amdgpu/sched.c +++ b/src/amdgpu/sched.c @@ -21,6 +21,7 @@ #define SCHED_LATENCY_SMEM 20 #define SCHED_LATENCY_DS 4 #define SCHED_LATENCY_ALU 1 +#define SCHED_MAX_SMEM_FLY 10 /* GFX942: >11 outstanding SMEM → fault */ /* Wait kind tags */ #define WAIT_VMEM 0 @@ -71,6 +72,13 @@ static uint16_t s_barrier_pos[SCHED_MAX_BARRIERS]; static uint16_t s_num_barriers; static minst_t s_output[AMD_MAX_MINSTS]; +/* Per-VGPR VMEM pending-write tracker. Turns out "is this + * a load" is the wrong question when 200 of them are in the + * post simultaneously. The right question is "has this + * particular register arrived yet." Wisdom is retrospective. */ +static uint8_t s_vmem_pending[AMD_MAX_VREGS]; +static uint8_t s_has_vmem_store; /* VMEM store in flight */ + /* ---- Instruction classification ---- */ static int is_wait_op(uint16_t op) @@ -96,10 +104,37 @@ static int is_pseudo_start(uint16_t op) op == AMD_PSEUDO_DEF; } -static int is_global_load(uint16_t op) +/* Atomic opcodes must stay contiguous in amd_op_t. If you + * rearrange the enum the range check quietly stops working + * and you get to explain to someone why their atomics return + * feelings instead of values. This catches it at compile time. */ +typedef char sched_assert_atomics_contiguous_[ + (AMD_GLOBAL_ATOMIC_CMPSWAP - AMD_GLOBAL_ATOMIC_ADD == 8) ? 
1 : -1 +]; + +static int is_vmem_write(uint16_t op) { - return op == AMD_GLOBAL_LOAD_DWORD || op == AMD_GLOBAL_LOAD_DWORDX2 || - op == AMD_SCRATCH_LOAD_DWORD || op == AMD_FLAT_LOAD_DWORD; + /* Anything that writes a VGPR through the VMEM pipe. + * Classified by pipeline behaviour, not by an opcode list + * that someone has to remember to update. Memory is short + * and schedules are long. */ + if (op == AMD_GLOBAL_LOAD_DWORD || op == AMD_GLOBAL_LOAD_DWORDX2) + return 1; + if (op == AMD_SCRATCH_LOAD_DWORD || op == AMD_FLAT_LOAD_DWORD) + return 1; + /* Atomic-with-return: contiguous range (guarded above) */ + if (op >= AMD_GLOBAL_ATOMIC_ADD && op <= AMD_GLOBAL_ATOMIC_CMPSWAP) + return 1; + return 0; +} + +static int is_vmem_store(uint16_t op) +{ + /* Stores don't write VGPRs but they do occupy the VMEM pipe. + * A load that arrives before the store commits reads whatever + * was there previously, which is never what you wanted. */ + return op == AMD_GLOBAL_STORE_DWORD || op == AMD_GLOBAL_STORE_DWORDX2 || + op == AMD_SCRATCH_STORE_DWORD || op == AMD_FLAT_STORE_DWORD; } static int is_scalar_load(uint16_t op) @@ -140,7 +175,7 @@ static int sop2_writes_scc(uint16_t op) static uint32_t inst_latency(uint16_t op) { - if (is_global_load(op)) return SCHED_LATENCY_VMEM; + if (is_vmem_write(op)) return SCHED_LATENCY_VMEM; if (is_scalar_load(op)) return SCHED_LATENCY_SMEM; if (is_ds_load(op)) return SCHED_LATENCY_DS; return SCHED_LATENCY_ALU; @@ -148,7 +183,7 @@ static uint32_t inst_latency(uint16_t op) static uint8_t load_wait_kind(uint16_t op) { - if (is_global_load(op)) return WAIT_VMEM; + if (is_vmem_write(op)) return WAIT_VMEM; if (is_scalar_load(op)) return WAIT_SMEM; return WAIT_DS; } @@ -247,7 +282,7 @@ static int build_dag(uint16_t n) nd->epoch = cur_epoch; nd->deps_overflow = 0; - if (is_global_load(op) || is_scalar_load(op) || is_ds_load(op)) { + if (is_vmem_write(op) || is_scalar_load(op) || is_ds_load(op)) { nd->is_load = 1; nd->wait_kind = load_wait_kind(op); } @@ -559,7 +594,10 @@ static uint32_t schedule_block(const minst_t *insts, uint32_t n, } memset(s_load_waited, 0, (size_t)sn); + memset(s_vmem_pending, 0, sizeof(s_vmem_pending)); + s_has_vmem_store = 0; uint16_t fn = 0; + uint16_t smem_fly = 0; for (uint16_t i = 0; i < out; i++) { const minst_t *mi = &s_scheduled[i]; @@ -568,17 +606,28 @@ static uint32_t schedule_block(const minst_t *insts, uint32_t n, uint8_t needs_wait[3] = {0, 0, 0}; uint8_t total_ops = mi->num_defs + mi->num_uses; - /* CDNA flat scratch: fence preceding stores before private load. - * Flat stores are async -- the private aperture can't snoop - * the write buffer, so we must drain vmcnt first. */ - if (mi->op == AMD_FLAT_LOAD_DWORD) + /* ---- Store-to-load ordering ---- + * Scratch stores are async. A scratch load issued after + * a store can arrive first, like a letter posted Tuesday + * arriving before Monday's. Royal Mail would be proud. + * Only wait when there's actually a store outstanding. */ + if ((mi->op == AMD_FLAT_LOAD_DWORD || + mi->op == AMD_SCRATCH_LOAD_DWORD) && s_has_vmem_store) needs_wait[WAIT_VMEM] = 1; + /* ---- Per-VGPR RAW hazard check ---- + * If a source register is still waiting on VMEM, reading + * it now gets you whatever was in there before. Quite + * rude of the hardware, but there it is. */ for (uint8_t u = mi->num_defs; u < total_ops; u++) { const moperand_t *mop = &mi->operands[u]; if (!is_trackable(mop)) continue; uint32_t k = op_key(mop); if (k < AMD_MAX_VREGS) { + /* Has this register's post arrived yet? 
*/ + if (s_vmem_pending[k]) + needs_wait[WAIT_VMEM] = 1; + /* Also check the per-load tracker for good measure */ uint16_t li = s_vreg_load[k]; if (li != 0xFFFF && !s_load_waited[li]) { needs_wait[s_nodes[li].wait_kind] = 1; @@ -589,6 +638,8 @@ static uint32_t schedule_block(const minst_t *insts, uint32_t n, if (mop->kind == MOP_SGPR) { uint32_t k2 = PHYS_SGPR_KEY(mop->reg_num + 1); if (k2 < AMD_MAX_VREGS) { + if (s_vmem_pending[k2]) + needs_wait[WAIT_VMEM] = 1; uint16_t li2 = s_vreg_load[k2]; if (li2 != 0xFFFF && !s_load_waited[li2]) { needs_wait[s_nodes[li2].wait_kind] = 1; @@ -599,10 +650,14 @@ static uint32_t schedule_block(const minst_t *insts, uint32_t n, } if (orig < sn && s_nodes[orig].is_barrier) { - for (uint16_t j = 0; j < sn; j++) { - if (s_nodes[j].is_load && !s_load_waited[j]) { - needs_wait[s_nodes[j].wait_kind] = 1; - s_load_waited[j] = 1; + /* Only wait on loads already emitted — same principle as + * the regular wait handler below. Marking future loads + * as waited caused missing vmcnt after exec restores. */ + for (uint16_t p = 0; p < i; p++) { + uint16_t oi = s_order[p]; + if (oi < sn && s_nodes[oi].is_load && !s_load_waited[oi]) { + needs_wait[s_nodes[oi].wait_kind] = 1; + s_load_waited[oi] = 1; } } } @@ -610,10 +665,14 @@ static uint32_t schedule_block(const minst_t *insts, uint32_t n, for (int k = 0; k < 3; k++) { if (needs_wait[k]) { fn = emit_wait(fn, k, target); - /* s_waitcnt/s_wait_*cnt(0) retires ALL outstanding loads - * of this kind that were issued before this point. Only - * mark loads already emitted -- future loads in the - * scheduled order haven't been issued yet. */ + if (k == WAIT_SMEM) smem_fly = 0; + /* vmcnt(0): everything in the VMEM pipe retires. + * Clean slate. Almost therapeutic. */ + if (k == WAIT_VMEM) { + memset(s_vmem_pending, 0, sizeof(s_vmem_pending)); + s_has_vmem_store = 0; + } + /* Mark loads already emitted as waited */ for (uint16_t p = 0; p < i; p++) { uint16_t oi = s_order[p]; if (oi < sn && s_nodes[oi].is_load && @@ -625,6 +684,37 @@ static uint32_t schedule_block(const minst_t *insts, uint32_t n, if (fn >= SCHED_MAX_BLOCK * 2) return 0; s_final[fn++] = *mi; + + /* Note which VGPRs this instruction owes us. + * We'll collect when someone tries to read them. */ + if (is_vmem_write(mi->op)) { + for (uint8_t d = 0; d < mi->num_defs; d++) { + if (is_trackable(&mi->operands[d])) { + uint32_t k = op_key(&mi->operands[d]); + if (k < AMD_MAX_VREGS) + s_vmem_pending[k] = 1; + } + } + } + if (is_vmem_store(mi->op)) + s_has_vmem_store = 1; + + /* GFX942 errata: >11 outstanding SMEM loads causes + * MEMORY_APERTURE_VIOLATION. Drain periodically. 
*/ + if (orig < sn && s_nodes[orig].is_load && + s_nodes[orig].wait_kind == WAIT_SMEM) { + smem_fly++; + if (smem_fly >= SCHED_MAX_SMEM_FLY) { + fn = emit_wait(fn, WAIT_SMEM, target); + smem_fly = 0; + for (uint16_t p = 0; p <= i; p++) { + uint16_t oi = s_order[p]; + if (oi < sn && s_nodes[oi].is_load && + s_nodes[oi].wait_kind == WAIT_SMEM) + s_load_waited[oi] = 1; + } + } + } } /* Trailing waits: loads consumed in successor blocks */ diff --git a/src/amdgpu/verify.c b/src/amdgpu/verify.c index f47d38f..be5ccdb 100644 --- a/src/amdgpu/verify.c +++ b/src/amdgpu/verify.c @@ -157,11 +157,14 @@ static void vfy_bnds(const minst_t *mi, uint32_t idx, const char *mn) for (i = 0; i < total; i++) { uint8_t k = mi->operands[i].kind; uint16_t r = mi->operands[i].reg_num; + /* Skip spill-encoded operands (bit 15 = pending resolution) */ + if (r & 0x8000u) continue; if (k == MOP_SGPR && r > 101) { vfy_err(idx, mn, "s%u out of bounds (max 101)", (unsigned)r); } - if (k == MOP_VGPR && r > 255) { - vfy_err(idx, mn, "v%u out of bounds (max 255)", (unsigned)r); + if (k == MOP_VGPR && r >= AMD_MAX_VGPRS) { + vfy_err(idx, mn, "v%u out of bounds (max %u)", (unsigned)r, + (unsigned)(AMD_MAX_VGPRS - 1)); } } } diff --git a/src/fe/parser.c b/src/fe/parser.c index 0e12595..9b9c378 100644 --- a/src/fe/parser.c +++ b/src/fe/parser.c @@ -165,6 +165,26 @@ static void add_child(parser_t *P, uint32_t parent, uint32_t child) } } +/* ---- Type name registry for cast disambiguation ---- */ + +static void reg_tname(parser_t *P, uint32_t off, uint16_t len) +{ + if (P->num_tnames >= 128) return; + P->tnames[P->num_tnames].off = off; + P->tnames[P->num_tnames].len = len; + P->num_tnames++; +} + +static int is_reg_type(const parser_t *P, uint32_t off, uint16_t len) +{ + for (int i = 0; i < P->num_tnames; i++) { + if (P->tnames[i].len == len && + memcmp(P->src + P->tnames[i].off, P->src + off, len) == 0) + return 1; + } + return 0; +} + static int is_type_keyword(int type) { switch (type) { @@ -367,13 +387,29 @@ static int looks_like_cast(parser_t *P) int t = peek_type(P, 1); if (is_type_keyword(t)) return 1; if (t == TOK_CU_DEVICE || t == TOK_CU_HOST) return 1; - /* C++ casts handled directly in parse_primary now */ - if (t == TOK_IDENT && peek_type(P, 2) == TOK_RPAREN && - can_start_unary(peek_type(P, 3))) - return 1; - /* (ident*) or (ident**) — pointer cast to struct/vector type */ - if (t == TOK_IDENT && peek_type(P, 2) == TOK_STAR) - return 1; + if (t == TOK_IDENT) { + const token_t *id = &P->tokens[P->pos + 1 < P->num_tokens + ? P->pos + 1 : P->num_tokens - 1]; + int known = is_reg_type(P, id->offset, (uint16_t)id->len); + if (peek_type(P, 2) == TOK_RPAREN) { + int after = peek_type(P, 3); + /* (ident) *expr — the classic C ambiguity. Could be + * cast+deref OR multiplication. Only treat as cast if + * we KNOW it's a type. Cost of getting this wrong: one + * GPU aperture fault and an afternoon of disassembly. */ + if (after == TOK_STAR || after == TOK_AMP) { + if (known) return 1; + } else if (can_start_unary(after)) { + /* (ident)0, (ident)x etc. — no mul/deref ambiguity, + * safe to assume cast (covers template params too) */ + return 1; + } + } + /* (ident*) or (ident**) — pointer cast to struct/vector type. + * Star is INSIDE the parens (part of the type), not ambiguous. 
*/ + if (peek_type(P, 2) == TOK_STAR) + return 1; + } return 0; } @@ -990,6 +1026,11 @@ static uint32_t parse_declaration(parser_t *P) P->nodes[def].qualifiers = quals; P->nodes[def].cuda_flags = cuda; add_child(P, def, type_node); + /* Register struct/enum name so (name)*x parses as mul, not cast */ + { uint32_t fc = P->nodes[type_node].first_child; + if (fc && P->nodes[fc].type == AST_IDENT) + reg_tname(P, P->nodes[fc].d.text.offset, + (uint16_t)P->nodes[fc].d.text.len); } advance(P); if (is_enum) { while (cur_type(P) != TOK_RBRACE && cur_type(P) != TOK_EOF) { @@ -1177,15 +1218,27 @@ static uint32_t parse_declaration(parser_t *P) P->nodes[decl_node].d.oper.flags = ptr_depth; add_child(P, decl_node, type_node); add_child(P, decl_node, name); + /* Register typedef name for cast disambiguation */ + if (quals & QUAL_TYPEDEF) + reg_tname(P, P->nodes[name].d.text.offset, + (uint16_t)P->nodes[name].d.text.len); + { int ndims = 0; while (cur_type(P) == TOK_LBRACKET) { advance(P); if (cur_type(P) != TOK_RBRACKET) { uint32_t sz = parse_expr(P, 0); add_child(P, decl_node, sz); + ndims++; } expect(P, TOK_RBRACKET); } + /* Store array dimension count so lowerer can distinguish + * `int x[4]` (ndims=1, child=4) from `int x = 4` (ndims=0, child=4). + * The classic "is the next child an array size or an initializer?" + * ambiguity that has haunted C compilers since 1972. */ + P->nodes[decl_node].d.oper.op = ndims; + } if (cur_type(P) == TOK_ASSIGN) { advance(P); diff --git a/src/fe/parser.h b/src/fe/parser.h index f365623..113a603 100644 --- a/src/fe/parser.h +++ b/src/fe/parser.h @@ -18,6 +18,12 @@ typedef struct { uint32_t lb_max_pending; uint32_t lb_min_pending; + /* Type name registry — struct/typedef names for cast disambiguation. + * Without this, (var) * expr parses as cast+deref instead of mul. + * The classic C ambiguity that has ruined more weekends than ISO 8601. 
*/ + struct { uint32_t off; uint16_t len; } tnames[128]; + int num_tnames; + bc_error_t errors[BC_MAX_ERRORS]; int num_errors; } parser_t; diff --git a/src/fe/sema.c b/src/fe/sema.c index c95f984..72f4530 100644 --- a/src/fe/sema.c +++ b/src/fe/sema.c @@ -1156,8 +1156,10 @@ static void check_var_decl(sema_ctx_t *S, uint32_t node) uint32_t var_type = resolve_typespec(S, type_n, n->d.oper.flags); + /* Parser stores bracket-dimension count in d.oper.op */ + int pdims = n->d.oper.op; uint32_t next = ND(S, name_n)->next_sibling; - if (next && ND(S, next)->type == AST_INT_LIT) { + if (pdims > 0 && next && ND(S, next)->type == AST_INT_LIT) { int64_t count = parse_int_value(S->src + ND(S, next)->d.text.offset, (int)ND(S, next)->d.text.len); uint32_t arr_t = st_array(S, var_type, (uint16_t)count); @@ -1167,9 +1169,7 @@ static void check_var_decl(sema_ctx_t *S, uint32_t node) if (init_n) check_expr(S, init_n); return; } - if (next && ND(S, next)->type == AST_IDENT) { - /* Disambiguate arr[N] vs initializer: only treat as array size - when ident is an enum constant and base type isn't enum/struct */ + if (pdims > 0 && next && ND(S, next)->type == AST_IDENT) { char aname[128]; get_text(S, next, aname, sizeof(aname)); const sema_sym_t *as = find_sym(S, aname); diff --git a/src/ir/bir.h b/src/ir/bir.h index 316c923..1e255fd 100644 --- a/src/ir/bir.h +++ b/src/ir/bir.h @@ -136,7 +136,7 @@ typedef enum { /* Control flow */ BIR_BR, /* ops[0] = block */ - BIR_BR_COND, /* ops[0] = cond, ops[1] = true block, ops[2] = false block */ + BIR_BR_COND, /* ops[0] = cond, [1] = true, [2] = false, [3] = merge */ BIR_SWITCH, /* ops[0] = val, ops[1] = default block, rest in extra */ BIR_RET, /* ops[0] = value (or num_operands=0 for void) */ BIR_UNREACHABLE, diff --git a/src/ir/bir_cfold.c b/src/ir/bir_cfold.c index bd6d39d..0f1c866 100644 --- a/src/ir/bir_cfold.c +++ b/src/ir/bir_cfold.c @@ -17,7 +17,7 @@ static int is_inline_block_ref(uint16_t op, uint8_t j) { switch (op) { case BIR_BR: return j == 0; - case BIR_BR_COND: return j == 1 || j == 2; + case BIR_BR_COND: return j >= 1 && j <= 3; case BIR_SWITCH: return j == 1; case BIR_PHI: return j % 2 == 0; default: return 0; diff --git a/src/ir/bir_dce.c b/src/ir/bir_dce.c index e120cac..7209fd3 100644 --- a/src/ir/bir_dce.c +++ b/src/ir/bir_dce.c @@ -34,7 +34,7 @@ static int is_inline_block_ref(uint16_t op, uint8_t j) { switch (op) { case BIR_BR: return j == 0; - case BIR_BR_COND: return j == 1 || j == 2; + case BIR_BR_COND: return j >= 1 && j <= 3; case BIR_SWITCH: return j == 1; case BIR_PHI: return j % 2 == 0; default: return 0; diff --git a/src/ir/bir_lower.c b/src/ir/bir_lower.c index d639cf4..9372c3c 100644 --- a/src/ir/bir_lower.c +++ b/src/ir/bir_lower.c @@ -771,10 +771,17 @@ static uint32_t lower_expr(lower_t *L, uint32_t node) return BIR_VAL_NONE; } if (s->is_alloca) { - /* Arrays decay to pointer — return alloca addr directly. - * Loading from an array alloca reads uninit scratch. */ + /* Arrays and structs decay to pointer — return alloca addr + * directly. Loading from an array alloca reads uninit + * scratch. Loading a whole struct produces a bulk load + * that isel can't decompose into per-field dwords — the + * assignment path needs the pointer to fire its per-field + * copy decomposition. Without this, fbank[slot] = site + * stores only field 0 and the rest read back as whatever + * the silicon had for breakfast. 
*/ if (s->type < L->M->num_types && - L->M->types[s->type].kind == BIR_TYPE_ARRAY) + (L->M->types[s->type].kind == BIR_TYPE_ARRAY || + L->M->types[s->type].kind == BIR_TYPE_STRUCT)) return BIR_MAKE_VAL(s->ref); uint32_t inst = emit(L, BIR_LOAD, s->type, 1, 0); set_op(L, inst, 0, BIR_MAKE_VAL(s->ref)); @@ -985,10 +992,11 @@ static uint32_t lower_expr(lower_t *L, uint32_t node) uint32_t lhs = lower_expr(L, lhs_n); uint32_t rhs_b = new_block(L, "land.rhs"); uint32_t end_b = new_block(L, "land.end"); - uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 3, 0); + uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 4, 0); set_op(L, br, 0, lhs); set_op(L, br, 1, rhs_b); set_op(L, br, 2, end_b); + set_op(L, br, 3, end_b); set_block(L, rhs_b); uint32_t rhs = lower_expr(L, rhs_n); uint32_t s1 = emit(L, BIR_STORE, bir_type_void(L->M), 2, 0); @@ -1015,10 +1023,11 @@ static uint32_t lower_expr(lower_t *L, uint32_t node) uint32_t lhs = lower_expr(L, lhs_n); uint32_t rhs_b = new_block(L, "lor.rhs"); uint32_t end_b = new_block(L, "lor.end"); - uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 3, 0); + uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 4, 0); set_op(L, br, 0, lhs); set_op(L, br, 1, end_b); /* true → skip RHS */ set_op(L, br, 2, rhs_b); /* false → eval RHS */ + set_op(L, br, 3, end_b); set_block(L, rhs_b); uint32_t rhs = lower_expr(L, rhs_n); uint32_t s1 = emit(L, BIR_STORE, bir_type_void(L->M), 2, 0); @@ -2013,13 +2022,17 @@ static uint32_t lower_lvalue(lower_t *L, uint32_t node) uint32_t pt = ref_type(L, obj_ptr); uint32_t st = ptr_inner(L, pt); + /* Preserve source address space: local alloca → private, + * global array element → global. Don't assume private — + * cells[i].mat needs a global load, not a scratch read. */ + uint8_t src_as = L->M->types[pt].addrspace; for (int si = 0; si < L->nstructs; si++) { if (L->structs[si].bir_type != st) continue; for (int fi = 0; fi < L->structs[si].num_fields; fi++) { if (strcmp(L->structs[si].field_names[fi], fname) != 0) continue; uint32_t ft = L->structs[si].field_types[fi]; - uint32_t fpt = bir_type_ptr(L->M, ft, BIR_AS_PRIVATE); + uint32_t fpt = bir_type_ptr(L->M, ft, src_as); uint32_t idx = BIR_MAKE_CONST(bir_const_int(L->M, bir_type_int(L->M, 32), fi)); uint32_t gep = emit(L, BIR_GEP, fpt, 2, 0); @@ -2056,13 +2069,16 @@ static void lower_var_decl(lower_t *L, uint32_t node) uint32_t elem_t = resolve_type(L, type_n, n->d.oper.flags, n->cuda_flags); - /* Collect array dimensions: float a[16][16] → dims={16,16}, ndim=2 */ + /* Collect array dimensions: float a[16][16] → dims={16,16}, ndim=2. + * Parser stores the bracket-dimension count in d.oper.op so we don't + * accidentally eat `int step = 4096` as `int step[4096]`. 
*/ uint32_t next = ND(L, name_n)->next_sibling; int is_array = 0; uint32_t dims[8]; int ndim = 0; + int max_dims = n->d.oper.op; /* how many [N] the parser saw */ - while (next && ndim < 8) { + while (next && ndim < 8 && ndim < max_dims) { if (ND(L, next)->type == AST_INT_LIT) { dims[ndim++] = (uint32_t)parse_int_text( L->src + ND(L, next)->d.text.offset, @@ -2107,8 +2123,8 @@ static void lower_var_decl(lower_t *L, uint32_t node) uint32_t alloca = emit(L, BIR_ALLOCA, ptr_t, 0, 0); add_sym(L, name, alloca, arr_t, 1); /* Array initializer list: int arr[3] = {1, 2, 3}; */ - uint32_t init_n = ND(L, next)->next_sibling; - if (init_n && ND(L, init_n)->type == AST_INIT_LIST) { + if (next && ND(L, next)->type == AST_INIT_LIST) { + uint32_t init_n = next; uint32_t ept = bir_type_ptr(L->M, elem_t, BIR_AS_PRIVATE); uint32_t el = ND(L, init_n)->first_child; uint32_t idx = 0; @@ -2161,9 +2177,55 @@ static void lower_var_decl(lower_t *L, uint32_t node) } } else if (init_n && ND(L, init_n)->type != AST_NONE) { uint32_t init_v = lower_expr(L, init_n); - uint32_t st = emit(L, BIR_STORE, bir_type_void(L->M), 2, 0); - set_op(L, st, 0, init_v); - set_op(L, st, 1, BIR_MAKE_VAL(alloca)); + + /* Struct copy from pointer: Part p = parts[tid]; + * lower_expr returned a GEP (ptr to struct in global/shared), + * not the struct value. Field-by-field copy required, or we'd + * dump 8 bytes of raw pointer into the alloca and the remaining + * fields would contain whatever the silicon was dreaming about. */ + uint32_t vt = ref_type(L, init_v); + int did_copy = 0; + if (is_ptr_type(L, vt) && + elem_t < L->M->num_types && + L->M->types[elem_t].kind == BIR_TYPE_STRUCT) { + struct_def_t *sd = NULL; + for (int si = 0; si < L->nstructs; si++) { + if (L->structs[si].bir_type == elem_t) { + sd = &L->structs[si]; break; + } + } + if (sd) { + uint8_t src_as = L->M->types[vt].addrspace; + uint32_t t32 = bir_type_int(L->M, 32); + for (int fi = 0; fi < sd->num_fields && fi < 16; fi++) { + uint32_t ci = BIR_MAKE_CONST( + bir_const_int(L->M, t32, fi)); + uint32_t spt = bir_type_ptr(L->M, + sd->field_types[fi], src_as); + uint32_t sg = emit(L, BIR_GEP, spt, 2, 0); + set_op(L, sg, 0, init_v); + set_op(L, sg, 1, ci); + uint32_t ld = emit(L, BIR_LOAD, + sd->field_types[fi], 1, 0); + set_op(L, ld, 0, BIR_MAKE_VAL(sg)); + uint32_t dpt = bir_type_ptr(L->M, + sd->field_types[fi], BIR_AS_PRIVATE); + uint32_t dg = emit(L, BIR_GEP, dpt, 2, 0); + set_op(L, dg, 0, BIR_MAKE_VAL(alloca)); + set_op(L, dg, 1, ci); + uint32_t s = emit(L, BIR_STORE, + bir_type_void(L->M), 2, 0); + set_op(L, s, 0, BIR_MAKE_VAL(ld)); + set_op(L, s, 1, BIR_MAKE_VAL(dg)); + } + did_copy = 1; + } + } + if (!did_copy) { + uint32_t st = emit(L, BIR_STORE, bir_type_void(L->M), 2, 0); + set_op(L, st, 0, init_v); + set_op(L, st, 1, BIR_MAKE_VAL(alloca)); + } } } @@ -2211,10 +2273,11 @@ static void lower_stmt(lower_t *L, uint32_t node) uint32_t else_b = else_n ? new_block(L, "if.else") : 0; uint32_t end_b = new_block(L, "if.end"); - uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 3, 0); + uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 4, 0); set_op(L, br, 0, cond); set_op(L, br, 1, then_b); set_op(L, br, 2, else_n ? 
else_b : end_b); + set_op(L, br, 3, end_b); /* Then */ set_block(L, then_b); @@ -2275,10 +2338,11 @@ static void lower_stmt(lower_t *L, uint32_t node) set_block(L, cond_b); if (cond_n && ND(L, cond_n)->type != AST_NONE) { uint32_t cv = lower_expr(L, cond_n); - uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 3, 0); + uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 4, 0); set_op(L, br, 0, cv); set_op(L, br, 1, body_b); set_op(L, br, 2, end_b); + set_op(L, br, 3, end_b); } else { /* Infinite loop: for(;;) */ uint32_t br = emit(L, BIR_BR, bir_type_void(L->M), 1, 0); @@ -2330,10 +2394,11 @@ static void lower_stmt(lower_t *L, uint32_t node) set_block(L, cond_b); { uint32_t cv = lower_expr(L, cond_n); - uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 3, 0); + uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 4, 0); set_op(L, br, 0, cv); set_op(L, br, 1, body_b); set_op(L, br, 2, end_b); + set_op(L, br, 3, end_b); } set_block(L, body_b); @@ -2377,10 +2442,11 @@ static void lower_stmt(lower_t *L, uint32_t node) set_block(L, cond_b); { uint32_t cv = lower_expr(L, cond_n); - uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 3, 0); + uint32_t br = emit(L, BIR_BR_COND, bir_type_void(L->M), 4, 0); set_op(L, br, 0, cv); set_op(L, br, 1, body_b); set_op(L, br, 2, end_b); + set_op(L, br, 3, end_b); } if (L->loop_depth > 0) L->loop_depth--; @@ -2748,14 +2814,18 @@ static void collect_struct(lower_t *L, uint32_t node) extra = ND(L, extra)->next_sibling; } - /* Check for chained var_decl siblings (different types) */ + /* Check for chained var_decl siblings (different types). + * Each has its own type_spec — resolve it, or we'd + * paint every field the same colour as the first. */ uint32_t chain = ND(L, member)->next_sibling; while (chain && ND(L, chain)->type == AST_VAR_DECL) { - uint32_t cft = ND(L, chain)->first_child; - uint32_t cfn = cft ? ND(L, cft)->next_sibling : 0; + uint32_t cft_n = ND(L, chain)->first_child; + uint32_t cfn = cft_n ? ND(L, cft_n)->next_sibling : 0; if (cfn && ND(L, cfn)->type == AST_IDENT && sd->num_fields < 16) { - sd->field_types[sd->num_fields] = ft; + uint32_t cft = resolve_type(L, cft_n, + ND(L, chain)->d.oper.flags, 0); + sd->field_types[sd->num_fields] = cft; get_text(L, cfn, sd->field_names[sd->num_fields], sizeof(sd->field_names[0])); sd->num_fields++; diff --git a/src/ir/bir_mem2reg.c b/src/ir/bir_mem2reg.c index 93bbf57..2d73cd0 100644 --- a/src/ir/bir_mem2reg.c +++ b/src/ir/bir_mem2reg.c @@ -836,7 +836,7 @@ static void step7_compact(m2r_t *S) case BIR_BR: is_block_ref = (j == 0); break; case BIR_BR_COND: - is_block_ref = (j == 1 || j == 2); break; + is_block_ref = (j >= 1 && j <= 3); break; case BIR_SWITCH: is_block_ref = (j == 1); break; case BIR_PHI: diff --git a/src/ir/bir_print.c b/src/ir/bir_print.c index 8ca070a..c3893d4 100644 --- a/src/ir/bir_print.c +++ b/src/ir/bir_print.c @@ -279,6 +279,10 @@ static void print_inst(const bir_module_t *M, const bir_inst_t *I, print_block_label(M, I->operands[1], base_block, out); fprintf(out, ", "); print_block_label(M, I->operands[2], base_block, out); + if (I->num_operands > 3) { + fprintf(out, " merge "); + print_block_label(M, I->operands[3], base_block, out); + } break; /* Switch: switch %val, default BLOCK, [const: BLOCK], ... 
*/ diff --git a/src/main.c b/src/main.c index 1ae5ac8..a110859 100644 --- a/src/main.c +++ b/src/main.c @@ -106,6 +106,7 @@ int main(int argc, char *argv[]) int no_dce = 0; int no_sched = 0; int no_pp = 0; + int snap_mode = 0; amd_target_t amd_target = AMD_TARGET_GFX1100; uint32_t amd_elfm = 0x41; /* EF_AMDGPU_MACH for exact chip */ const char *amd_chip = "gfx1100"; /* chip string for ELF metadata */ @@ -207,6 +208,8 @@ int main(int argc, char *argv[]) amd_ra_lin = 1; else if (strcmp(argv[i], "--max-vgprs") == 0 && i + 1 < argc) amd_max_vgpr = atoi(argv[++i]); + else if (strcmp(argv[i], "--snap") == 0) + snap_mode = 1; else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { usage(argv[0]); return 0; @@ -396,6 +399,7 @@ int main(int argc, char *argv[]) } amd->target = amd_target; amd->elf_mach = amd_elfm; + amd->snap_mode = (uint8_t)snap_mode; snprintf(amd->chip_name, sizeof(amd->chip_name), "%s", amd_chip); int arc = amdgpu_compile(bir_module, amd); diff --git a/src/runtime/bc_runtime.c b/src/runtime/bc_runtime.c index e0e42f2..89b71ae 100644 --- a/src/runtime/bc_runtime.c +++ b/src/runtime/bc_runtime.c @@ -4,6 +4,7 @@ #ifdef __linux__ #include "bc_runtime.h" +#include "bc_abend.h" #include #include #include @@ -23,8 +24,16 @@ typedef struct { uint64_t handle; } hsa_executable_t; typedef struct { uint64_t handle; } hsa_executable_symbol_t; typedef struct { uint64_t handle; } hsa_code_object_reader_t; typedef struct { uint64_t handle; } hsa_loaded_code_object_t; +typedef struct { uint64_t handle; } hsa_amd_memory_pool_t; typedef int64_t hsa_signal_value_t; +/* AMD memory pool constants */ +#define HSA_AMD_SEGMENT_GLOBAL 0 +#define HSA_AMD_MEMORY_POOL_INFO_SEGMENT 0 +#define HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS 1 +#define HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG 1 +#define HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE 2 + /* HSA queue — layout must match ABI exactly */ typedef struct { uint32_t type; @@ -138,6 +147,15 @@ typedef uint64_t (*pfn_queue_load_write_idx_t)(const hsa_queue_t*); typedef void (*pfn_queue_store_write_idx_t)( const hsa_queue_t*, uint64_t); +/* AMD memory pool extensions */ +typedef hsa_status_t (*pfn_amd_pool_alloc_t)( + hsa_amd_memory_pool_t, size_t, uint32_t, void**); +typedef hsa_status_t (*pfn_amd_pool_free_t)(void*); +typedef hsa_status_t (*pfn_amd_iterate_pools_t)( + hsa_agent_t, hsa_status_t(*)(hsa_amd_memory_pool_t, void*), void*); +typedef hsa_status_t (*pfn_amd_pool_get_info_t)( + hsa_amd_memory_pool_t, uint32_t, void*); + /* ---- Internal Device Structure ---- */ typedef struct { @@ -176,6 +194,9 @@ typedef struct { hsa_region_t kernarg_region; hsa_region_t device_region; int initialized; + + /* ABEND fault diagnostics — heap-allocated (~37KB) */ + ab_ctx_t *abend; } bc_device_impl_t; _Static_assert(sizeof(bc_device_impl_t) <= BC_DEVICE_OPAQUE_SIZE, @@ -241,6 +262,35 @@ static hsa_status_t find_device_cb(hsa_region_t region, void *data) return HSA_STATUS_SUCCESS; } +/* ---- Queue Error Callback ---- */ + +/* Called by HSA when the queue faults — aperture violations, invalid + * packets, the usual GPU misdemeanours. Maps HSA status to an ABEND + * code and fires the dump. This is where the magic happens. */ +static void bc_qerr(hsa_status_t status, hsa_queue_t *source, void *data) +{ + (void)source; + ab_ctx_t *A = (ab_ctx_t *)data; + if (!A) return; + + /* Map HSA queue error to ABEND code. 
+ * HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41 (0x29) + * HSA_STATUS_ERROR_MEMORY_FAULT = 43 (0x2B) */ + uint16_t code = AB_G0FF; + if (status == 41) /* MEMORY_APERTURE_VIOLATION */ + code = AB_G0C5; + else if (status == 43) /* MEMORY_FAULT */ + code = AB_G0C4; + else if (status == 42) /* ILLEGAL_INSTRUCTION */ + code = AB_G0C1; + + A->code = code; + A->reason = (uint32_t)status; + A->faulted = 1; + + ab_dump(A, stderr); +} + /* ---- Init / Shutdown ---- */ int bc_device_init(bc_device_t *dev) @@ -317,13 +367,21 @@ int bc_device_init(bc_device_t *dev) return BC_RT_ERR_NO_MEM; } + /* ABEND diagnostics — init before queue so we can hook the + * queue error callback. Aperture violations are queue errors, + * not VM faults, so the system event handler alone won't catch them. */ + D->abend = (ab_ctx_t *)calloc(1, sizeof(ab_ctx_t)); + if (D->abend) + ab_init(D->abend, D->hsa_lib); + uint32_t queue_size = 0; D->agent_get_info(D->gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); if (queue_size == 0) queue_size = 128; st = D->queue_create(D->gpu_agent, queue_size, HSA_QUEUE_TYPE_SINGLE, - NULL, NULL, UINT32_MAX, UINT32_MAX, &D->queue); + bc_qerr, D->abend, UINT32_MAX, UINT32_MAX, + &D->queue); if (st != HSA_STATUS_SUCCESS) { fprintf(stderr, "bc_runtime: queue_create failed (0x%x)\n", st); D->hsa_shut_down(); @@ -332,6 +390,7 @@ int bc_device_init(bc_device_t *dev) } D->initialized = 1; + fprintf(stderr, "bc_runtime: GPU ready, queue size %u\n", queue_size); return BC_RT_OK; } @@ -341,6 +400,7 @@ void bc_device_shutdown(bc_device_t *dev) bc_device_impl_t *D = D_of(dev); if (!D->initialized) return; + if (D->abend) { ab_shut(D->abend); free(D->abend); } if (D->queue) D->queue_destroy(D->queue); D->hsa_shut_down(); if (D->hsa_lib) dlclose(D->hsa_lib); @@ -434,6 +494,15 @@ int bc_load_kernel(bc_device_t *dev, const char *hsaco_path, out->_exec = exec.handle; out->_reader = reader.handle; + /* Load source map and stash kernel name for ABEND dumps */ + if (D->abend) { + ab_slod(D->abend, hsaco_buf, (uint32_t)n); + /* Pre-fill dispatch context with kernel info so the dump + * has something useful even if ab_snag hasn't been called */ + ab_snag(D->abend, out, kernel_name, "gfx942", + 0, 0, 0, 0, 0, 0, NULL, 0); + } + fprintf(stderr, "bc_runtime: loaded '%s' (kernarg=%u, lds=%u, " "scratch=%u)\n", kernel_name, out->kernarg_size, out->group_size, out->private_size); @@ -476,6 +545,15 @@ void bc_free(bc_device_t *dev, void *ptr) D->mem_free(ptr); } +void bc_trak(bc_device_t *dev, void *ptr, size_t size, + const char *label, int flags) +{ + bc_device_impl_t *D = D_of(dev); + if (D->abend) + ab_trak(D->abend, (uint64_t)(uintptr_t)ptr, + (uint64_t)size, label, (uint8_t)flags); +} + int bc_copy_h2d(bc_device_t *dev, void *dst, const void *src, size_t size) { bc_device_impl_t *D = D_of(dev); @@ -516,16 +594,33 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern, if (block_z == 0) block_z = 1; void *kernarg_buf = NULL; - st = D->mem_alloc(D->kernarg_region, kern->kernarg_size, &kernarg_buf); + uint32_t alloc_sz = kern->kernarg_size; + if (args_size > alloc_sz) alloc_sz = args_size; + st = D->mem_alloc(D->kernarg_region, alloc_sz, &kernarg_buf); if (st != HSA_STATUS_SUCCESS || !kernarg_buf) { fprintf(stderr, "bc_runtime: kernarg alloc failed (0x%x)\n", st); return BC_RT_ERR_HSA; } - uint32_t copy_size = args_size; - if (copy_size > kern->kernarg_size) copy_size = kern->kernarg_size; - memset(kernarg_buf, 0, kern->kernarg_size); - memcpy(kernarg_buf, args, copy_size); + memset(kernarg_buf, 0, 
alloc_sz); + D->mem_copy(kernarg_buf, args, args_size); + + /* Update dispatch dims + kernarg snapshot for ABEND dump */ + if (D->abend) { + D->abend->dctx.grid[0] = grid_x; + D->abend->dctx.grid[1] = grid_y; + D->abend->dctx.grid[2] = grid_z; + D->abend->dctx.block[0] = block_x; + D->abend->dctx.block[1] = block_y; + D->abend->dctx.block[2] = block_z; + if (args && args_size > 0) { + uint32_t snap = args_size; + if (snap > sizeof(D->abend->args_snap)) + snap = (uint32_t)sizeof(D->abend->args_snap); + memcpy(D->abend->args_snap, args, snap); + D->abend->dctx.args_sz = snap; + } + } hsa_signal_t signal; st = D->signal_create(1, 0, NULL, &signal); @@ -567,6 +662,15 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern, (HSA_FENCE_SCOPE_SYSTEM << HSA_PKT_HEADER_SCRELEASE) ); + /* Dump raw packet for debugging */ + { + uint64_t *raw = (uint64_t *)pkt; + fprintf(stderr, "AQL pkt @%p (64 bytes):\n", (void*)pkt); + fprintf(stderr, " [00] %016lx [08] %016lx\n", raw[0], raw[1]); + fprintf(stderr, " [16] %016lx [24] %016lx\n", raw[2], raw[3]); + fprintf(stderr, " [32] %016lx [40] %016lx\n", raw[4], raw[5]); + fprintf(stderr, " [48] %016lx [56] %016lx\n", raw[6], raw[7]); + } __atomic_store_n(&pkt->header, header, __ATOMIC_RELEASE); D->queue_store_write_idx(D->queue, idx + 1); @@ -621,6 +725,10 @@ void *bc_alloc(bc_device_t *dev, size_t size) void bc_free(bc_device_t *dev, void *ptr) { (void)dev; (void)ptr; } +void bc_trak(bc_device_t *dev, void *ptr, size_t size, + const char *label, int flags) +{ (void)dev; (void)ptr; (void)size; (void)label; (void)flags; } + int bc_copy_h2d(bc_device_t *dev, void *dst, const void *src, size_t size) { (void)dev; (void)dst; (void)src; (void)size; return BC_RT_ERR_DLOPEN; } diff --git a/src/runtime/bc_runtime.h b/src/runtime/bc_runtime.h index cd8af28..922980e 100644 --- a/src/runtime/bc_runtime.h +++ b/src/runtime/bc_runtime.h @@ -51,6 +51,12 @@ void bc_unload_kernel(bc_device_t *dev, bc_kernel_t *kern); void *bc_alloc(bc_device_t *dev, size_t size); void bc_free(bc_device_t *dev, void *ptr); +/* Track a GPU allocation for ABEND fault correlation. + * label = short name (e.g. "parts", "xs_data"), flags = AB_FL_* from bc_abend.h. + * Optional — but without it the dump can't tell you what you hit. */ +void bc_trak(bc_device_t *dev, void *ptr, size_t size, + const char *label, int flags); + int bc_copy_h2d(bc_device_t *dev, void *dst, const void *src, size_t size); int bc_copy_d2h(bc_device_t *dev, void *dst, const void *src, size_t size); diff --git a/tests/test_scopy.cu b/tests/test_scopy.cu index 16308a2..c7c9d70 100644 --- a/tests/test_scopy.cu +++ b/tests/test_scopy.cu @@ -1,26 +1,16 @@ -/* Struct copy from global — the operation that stumped us. - * Buffer starts zeroed (memset by harness). We bitcast to - * struct Quad*, copy struct to local, then add known offsets - * to each field to prove the fields read correctly. - * Expected: 10.0, 20.0, 30.0, 40.0 - * Run: test_gpu_run test_scopy.hsaco test_scopy 1 */ -struct Quad { float a; float b; float c; float d; }; +/* test_scopy — 6 pointer params + scratch, close to Moa pattern. 
+ * out[tid] = a[tid] + b[tid] + c[tid] + d[tid] + * Expected: each array has value (tid + offset), sum = 4*tid + 10 */ +__global__ void test_scopy(float *out, float *a, float *b, + float *c, float *d, int n) { + int tid = threadIdx.x; + if (tid < n) { + float tmp[4]; + tmp[0] = a[tid]; + tmp[1] = b[tid]; + tmp[2] = c[tid]; + tmp[3] = d[tid]; -__global__ void test_scopy(float *out, int n) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= n) return; - - /* Bitcast + struct copy from zeroed global memory. - * All fields should be 0.0, so we add known constants - * to prove each field is independently addressable - * and the struct didn't arrive as a pointer-shaped - * hallucination. */ - struct Quad *arr = (struct Quad *)out; - struct Quad q; - q = arr[0]; - - out[0] = q.a + 10.0f; - out[1] = q.b + 20.0f; - out[2] = q.c + 30.0f; - out[3] = q.d + 40.0f; + out[tid] = tmp[0] + tmp[1] + tmp[2] + tmp[3]; + } } From d524d88c544e4d7971a9b644bf5e06065eed0f38 Mon Sep 17 00:00:00 2001 From: ZaneHam Date: Wed, 11 Mar 2026 21:20:20 +1300 Subject: [PATCH 2/3] Use AMD memory pool API for kernarg allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard hsa_memory_allocate from HSA kernarg region gives memory the scalar unit cannot read on MI300X — all params seen as zero. HIP uses hsa_amd_memory_pool_allocate internally. Do the same. Adds pool discovery callback, 4 AMD dlsym symbols, switches bc_dispatch to amd_palloc/amd_pfree. Debug AQL dump removed. --- src/runtime/bc_runtime.c | 62 +++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/src/runtime/bc_runtime.c b/src/runtime/bc_runtime.c index 89b71ae..22cbdd0 100644 --- a/src/runtime/bc_runtime.c +++ b/src/runtime/bc_runtime.c @@ -188,11 +188,19 @@ typedef struct { pfn_queue_load_write_idx_t queue_load_write_idx; pfn_queue_store_write_idx_t queue_store_write_idx; + /* AMD memory pool extensions — because hsa_memory_allocate + * gives memory the scalar unit can't read on MI300X. Cheers AMD. */ + pfn_amd_pool_alloc_t amd_palloc; + pfn_amd_pool_free_t amd_pfree; + pfn_amd_iterate_pools_t amd_ipools; + pfn_amd_pool_get_info_t amd_pinfo; + /* Device state */ hsa_agent_t gpu_agent; hsa_queue_t *queue; hsa_region_t kernarg_region; hsa_region_t device_region; + hsa_amd_memory_pool_t kernarg_pool; /* AMD pool: the one that works */ int initialized; /* ABEND fault diagnostics — heap-allocated (~37KB) */ @@ -262,6 +270,25 @@ static hsa_status_t find_device_cb(hsa_region_t region, void *data) return HSA_STATUS_SUCCESS; } +/* ---- AMD Pool Discovery ---- */ + +/* Find the kernarg pool the way HIP does it, not the way the HSA + * spec says to. The spec is a lovely work of fiction. 
*/ +static hsa_status_t find_kpool_cb(hsa_amd_memory_pool_t pool, void *data) +{ + bc_device_impl_t *D = (bc_device_impl_t *)data; + uint32_t segment = 0; + D->amd_pinfo(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + if (segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS; + uint32_t flags = 0; + D->amd_pinfo(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG) { + D->kernarg_pool = pool; + return HSA_STATUS_INFO_BREAK; + } + return HSA_STATUS_SUCCESS; +} + /* ---- Queue Error Callback ---- */ /* Called by HSA when the queue faults — aperture violations, invalid @@ -332,6 +359,10 @@ int bc_device_init(bc_device_t *dev) LOAD(exec_destroy, "hsa_executable_destroy"); LOAD(queue_load_write_idx, "hsa_queue_load_write_index_relaxed"); LOAD(queue_store_write_idx, "hsa_queue_store_write_index_relaxed"); + LOAD(amd_palloc, "hsa_amd_memory_pool_allocate"); + LOAD(amd_pfree, "hsa_amd_memory_pool_free"); + LOAD(amd_ipools, "hsa_amd_agent_iterate_memory_pools"); + LOAD(amd_pinfo, "hsa_amd_memory_pool_get_info"); hsa_status_t st = D->hsa_init(); if (st != HSA_STATUS_SUCCESS) { @@ -367,6 +398,18 @@ int bc_device_init(bc_device_t *dev) return BC_RT_ERR_NO_MEM; } + /* AMD pool discovery — find kernarg pool the way HIP does it */ + D->kernarg_pool.handle = 0; + D->amd_ipools(D->gpu_agent, find_kpool_cb, D); + if (D->kernarg_pool.handle == 0) { + fprintf(stderr, "bc_runtime: AMD kernarg pool not found\n"); + D->hsa_shut_down(); + dlclose(D->hsa_lib); + return BC_RT_ERR_NO_MEM; + } + fprintf(stderr, "bc_runtime: AMD kernarg pool found (handle=0x%lx)\n", + (unsigned long)D->kernarg_pool.handle); + /* ABEND diagnostics — init before queue so we can hook the * queue error callback. Aperture violations are queue errors, * not VM faults, so the system event handler alone won't catch them. 
*/
@@ -596,14 +639,14 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern,
 void *kernarg_buf = NULL;
 uint32_t alloc_sz = kern->kernarg_size;
 if (args_size > alloc_sz) alloc_sz = args_size;
- st = D->mem_alloc(D->kernarg_region, alloc_sz, &kernarg_buf);
+ st = D->amd_palloc(D->kernarg_pool, alloc_sz, 0, &kernarg_buf);
 if (st != HSA_STATUS_SUCCESS || !kernarg_buf) {
- fprintf(stderr, "bc_runtime: kernarg alloc failed (0x%x)\n", st);
+ fprintf(stderr, "bc_runtime: kernarg pool alloc failed (0x%x)\n", st);
 return BC_RT_ERR_HSA;
 }
 
 memset(kernarg_buf, 0, alloc_sz);
- D->mem_copy(kernarg_buf, args, args_size);
+ memcpy(kernarg_buf, args, args_size);
 
 /* Update dispatch dims + kernarg snapshot for ABEND dump */
 if (D->abend) {
@@ -626,7 +669,7 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern,
 st = D->signal_create(1, 0, NULL, &signal);
 if (st != HSA_STATUS_SUCCESS) {
 fprintf(stderr, "bc_runtime: signal_create failed (0x%x)\n", st);
- D->mem_free(kernarg_buf);
+ D->amd_pfree(kernarg_buf);
 return BC_RT_ERR_HSA;
 }
 
@@ -662,15 +705,6 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern,
 (HSA_FENCE_SCOPE_SYSTEM << HSA_PKT_HEADER_SCRELEASE)
 );
 
- /* Dump raw packet for debugging */
- {
- uint64_t *raw = (uint64_t *)pkt;
- fprintf(stderr, "AQL pkt @%p (64 bytes):\n", (void*)pkt);
- fprintf(stderr, " [00] %016lx [08] %016lx\n", raw[0], raw[1]);
- fprintf(stderr, " [16] %016lx [24] %016lx\n", raw[2], raw[3]);
- fprintf(stderr, " [32] %016lx [40] %016lx\n", raw[4], raw[5]);
- fprintf(stderr, " [48] %016lx [56] %016lx\n", raw[6], raw[7]);
- }
 __atomic_store_n(&pkt->header, header, __ATOMIC_RELEASE);
 D->queue_store_write_idx(D->queue, idx + 1);
 
@@ -687,7 +721,7 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern,
 * set up an ab_ctx_t via ab_init/ab_snag can check A->faulted. */
 
 D->signal_destroy(signal);
- D->mem_free(kernarg_buf);
+ D->amd_pfree(kernarg_buf);
 
 return BC_RT_OK;
 }

From 86a92a362c653bc050f961be6fd616df13870aed Mon Sep 17 00:00:00 2001
From: ZaneHam 
Date: Sat, 14 Mar 2026 02:56:41 +1300
Subject: [PATCH 3/3] Divergence-aware SSA register allocator (Sampaio et al. 2013)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eliminates 100% of VGPR spills on a 654-line Monte Carlo transport
kernel (186 → 0). Total scratch traffic drops 78%. Instruction count
drops 28% (9,448 → 6,761).

The key insight from Sampaio et al. (2013, §6): on Wave64, spilling a
divergent VGPR costs 64 dwords of scratch (one dword per lane).
Spilling a uniform VGPR costs 1 dword via v_readfirstlane. The old
allocator treated all spills equally — the new one exploits the 64:1
cost ratio.

Algorithm: Cooper et al. (2001) dominator tree, Braun & Hack (2009)
loop-depth weighting, SSA liveness with PHI-aware dataflow,
divergence-weighted spill cost, greedy SSA coloring with
divergence-aware victim selection, 4-path spill codegen (remat /
uniform VGPR / divergent VGPR / SGPR), post-RA phi elimination with
free coalescing. All static memory (~30 MB), no malloc. ~1,300 lines
of C99.

Operates on SSA form before phi elimination. Enabled via:
barracuda --ssa-ra

90/91 tests pass (1 skipped, unchanged from baseline).
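
The cost model is the load-bearing piece of all that, and it is small
enough to sketch. Illustrative only; the names (spill_cost, the depth
cap of 6) are invented for exposition and the shipped ra_ssa.c
interface may differ:

    #include <stdint.h>

    /* Divergence-weighted spill cost, in the spirit of Sampaio et al.
     * plus Braun & Hack loop weighting. A use at loop depth d counts
     * roughly 10^d; a divergent value is wave_size times as expensive
     * to spill as a uniform one (64:1 on Wave64). */
    static uint64_t spill_cost(uint32_t num_uses, uint32_t loop_depth,
                               int divergent, uint32_t wave_size)
    {
        uint64_t freq = 1;
        for (uint32_t d = 0; d < loop_depth && d < 6; d++)
            freq *= 10;                   /* loop-depth weighting */
        uint64_t width = divergent ? wave_size : 1;
        return (uint64_t)num_uses * freq * width;
    }

Victim selection evicts the live value with the minimum cost, which is
why uniform values get spilled long before divergent ones.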
---
 .gitignore | 12 +
 CONTRIBUTING.md | 20 +-
 Makefile | 4 +-
 spill_analysis.txt | 103 +++
 src/amdgpu/amd_rplan.c | 144 ++++
 src/amdgpu/amdgpu.h | 22 +
 src/amdgpu/emit.c | 21 +-
 src/amdgpu/isel.c | 25 +-
 src/amdgpu/ra_ssa.c | 1457 ++++++++++++++++++++++++++++++++++++++
 src/amdgpu/ra_ssa.h | 20 +
 src/main.c | 3 +
 src/runtime/bc_runtime.c | 13 +-
 12 files changed, 1828 insertions(+), 16 deletions(-)
 create mode 100644 spill_analysis.txt
 create mode 100644 src/amdgpu/amd_rplan.c
 create mode 100644 src/amdgpu/ra_ssa.c
 create mode 100644 src/amdgpu/ra_ssa.h

diff --git a/.gitignore b/.gitignore
index d71c8ae..4003a27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,15 @@
 # OS noise
 Thumbs.db
 .DS_Store
+
+# Internal tools (not shipped with the compiler; surprises for later ;-) )
+/tools/
+
+# AI assistants — keep your configs to yourself
+.claude/
+.cursorrules
+.cursor/
+.gemini/
+.github/copilot/
+CLAUDE.md
+.aider*
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5e33fa9..7ae5898 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -84,8 +84,14 @@ When I was a kid learning Lua on Roblox, I would actually copy and paste scripts

 **What's not acceptable**

-- Generating code you don't understand - When I was writing Callout, my Call and Dispatch engine (it's what Emergency services use when dispatching a firetruck because you burnt toast and now the alarm is going off), I hit a wall. I know systems but had no idea on how to properly add a button or a UI element. I found myself relying on my Ollama model too much and eventually couldn't understand what I was making. BarraCUDA requires bit level precision as it emits machine code. If you want to submit a PR but don't understand a section of the codebase or don't understand everything, that is fine, that's being human. You are more than welcome to submit a PR, even an incomplete one, and we can discuss tradeoffs and implementations. We are all learning. Learning is what makes us, us.
-- Architecture - As above please don't make architectural decisions using a chatbot. Even then if you're making a big change in the code anyway feel free to contact me, I'm always happy to chat and open to new ideas.
+- **Generating code you don't understand** - When I was writing Callout, my Call and Dispatch engine (it's what Emergency services use when dispatching a firetruck because you burnt toast and now the alarm is going off), I hit a wall. I know systems but had no idea how to properly add a button or a UI element. I found myself relying on my Ollama model too much and eventually couldn't understand what I was making. BarraCUDA requires bit-level precision as it emits machine code. If you want to submit a PR but don't understand a section of the codebase or don't understand everything, that is fine, that's being human. You are more than welcome to submit a PR, even an incomplete one, and we can discuss tradeoffs and implementations. We are all learning. Learning is what makes us, us.
+- **Wholly synthetic undeclared code** is something we'll have to send back or rework together. If you've used an LLM, just say so — declared LLM-assisted code that you understand and can defend is absolutely fine. The copyright picture is genuinely unsettled though, so occasionally I might ask you to rewrite a section from scratch. Here's why there's caution:
+  - **Licence contamination** — LLM training data can include proprietary or incompatibly-licensed code. If it leaks into a PR, it poisons the Apache 2.0 licence for the whole project.
+  - **Copyright** — the wonderful folks in that small outfit known as "The United States Federal Government" have ruled that a human has to substantially author or alter code for it to be copyrightable. Unaltered LLM output may not be copyrightable at all, which means it can't actually be licensed under Apache 2.0. Now I'm not in the US, I'm in New Zealand, and our laws are actually more reasonable, but US lawyers aren't exactly well known for their geography knowledge.
+  - **Quality** — this is a compiler that emits GPU machine code. One wrong bit is silent data corruption. You need to understand what you're shipping.
+
+  The short version: declare your tools, understand your code, and if I ask you to rewrite something it's not personal — it's just the reality of shipping code in a world where the law hasn't caught up yet. Treat LLM output like a snippet from Stack Overflow — make sure you can explain it. And if you're struggling, that's okay. Mark a PR as a draft, raise a discussion, and I'll help when I can.
+- **Architecture** - As above, please don't make architectural decisions using a chatbot. That said, if you're making a big change in the code anyway, feel free to contact me; I'm always happy to chat and open to new ideas.
 
 ## On the Mighty Emdash
 
 You'll notice emdashes everywhere — in comments, in commit messages, in this document. I've been drawing hyphens a bit too long since primary school. I have my old books from when I was seven, and there they are — emdashes. Or hyphens. Or maybe I just didn't know what I was writing. 7-year-old me didn't leave a comment.
@@ -94,6 +100,14 @@ The point is: use them. They're better than parentheses for asides, better than
 
 The hate for emdashes is superficial and weird. I understand it's because of LLMs. But I'm not going to let some Robot dictate how I share my own thoughts in my own language.
 
+## Indigenous and Endangered Languages
+
+The first non-English language in BarraCUDA's error messages was te reo Māori. Not because it was strategic, but because I live here, these are my neighbours, and this is one of the three official languages of Aotearoa New Zealand. You can run `barracuda --lang lang/mi.txt` right now and get your errors in te reo. Kia ora, GPU.
+
+There are roughly 7,000 languages spoken on Earth. About 40% of them are endangered. When those languages disappear from technology — when every error message, every man page, every compiler diagnostic is English-only — it sends a quiet message: this isn't for you. That matters more than most developers realise. If your tools don't speak your language, you're not just reading code, you're translating infrastructure. The research on cognitive load is absolutely clear: that translation costs time, costs accuracy, and costs people who would have been brilliant engineers. When you're reading one of my glorious Abend dumps because you decided it would be a great idea to chuck your MEGAGPT10 onto your RDNA 2 stick, you should at least be able to read the consequences of your bad decisions in your own tongue.
+
+Indigenous and endangered languages are especially welcome here. Te reo Māori, Welsh, Hawaiian, Navajo, Scots Gaelic, Samoan, any of the hundreds of languages that technology has quietly decided don't matter — if you want to see your language in a compiler diagnostic, this is that project. The translation format is dead simple and needs zero compiler knowledge, which makes it a neat entry point into compiler development too! See the Where to Help section below for how.
+
 
 ## Where to Help
 
 Check `Issues` for current priorities. In general the most impactful areas are:
@@ -143,7 +157,7 @@ llvm-objdump -d --mcpu=gfx1100 output.hsaco
 
 ## License
 
-BarraCUDA is Apache 2.0. By submitting a PR, you agree your contribution is licensed under the same terms.
+BarraCUDA is Apache 2.0. By submitting a PR, you agree your contribution is licensed under the same terms and you represent that you have the right to do so — meaning the code is your own work, or derived from compatibly-licensed sources, and not copied from proprietary material.
 
 ---
 
diff --git a/Makefile b/Makefile
index b401deb..196f7c1 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ LIBS = -lm
 SOURCES = src/main.c \
 	src/fe/bc_err.c src/fe/preproc.c src/fe/lexer.c src/fe/parser.c src/fe/sema.c \
 	src/ir/bir.c src/ir/bir_print.c src/ir/bir_lower.c src/ir/bir_mem2reg.c src/ir/bir_cfold.c src/ir/bir_dce.c \
-	src/amdgpu/amd_rplan.c src/amdgpu/isel.c src/amdgpu/emit.c src/amdgpu/encode.c src/amdgpu/enc_tab.c src/amdgpu/sched.c src/amdgpu/verify.c \
+	src/amdgpu/amd_rplan.c src/amdgpu/isel.c src/amdgpu/emit.c src/amdgpu/ra_ssa.c src/amdgpu/encode.c src/amdgpu/enc_tab.c src/amdgpu/sched.c src/amdgpu/verify.c \
 	src/tensix/isel.c src/tensix/emit.c src/tensix/coarsen.c src/tensix/datamov.c
 OBJECTS = $(SOURCES:.c=.o)
 TARGET = barracuda
@@ -39,7 +39,7 @@ TSRC = tests/tmain.c tests/tsmoke.c tests/tcomp.c tests/tenc.c \
 	tests/tregalloc.c
 TOBJS = $(TSRC:.c=.o)
 COBJS = src/ir/bir.o src/ir/bir_print.o src/ir/bir_lower.o src/ir/bir_mem2reg.o src/ir/bir_cfold.o src/ir/bir_dce.o \
-	src/amdgpu/amd_rplan.o src/amdgpu/encode.o src/amdgpu/enc_tab.o src/amdgpu/isel.o src/amdgpu/emit.o src/amdgpu/sched.o src/amdgpu/verify.o \
+	src/amdgpu/amd_rplan.o src/amdgpu/encode.o src/amdgpu/enc_tab.o src/amdgpu/isel.o src/amdgpu/emit.o src/amdgpu/ra_ssa.o src/amdgpu/sched.o src/amdgpu/verify.o \
 	src/fe/bc_err.o src/fe/lexer.o src/fe/parser.o src/fe/preproc.o src/fe/sema.o \
 	src/runtime/bc_abend.o
diff --git a/spill_analysis.txt b/spill_analysis.txt
new file mode 100644
index 0000000..365d031
--- /dev/null
+++ b/spill_analysis.txt
@@ -0,0 +1,103 @@
+Divergence-Aware SSA Register Allocator — Spill Analysis
+=========================================================
+
+Date: 2026-03-14
+Kernel: Moa transport kernel (gpu/tp_kern.cu)
+        Monte Carlo neutron transport, 654 source lines
+        253 MIR blocks, 4035 virtual registers (3933V 102S)
+        3854 divergent VGPRs, 79 uniform VGPRs
+Target: GFX942 (CDNA3, MI300X), Wave64
+
+The new SSA register allocator (ra_ssa) eliminates ALL 186 VGPR spills
+on the Moa transport kernel. Total scratch traffic drops by 78%.
+Total emitted instruction count drops from 9,448 to 6,761 (28.4%).
+
+On Wave64 hardware, spilling a divergent VGPR costs 64 dwords of
+scratch, one per lane. Spilling a uniform VGPR costs 1 dword via
+v_readfirstlane. The old allocator treated all spills equally; the
+new one exploits the 64:1 cost ratio.
+
+                              Old (ra_gc)   SSA (ra_ssa)   Change
+                              -----------   -----------    -------
+VGPR spills                           186              0   -100% (HOLY!)
+SGPR spills                            21             29   +38%
+Total spills                          207             29   -86%
+Scratch ops (store+load)            1,754            392   -78%
+Scratch bytes                       1,396            272   -81%
+VGPRs used                            250            237   -5%
+SGPRs used                            102            102   0%
+Total emitted instructions          9,448          6,761   -28%
+v_readfirstlane (SGPR path)             0             39   new
+
+The 29 SGPR spills are cheap (4 bytes each via VGPR relay to scratch).
+The 39 v_readfirstlane instructions are the scalar extraction path
+for SGPR spill/reload; each replaces what would have been a 256-byte
+per-lane scratch store on the old allocator.
+
+Algorithm
+---------
+1. CFG + Cooper et al. (2001) iterative dominator tree
+2. Loop nesting depth (exponential weighting, Braun & Hack 2009)
+3. SSA liveness with PHI-aware dataflow + exec-mask region extension
+4. Divergence-aware spill cost: cost(v) = Σ depth_weight × div_weight
+   - div_weight = 64 for divergent VGPRs (Wave64 scratch cost)
+   - div_weight = 1 for uniform VGPRs (readfirstlane to scalar)
+   - div_weight = 1 for SGPRs (already scalar)
+5. Rematerialisation detection (immediate loads → cost 0)
+6. SSA coloring: domtree preorder, backward scan, greedy lowest-color
+   - Precoloring for intra-block interference resolution
+   - Divergence-weighted spill victim selection on pressure overflow
+7. Spill codegen with 4 paths:
+   A. Remat (0 bytes scratch, 1 instruction)
+   B. Uniform VGPR: v_readfirstlane → scratch (4 bytes)
+   C. Divergent VGPR: full per-lane scratch (wave_width × 4 bytes)
+   D. SGPR: v_mov to relay → scratch (4 bytes)
+8. Post-RA phi elimination with free coalescing (same color = no copy)
+
+All static memory (~30 MB), no malloc. ~1,300 lines of C99.
+Operates on SSA form before phi elimination — free PHI coalescing.
+Fallback to ra_gc/ra_lin for functions exceeding pool limits.
+
+Enabled via: barracuda --ssa-ra
+
+Files
+-----
+src/amdgpu/ra_ssa.c   — allocator implementation (approx. 1,300 lines)
+src/amdgpu/ra_ssa.h   — public interface
+src/amdgpu/amdgpu.h   — vr_divg[] bitvector, shared helpers
+src/amdgpu/isel.c     — divergence propagation to per-vreg bitvector
+src/amdgpu/emit.c     — SSA dispatch, un-static shared helpers
+src/main.c            — --ssa-ra flag
+
+References
+----------
+Sampaio, D., Souza, R. M. de, Collange, S., & Pereira, F. M. Q. (2013).
+  Divergence analysis. ACM TOPLAS 35(4), Article 13, 1-36.
+  https://doi.org/10.1145/2523815
+
+Cooper, K. D., Harvey, T. J., & Kennedy, K. (2001).
+  A simple, fast dominance algorithm.
+  Software Practice and Experience, 4, 1-10.
+
+Braun, M., & Hack, S. (2009).
+  Register spilling and live-range splitting for SSA-form programs.
+  CC 2009, LNCS 5501, pp. 174-189.
+  https://doi.org/10.1007/978-3-642-00722-4_13
+
+Yes I used Zotero because I always seem to miss something in apa7th lol.
+
+Next Steps
+----------
+- Run on MI300X hardware
+    barracuda --ssa-ra --amdgpu-bin --gfx942 gpu/tp_kern.cu
+    then: kahu a.hsaco --all
+    then: run Godiva benchmark, verify k_eff matches CPU (0.995 ± 0.001)
+- If kernel runs correctly, benchmark against ra_gc binary
+- Expected: significant speedup from 78% scratch reduction
+  (Sampaio et al. report 26.21% speedup on 395 CUDA kernels)
+
+Test Status
+-----------
+- 90/91 tests pass (1 skipped, same as before so no change there)
+- vector_add: 6 VGPRs, 0 spills, 0 scratch (both RDNA3 and CDNA3)
+- Moa kernel: binary generation succeeds (a.hsaco, 56 KB)
+- No verifier errors
diff --git a/src/amdgpu/amd_rplan.c b/src/amdgpu/amd_rplan.c
new file mode 100644
index 0000000..51db9b4
--- /dev/null
+++ b/src/amdgpu/amd_rplan.c
@@ -0,0 +1,144 @@
+#include "amdgpu.h"
+#include <string.h>
+#include <stdio.h>
+
+/*
+ * Resource planning pass for AMDGPU kernels.
+ *
+ * Scans BIR, stamps target-specific constants onto mfunc_t.
+ * Downstream passes read decisions, never ask questions.
+ * Modelled on tensix/coarsen.c — same philosophy, different planet.
+ *
+ * The GPU hardware has strong opinions and no sense of humour.
+ * Our comments compensate.
+ */ + +/* ---- Scan Statistics ---- */ + +typedef struct { + uint32_t tid[3]; /* thread ID uses per dim */ + uint32_t bid[3]; /* block ID uses per dim */ + uint32_t n_alloca; /* scratch allocations */ + uint32_t n_loads; + uint32_t n_stores; + uint32_t n_barr; /* barriers */ + uint32_t n_atom; /* atomics */ + uint32_t n_shfl; /* warp shuffles */ + uint8_t max_dim; /* highest dim used */ + uint8_t has_disp; /* uses blockDim/gridDim */ +} rp_stat_t; + +/* ---- BIR Scan ---- */ + +/* Walk the BIR for one kernel, count what it uses. + * Like a building inspector, except the building is made of + * register pressure and false confidence. */ +static void rp_scan(const bir_module_t *bir, const bir_func_t *F, + rp_stat_t *st) +{ + memset(st, 0, sizeof(*st)); + int guard = 262144; + + for (uint32_t bi = 0; bi < F->num_blocks && guard > 0; bi++, guard--) { + const bir_block_t *B = &bir->blocks[F->first_block + bi]; + + for (uint32_t ii = 0; ii < B->num_insts && guard > 0; ii++, guard--) { + const bir_inst_t *I = &bir->insts[B->first_inst + ii]; + uint32_t dim = I->subop < 3 ? I->subop : 0; + + switch (I->op) { + case BIR_THREAD_ID: + st->tid[dim]++; + if (dim > st->max_dim) st->max_dim = (uint8_t)dim; + break; + case BIR_BLOCK_ID: + st->bid[dim]++; + if (dim > st->max_dim) st->max_dim = (uint8_t)dim; + break; + case BIR_BLOCK_DIM: + case BIR_GRID_DIM: + st->has_disp = 1; + if (dim > st->max_dim) st->max_dim = (uint8_t)dim; + break; + case BIR_ALLOCA: st->n_alloca++; break; + case BIR_LOAD: st->n_loads++; break; + case BIR_STORE: st->n_stores++; break; + case BIR_BARRIER: + case BIR_BARRIER_GROUP: st->n_barr++; break; + + case BIR_ATOMIC_ADD: case BIR_ATOMIC_SUB: + case BIR_ATOMIC_AND: case BIR_ATOMIC_OR: case BIR_ATOMIC_XOR: + case BIR_ATOMIC_MIN: case BIR_ATOMIC_MAX: + case BIR_ATOMIC_XCHG: case BIR_ATOMIC_CAS: + case BIR_ATOMIC_LOAD: case BIR_ATOMIC_STORE: + st->n_atom++; break; + + case BIR_SHFL: case BIR_SHFL_UP: + case BIR_SHFL_DOWN: case BIR_SHFL_XOR: + case BIR_BALLOT: case BIR_VOTE_ANY: case BIR_VOTE_ALL: + st->n_shfl++; break; + + default: break; + } + } + } + if (st->max_dim > 2) st->max_dim = 2; +} + +/* ---- Target Allocation ---- */ + +/* Stamp target-specific constants. Every field here is a decision + * that used to be an is_cdna() call scattered across isel and emit. + * Now the argument happens once, up front, and everyone else just + * reads the memo. */ +static void rp_alloc(mfunc_t *MF, amd_target_t tgt, const rp_stat_t *st) +{ + int cdna = (tgt <= AMD_TARGET_GFX942); + + MF->exec_w = cdna ? 1 : 0; + MF->smem_hz = cdna ? 1 : 0; + MF->scr_afs = cdna ? 1 : 0; + MF->rp_pad = 0; + MF->imp_sgp = cdna ? 6 : 0; + MF->sgp_min = cdna ? 2 : 0; + MF->wavefront_size = cdna ? AMD_WAVE64 : AMD_WAVE_SIZE; + + /* RSRC1 static mode bits — computed once, OR'd into rsrc1 by emit. + * The per-kernel VGPR/SGPR block fields come from regalloc and + * get combined separately. Target constants and per-kernel + * arithmetic stay in their proper lanes, like civilised traffic. 
 */
+    MF->r1_mode = (3u << 16) |  /* FLOAT_DENORM_32 = preserve */
+                  (3u << 18) |  /* FLOAT_DENORM_16_64 = preserve */
+                  (1u << 21) |  /* DX10_CLAMP */
+                  (1u << 23);   /* IEEE_MODE */
+    if (!cdna) {
+        MF->r1_mode |= (1u << 29) |  /* WGP_MODE (GFX10+ RSRC1 bit 29) */
+                       (1u << 30);   /* MEM_ORDERED (bit 30) */
+    }
+
+    /* Stamp dispatch/dim info from scan */
+    MF->needs_dispatch = st->has_disp;
+    MF->max_dim = st->max_dim;
+}
+
+/* ---- Public Entry ---- */
+
+void amd_rplan(amd_module_t *A)
+{
+    int guard = 8192;
+    for (uint32_t fi = 0; fi < A->num_mfuncs && guard > 0; fi++, guard--) {
+        mfunc_t *MF = &A->mfuncs[fi];
+        if (!MF->is_kernel) continue;
+
+        rp_stat_t st;
+        rp_scan(A->bir, &A->bir->funcs[MF->bir_func], &st);
+        rp_alloc(MF, A->target, &st);
+
+        const char *name = A->bir->strings + MF->name;
+        printf("  rplan %s: wave%u, sgp_imp=%u, scratch=%s, "
+               "%u loads, %u stores\n",
+               name, MF->wavefront_size, MF->imp_sgp,
+               st.n_alloca ? "yes" : "no",
+               st.n_loads, st.n_stores);
+    }
+}
diff --git a/src/amdgpu/amdgpu.h b/src/amdgpu/amdgpu.h
index b4bfeee..da60cf1 100644
--- a/src/amdgpu/amdgpu.h
+++ b/src/amdgpu/amdgpu.h
@@ -432,6 +432,7 @@ typedef struct {
     uint32_t vreg_count;
     uint16_t reg_map[AMD_MAX_VREGS];   /* vreg -> phys reg */
     uint8_t  reg_file[AMD_MAX_VREGS];  /* 0=SGPR, 1=VGPR */
+    uint32_t vr_divg[AMD_MAX_VREGS / 32]; /* 1=divergent per vreg */
 
     /* BIR value -> machine vreg mapping */
     uint32_t val_vreg[BIR_MAX_INSTS];  /* BIR inst index -> vreg */
@@ -479,9 +480,30 @@ int amdgpu_emit_elf(amd_module_t *amd, const char *path);
 /* Set to 1 to force linear scan instead of graph coloring */
 extern int amd_ra_lin;
 
+/* Set to 1 for divergence-aware SSA register allocation */
+extern int amd_ra_ssa;
+
 /* If non-zero, cap available VGPRs for regalloc (forces spills for testing) */
 extern int amd_max_vgpr;
 
+/* ---- Divergence Helpers ---- */
+
+static inline int vr_div(const amd_module_t *A, uint16_t vr)
+{ return (int)((A->vr_divg[vr / 32] >> (vr % 32)) & 1u); }
+
+static inline void vr_sdiv(amd_module_t *A, uint16_t vr)
+{ A->vr_divg[vr / 32] |= 1u << (vr % 32); }
+
+/* ---- Shared Helpers (emit.c, un-static for ra_ssa.c) ---- */
+
+uint16_t op_vreg(const moperand_t *op);
+void rw_ops(amd_module_t *A, const mfunc_t *F);
+void dce_copy(amd_module_t *A, const mfunc_t *F);
+void fin_regs(const amd_module_t *A, mfunc_t *F);
+
+/* SSA register allocator (ra_ssa.c) */
+void ra_ssa(amd_module_t *A, uint32_t mf_idx);
+
 /* Encoding table (defined in amdgpu_emit.c) */
 extern const amd_enc_entry_t amd_enc_table[AMD_OP_COUNT];
 extern const amd_enc_entry_t amd_enc_table_gfx10[AMD_OP_COUNT];
diff --git a/src/amdgpu/emit.c b/src/amdgpu/emit.c
index bb761b0..8a5d887 100644
--- a/src/amdgpu/emit.c
+++ b/src/amdgpu/emit.c
@@ -201,12 +201,14 @@ static int interval_cmp_start(const void *a, const void *b)
 }
 
 /* Get the vreg referenced by an operand, or 0xFFFF if not a vreg */
-static uint16_t operand_vreg(const moperand_t *op)
+uint16_t op_vreg(const moperand_t *op)
 {
     if (op->kind == MOP_VREG_S || op->kind == MOP_VREG_V)
         return op->reg_num;
     return 0xFFFF;
 }
+/* Legacy name used throughout this file */
+static uint16_t operand_vreg(const moperand_t *op) { return op_vreg(op); }
 
 static uint32_t coalesce(amd_module_t *A, const mfunc_t *F);
 
@@ -556,7 +558,7 @@ static void expire_old(uint32_t point)
 }
 
 /* Rewrite virtual reg operands to physical */
-static void rw_ops(amd_module_t *A, const mfunc_t *F)
+void rw_ops(amd_module_t *A, const mfunc_t *F)
 {
     for (uint32_t bi = 0; bi < F->num_blocks; bi++) {
         const mblock_t *MB = &A->mblocks[F->first_block + bi];
@@ -615,7
+617,7 @@ static void rw_ops(amd_module_t *A, const mfunc_t *F) These appear when regalloc assigns the same phys reg to both sides of a copy. Harmless but noisy — like a postman delivering a letter back to the sender. */ -static void dce_copy(amd_module_t *A, const mfunc_t *F) +void dce_copy(amd_module_t *A, const mfunc_t *F) { for (uint32_t bi = 0; bi < F->num_blocks; bi++) { const mblock_t *MB = &A->mblocks[F->first_block + bi]; @@ -637,7 +639,7 @@ static void dce_copy(amd_module_t *A, const mfunc_t *F) } } -static void fin_regs(const amd_module_t *A, mfunc_t *F) +void fin_regs(const amd_module_t *A, mfunc_t *F) { /* Minimum 1 SGPR/VGPR for the descriptor */ if (F->num_sgprs == 0) F->num_sgprs = 1; @@ -1967,17 +1969,20 @@ static void ra_gc(amd_module_t *A, uint32_t mf_idx) /* Global flag: set by --no-graphcolor to force linear scan */ int amd_ra_lin = 0; +/* SSA-based divergence-aware allocator */ +int amd_ra_ssa = 0; /* If non-zero, cap available VGPRs for regalloc (forces spills for testing) */ int amd_max_vgpr = 0; static void ra_func(amd_module_t *A, uint32_t mf_idx) { - if (amd_ra_lin || A->vreg_count > RA_MAX_NODES) { + if (amd_ra_ssa) { + ra_ssa(A, mf_idx); + } else if (amd_ra_lin || A->vreg_count > RA_MAX_NODES) { ra_lin(A, mf_idx); } else { ra_gc(A, mf_idx); } - } /* ---- Assembly Text Printer ---- */ @@ -2230,7 +2235,9 @@ static void emit_asm_function(amd_module_t *A, uint32_t mf_idx) void amdgpu_regalloc(amd_module_t *A) { - amdgpu_phi_elim(A); + /* SSA path does its own phi elimination post-allocation */ + if (!amd_ra_ssa) + amdgpu_phi_elim(A); for (uint32_t fi = 0; fi < A->num_mfuncs; fi++) ra_func(A, fi); } diff --git a/src/amdgpu/isel.c b/src/amdgpu/isel.c index 6375ca0..e061edb 100644 --- a/src/amdgpu/isel.c +++ b/src/amdgpu/isel.c @@ -350,15 +350,35 @@ static uint32_t new_vreg(int is_vector) return AMD_MAX_VREGS - 1; /* saturate — better than wandering into the void */ S.amd->vreg_count = v + 1; S.amd->reg_file[v] = (uint8_t)is_vector; + /* Propagate divergence to per-vreg bitvector. + * Most paths: is_vector correlates with divergence. + * FP ops force VGPR even when uniform — caller uses new_vrd. */ + if (is_vector) vr_sdiv(S.amd, (uint16_t)v); return v; } -/* Map a BIR instruction result to a virtual register */ +/* Create vreg with explicit divergence (for FP ops on uniform data) */ +static uint32_t new_vrd(int is_vec, int is_div) +{ + uint32_t v = S.amd->vreg_count; + if (v >= AMD_MAX_VREGS - 1) + return AMD_MAX_VREGS - 1; + S.amd->vreg_count = v + 1; + S.amd->reg_file[v] = (uint8_t)is_vec; + if (is_div) vr_sdiv(S.amd, (uint16_t)v); + return v; +} + +/* Map a BIR instruction result to a virtual register. + * is_vector picks the register file (SGPR vs VGPR). + * Divergence comes from BIR-level analysis — FP ops may be + * VGPR (is_vector=1) but uniform (!divergent). */ static uint32_t map_bir_val(uint32_t bir_inst, int is_vector) { if (bir_inst < BIR_MAX_INSTS && S.amd->val_vreg[bir_inst] != 0xFFFFFFFF) return S.amd->val_vreg[bir_inst]; - uint32_t v = new_vreg(is_vector); + int div = (bir_inst < BIR_MAX_INSTS) ? 
is_divergent(bir_inst) : is_vector;
+    uint32_t v = new_vrd(is_vector, div);
     if (bir_inst < BIR_MAX_INSTS) {
         S.amd->val_vreg[bir_inst] = v;
         S.amd->val_file[bir_inst] = (uint8_t)is_vector;
@@ -2605,6 +2625,7 @@ int amdgpu_compile(const bir_module_t *bir, amd_module_t *amd)
     memset(amd->val_file, 0, sizeof(amd->val_file));
     memset(amd->reg_map, 0, sizeof(amd->reg_map));
     memset(amd->reg_file, 0, sizeof(amd->reg_file));
+    memset(amd->vr_divg, 0, sizeof(amd->vr_divg));
     memset(amd->val_sbase, 0xFF, sizeof(amd->val_sbase));
     memset(amd->val_scroff, 0xFF, sizeof(amd->val_scroff)); /* -1 = dynamic */
diff --git a/src/amdgpu/ra_ssa.c b/src/amdgpu/ra_ssa.c
new file mode 100644
index 0000000..2ec8fa4
--- /dev/null
+++ b/src/amdgpu/ra_ssa.c
@@ -0,0 +1,1457 @@
+#include "amdgpu.h"
+#include <string.h>
+#include <stdio.h>
+
+/*
+ * Divergence-aware SSA register allocator for AMDGCN.
+ *
+ * On Wave64 (CDNA3/MI300X), spilling a divergent VGPR costs 64 dwords
+ * of scratch, one per lane. Spilling a uniform VGPR costs 1 dword —
+ * extract via v_readfirstlane, store as scalar, broadcast back on
+ * reload. The previous allocator had no knowledge of this 64:1 cost
+ * asymmetry, producing 1,754 scratch operations on the Moa transport
+ * kernel and treating every eviction like a democracy where all
+ * values are created equal. They are not. Some values are 64× more
+ * equal than others.
+ *
+ * References:
+ *   Sampaio et al. (2013) "Divergence Analysis", ACM TOPLAS 35(4) §6
+ *   Cooper et al. (2001) "A Simple, Fast Dominance Algorithm"
+ *   Braun & Hack (2009) "Register Spilling for SSA-Form Programs"
+ *
+ * Dependencies: libc, faith in fixed-point algorithms, strong tea.
+ */
+
+/* ---- Pool Limits ---- */
+
+#define RS_MAX_BLK   4096
+#define RS_MAX_VR    8192
+#define RS_BV_WDS    ((RS_MAX_VR + 31) / 32)
+
+/* Spill relay registers — shared with emit.c */
+#define RS_RELAY_V0  250
+#define RS_VGPR_CEIL 250
+#define RS_RELAY_S   99
+#define RS_RELAY_S2  98
+#define RS_MAX_SPILL 512
+
+/* ---- Static Pools ---- */
+
+/* CFG */
+static uint16_t rs_succ[RS_MAX_BLK * 2]; /* successors (max 2 per block) */
+static uint8_t  rs_nsuc[RS_MAX_BLK];
+static uint16_t rs_pred[RS_MAX_BLK * 4]; /* predecessors */
+static uint8_t  rs_nprd[RS_MAX_BLK];
+static uint16_t rs_poff[RS_MAX_BLK];     /* pred list offset */
+
+/* Dominator tree */
+static uint16_t rs_idom[RS_MAX_BLK];
+static uint16_t rs_rpo[RS_MAX_BLK];      /* RPO number per block */
+static uint16_t rs_rord[RS_MAX_BLK];     /* block at RPO position i */
+
+/* Loop depth */
+static uint16_t rs_ldep[RS_MAX_BLK];
+
+/* Liveness bitvectors */
+static uint32_t rs_lin[RS_MAX_BLK * RS_BV_WDS];
+static uint32_t rs_lout[RS_MAX_BLK * RS_BV_WDS];
+static uint32_t rs_bdef[RS_MAX_BLK * RS_BV_WDS];
+static uint32_t rs_buse[RS_MAX_BLK * RS_BV_WDS];
+
+/* Coloring */
+static uint16_t rs_col[RS_MAX_VR];   /* physical reg, 0xFFFF = uncolored */
+static uint8_t  rs_spd[RS_MAX_VR];   /* 1 = spilled */
+static uint32_t rs_cost[RS_MAX_VR];  /* weighted spill cost */
+
+/* Remat info */
+typedef struct { uint16_t op; int32_t imm; } rs_rmat_t;
+static rs_rmat_t rs_rmat[RS_MAX_VR];
+
+/* Dominator preorder */
+static uint16_t rs_dpord[RS_MAX_BLK];
+static uint16_t rs_dchld[RS_MAX_BLK * 8]; /* domtree children, packed */
+static uint16_t rs_dcoff[RS_MAX_BLK];     /* child list offset */
+static uint8_t  rs_dcnt[RS_MAX_BLK];      /* child count */
+
+/* Spill slot tracking */
+static struct {
+    uint16_t vreg;
+    uint16_t off;   /* byte offset in scratch */
+} rs_spill[RS_MAX_SPILL];
+static uint32_t rs_nspill;
+
+/* Spill slot offset lookup */
+static uint16_t
rs_soff_tbl[RS_MAX_VR]; + +/* Expansion buffer for spill codegen */ +#define RS_EXPBUF 32768 +static minst_t rs_ebuf[RS_EXPBUF]; + +/* ---- Bitvector Helpers ---- */ + +static inline void bv_set(uint32_t *bv, uint16_t bit) +{ bv[bit / 32] |= 1u << (bit % 32); } + +static inline void bv_clr(uint32_t *bv, uint16_t bit) +{ bv[bit / 32] &= ~(1u << (bit % 32)); } + +static inline int bv_tst(const uint32_t *bv, uint16_t bit) +{ return (int)((bv[bit / 32] >> (bit % 32)) & 1u); } + +/* ---- Phase 2: CFG + Dominator Tree ---- */ + +/* Is this instruction a block terminator? */ +static int rs_term(uint16_t op) +{ + return op == AMD_S_BRANCH || op == AMD_S_CBRANCH_SCC0 || + op == AMD_S_CBRANCH_SCC1 || op == AMD_S_CBRANCH_EXECZ || + op == AMD_S_CBRANCH_EXECNZ || op == AMD_S_ENDPGM || + op == AMD_S_SETPC_B64; +} + +/* Build CFG successors + predecessors. Same edge detection as + * ra_build_cfg but without the dramatics. */ +static void rs_cfg(const amd_module_t *A, const mfunc_t *F) +{ + uint16_t nb = F->num_blocks; + if (nb > RS_MAX_BLK) nb = RS_MAX_BLK; + + memset(rs_nsuc, 0, nb); + memset(rs_nprd, 0, nb); + + /* Pass 1: build successor lists by scanning terminators. + * Must scan ALL terminators backward from end of block, not + * just the last instruction — a block can have a conditional + * branch followed by an unconditional branch. Copying the + * ra_build_cfg pattern from emit.c because ignoring it the + * first time orphaned 174 blocks in the domtree. Lesson: + * "same edge detection" means SAME edge detection. */ + for (uint16_t bi = 0; bi < nb; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + int has_uncond = 0; + + /* Empty blocks fall through — don't skip them! */ + if (MB->num_insts > 0) { + for (uint32_t ii = MB->num_insts; ii > 0; ii--) { + const minst_t *mi = &A->minsts[MB->first_inst + ii - 1]; + if (!rs_term(mi->op)) break; + + if (mi->op == AMD_S_ENDPGM || mi->op == AMD_S_SETPC_B64) { + has_uncond = 1; + } else if (mi->op == AMD_S_BRANCH) { + has_uncond = 1; + if (mi->num_uses > 0 && + mi->operands[mi->num_defs].kind == MOP_LABEL) { + uint32_t tgt = (uint32_t)mi->operands[mi->num_defs].imm; + if (tgt >= F->first_block && + tgt < F->first_block + nb && + rs_nsuc[bi] < 2) { + rs_succ[bi * 2 + rs_nsuc[bi]++] = + (uint16_t)(tgt - F->first_block); + } + } + } else { + /* Conditional branch */ + if (mi->num_uses > 0 && + mi->operands[mi->num_defs].kind == MOP_LABEL) { + uint32_t tgt = (uint32_t)mi->operands[mi->num_defs].imm; + if (tgt >= F->first_block && + tgt < F->first_block + nb && + rs_nsuc[bi] < 2) { + rs_succ[bi * 2 + rs_nsuc[bi]++] = + (uint16_t)(tgt - F->first_block); + } + } + } + } + } + + /* Fallthrough: if no unconditional branch/endpgm, next block + * is a successor. Empty blocks always fall through. + * Getting this wrong orphans hundreds of blocks because empty + * separator blocks break the CFG chain. 
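+         *
+         * Concretely: a block whose last instruction is s_cbranch_scc1
+         * has two successors, the branch label and block bi+1; a block
+         * with no instructions at all has exactly one, bi+1. Miss either
+         * edge and every block behind it drops out of the domtree.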
 */
+        if (!has_uncond && bi + 1 < nb && rs_nsuc[bi] < 2)
+            rs_succ[bi * 2 + rs_nsuc[bi]++] = (uint16_t)(bi + 1);
+    }
+
+    /* Pass 2: build predecessor lists from successors */
+    /* First count preds per block */
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        for (uint8_t s = 0; s < rs_nsuc[bi]; s++) {
+            uint16_t tgt = rs_succ[bi * 2 + s];
+            if (tgt < nb && rs_nprd[tgt] < 255)
+                rs_nprd[tgt]++;
+        }
+    }
+
+    /* Compute offsets */
+    uint16_t poff = 0;
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        rs_poff[bi] = poff;
+        poff += rs_nprd[bi];
+        if (poff > RS_MAX_BLK * 4) poff = RS_MAX_BLK * 4;
+    }
+
+    /* Fill predecessor lists */
+    memset(rs_nprd, 0, nb);
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        for (uint8_t s = 0; s < rs_nsuc[bi]; s++) {
+            uint16_t tgt = rs_succ[bi * 2 + s];
+            if (tgt >= nb) continue;
+            uint16_t off = (uint16_t)(rs_poff[tgt] + rs_nprd[tgt]);
+            if (off < RS_MAX_BLK * 4) {
+                rs_pred[off] = bi;
+                rs_nprd[tgt]++;
+            }
+        }
+    }
+}
+
+/* Iterative stack-based RPO (no recursion, as the gods of avionics demand) */
+static uint16_t rs_rpo_nb;
+
+static void rs_bld_rpo(uint16_t nb)
+{
+    static uint8_t vis[RS_MAX_BLK];
+    memset(vis, 0, nb);
+
+    /* DFS stack: (block, next_succ_to_visit) */
+    static struct { uint16_t blk; uint8_t si; } stk[RS_MAX_BLK];
+    uint16_t top = 0;
+    uint16_t rpo_pos = nb;
+
+    stk[top].blk = 0;
+    stk[top].si = 0;
+    top++;
+    vis[0] = 1;
+
+    uint32_t guard = 0;
+    while (top > 0 && guard < RS_MAX_BLK * 4) {
+        guard++;
+        uint16_t b = stk[top - 1].blk;
+        uint8_t si = stk[top - 1].si;
+
+        if (si < rs_nsuc[b]) {
+            stk[top - 1].si = si + 1;
+            uint16_t s = rs_succ[b * 2 + si];
+            if (s < nb && !vis[s]) {
+                vis[s] = 1;
+                if (top < RS_MAX_BLK) {
+                    stk[top].blk = s;
+                    stk[top].si = 0;
+                    top++;
+                }
+            }
+        } else {
+            top--;
+            if (rpo_pos > 0) {
+                rpo_pos--;
+                rs_rord[rpo_pos] = b;
+                rs_rpo[b] = rpo_pos;
+            }
+        }
+    }
+
+    /* Unreachable blocks take the remaining (lowest) RPO slots, in
+     * front of the entry; rs_dom skips them because they never gain
+     * a processed predecessor. */
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        if (!vis[bi] && rpo_pos > 0) {
+            rpo_pos--;
+            rs_rord[rpo_pos] = bi;
+            rs_rpo[bi] = rpo_pos;
+        }
+    }
+
+    rs_rpo_nb = nb;
+}
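+
+/* The Cooper et al. two-finger intersection, pulled out as a
+ * standalone sketch for reference (nothing below calls it; rs_dom
+ * inlines the same walk so its guard counter stays shared): */
+static inline uint16_t rs_intersect(uint16_t f1, uint16_t f2, uint16_t nb)
+{
+    uint32_t ig = (uint32_t)nb * 2;
+    while (f1 != f2 && ig-- > 0) {
+        /* Higher RPO number = further from entry; walk that finger up */
+        while (rs_rpo[f1] > rs_rpo[f2] && ig-- > 0)
+            f1 = rs_idom[f1];
+        while (rs_rpo[f2] > rs_rpo[f1] && ig-- > 0)
+            f2 = rs_idom[f2];
+    }
+    return f1;
+}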
+
+/* Cooper et al. (2001) iterative dominator algorithm.
+ * Converges in 2-3 passes for reducible CFGs. Bounded at 4*nb
+ * iterations because we trust the algorithm but verify anyway. */
+static void rs_dom(uint16_t nb)
+{
+    /* Sentinel: 0xFFFF = undefined */
+    for (uint16_t i = 0; i < nb; i++)
+        rs_idom[i] = 0xFFFF;
+    rs_idom[0] = 0;  /* entry dominates itself */
+
+    int changed = 1;
+    uint32_t guard = (uint32_t)nb * 4;
+
+    while (changed && guard-- > 0) {
+        changed = 0;
+        /* Traverse in RPO (skip entry) */
+        for (uint16_t ri = 0; ri < nb; ri++) {
+            uint16_t b = rs_rord[ri];
+            if (b == 0) continue;  /* entry */
+
+            /* Find first processed predecessor */
+            uint16_t new_idom = 0xFFFF;
+            for (uint8_t pi = 0; pi < rs_nprd[b]; pi++) {
+                uint16_t p = rs_pred[rs_poff[b] + pi];
+                if (rs_idom[p] != 0xFFFF) {
+                    new_idom = p;
+                    break;
+                }
+            }
+            if (new_idom == 0xFFFF) continue;
+
+            /* Intersect with other processed predecessors */
+            for (uint8_t pi = 0; pi < rs_nprd[b]; pi++) {
+                uint16_t p = rs_pred[rs_poff[b] + pi];
+                if (p == new_idom || rs_idom[p] == 0xFFFF) continue;
+
+                /* Walk up the domtree from both fingers */
+                uint16_t f1 = p, f2 = new_idom;
+                uint32_t ig = nb * 2;
+                while (f1 != f2 && ig-- > 0) {
+                    while (rs_rpo[f1] > rs_rpo[f2] && ig-- > 0)
+                        f1 = rs_idom[f1];
+                    while (rs_rpo[f2] > rs_rpo[f1] && ig-- > 0)
+                        f2 = rs_idom[f2];
+                }
+                new_idom = f1;
+            }
+
+            if (rs_idom[b] != new_idom) {
+                rs_idom[b] = new_idom;
+                changed = 1;
+            }
+        }
+    }
+}
+
+/* ---- Phase 3: Loop Depth ---- */
+
+/* Detect back edges (B→H where H dominates B) and compute
+ * natural loop bodies. Each loop increments nesting depth.
+ * Like counting how many Russian dolls you're inside, except
+ * each doll has its own register pressure crisis. */
+static void rs_loop(uint16_t nb)
+{
+    memset(rs_ldep, 0, nb * sizeof(uint16_t));
+
+    static uint8_t  in_loop[RS_MAX_BLK];
+    static uint16_t wstk[RS_MAX_BLK];  /* worklist for body collection */
+
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        for (uint8_t si = 0; si < rs_nsuc[bi]; si++) {
+            uint16_t hdr = rs_succ[bi * 2 + si];
+            if (hdr >= nb) continue;
+
+            /* Back edge: target dominates source */
+            uint16_t d = bi;
+            int is_back = 0;
+            uint32_t ig = nb;
+            while (d != 0xFFFF && ig-- > 0) {
+                if (d == hdr) { is_back = 1; break; }
+                if (d == rs_idom[d]) break;
+                d = rs_idom[d];
+            }
+            if (!is_back) continue;
+
+            /* Collect natural loop body: reverse BFS from bi to hdr */
+            memset(in_loop, 0, nb);
+            in_loop[hdr] = 1;
+            if (bi != hdr) {
+                in_loop[bi] = 1;
+                uint16_t wtop = 0;
+                wstk[wtop++] = bi;
+
+                uint32_t wg = nb * 2;
+                while (wtop > 0 && wg-- > 0) {
+                    uint16_t n = wstk[--wtop];
+                    for (uint8_t pi = 0; pi < rs_nprd[n]; pi++) {
+                        uint16_t p = rs_pred[rs_poff[n] + pi];
+                        if (p < nb && !in_loop[p]) {
+                            in_loop[p] = 1;
+                            if (wtop < RS_MAX_BLK)
+                                wstk[wtop++] = p;
+                        }
+                    }
+                }
+            }
+
+            /* Increment depth for all loop body blocks */
+            for (uint16_t j = 0; j < nb; j++) {
+                if (in_loop[j] && rs_ldep[j] < 16)
+                    rs_ldep[j]++;
+            }
+        }
+    }
+}
+
+/* ---- Phase 4: SSA Liveness ---- */
+
+/* PHI uses belong to predecessor blocks, not the PHI's block.
+ * Getting this wrong means PHI sources appear live-in at the
+ * def block instead of live-out at the pred block. The register
+ * allocator then thinks the value is needed earlier than it is,
+ * and 200 perfectly good registers go to waste. Ask LLVM — they
+ * got this wrong twice in their SSA liveness (PR19462, PR33947).
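+ *
+ * Concretely: given v5 = PHI [v3, B1], [v4, B2], v3 counts as used
+ * at the end of B1 and v4 at the end of B2; neither shows up live-in
+ * at the PHI's own block unless something else there reads it.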
*/ + +static void rs_live(const amd_module_t *A, const mfunc_t *F, + uint16_t nb, uint16_t nv) +{ + uint32_t bv_words = (uint32_t)((nv + 31) / 32); + if (bv_words > RS_BV_WDS) bv_words = RS_BV_WDS; + + uint32_t bv_bytes = bv_words * 4; + memset(rs_bdef, 0, (size_t)nb * bv_bytes); + memset(rs_buse, 0, (size_t)nb * bv_bytes); + memset(rs_lin, 0, (size_t)nb * bv_bytes); + memset(rs_lout, 0, (size_t)nb * bv_bytes); + + /* Pass 1: compute per-block defs and uses */ + for (uint16_t bi = 0; bi < nb; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + uint32_t *def = &rs_bdef[(uint32_t)bi * bv_words]; + uint32_t *use = &rs_buse[(uint32_t)bi * bv_words]; + + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + const minst_t *mi = &A->minsts[MB->first_inst + ii]; + + if (mi->op == AMD_PSEUDO_PHI) { + /* PHI defs are in this block */ + for (uint8_t d = 0; d < mi->num_defs; d++) { + uint16_t vr = op_vreg(&mi->operands[d]); + if (vr < nv) bv_set(def, vr); + } + /* PHI uses go to predecessor blocks — handled below */ + continue; + } + + /* Normal instruction: uses before defs (upward exposed) */ + uint8_t total = mi->num_defs + mi->num_uses; + if (total > MINST_MAX_OPS) total = MINST_MAX_OPS; + + for (uint8_t k = mi->num_defs; k < total; k++) { + uint16_t vr = op_vreg(&mi->operands[k]); + if (vr < nv && !bv_tst(def, vr)) + bv_set(use, vr); + } + for (uint8_t d = 0; d < mi->num_defs; d++) { + uint16_t vr = op_vreg(&mi->operands[d]); + if (vr < nv) bv_set(def, vr); + } + } + } + + /* Pass 1b: PHI uses belong to predecessor gen sets. + * For each PHI operand pair (pred_block, value), if the value + * is a vreg not defined in pred_block, it's upward-exposed. */ + for (uint16_t bi = 0; bi < nb; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + const minst_t *mi = &A->minsts[MB->first_inst + ii]; + if (mi->op != AMD_PSEUDO_PHI) continue; + + for (uint8_t p = 0; p + 1 < mi->num_uses; p += 2) { + uint8_t off = mi->num_defs + p; + if (off + 1 >= MINST_MAX_OPS) break; + if (mi->operands[off].kind != MOP_LABEL) continue; + + uint32_t pred_mb = (uint32_t)mi->operands[off].imm; + if (pred_mb < F->first_block || + pred_mb >= F->first_block + nb) continue; + uint16_t pred_rel = (uint16_t)(pred_mb - F->first_block); + + uint16_t vr = op_vreg(&mi->operands[off + 1]); + if (vr < nv) { + uint32_t *puse = &rs_buse[(uint32_t)pred_rel * bv_words]; + uint32_t *pdef = &rs_bdef[(uint32_t)pred_rel * bv_words]; + if (!bv_tst(pdef, vr)) + bv_set(puse, vr); + } + } + } + } + + /* Pass 2: backward dataflow iteration to fixpoint. + * live_in[b] = use[b] | (live_out[b] - def[b]) + * live_out[b] = U live_in[s] for each successor s + * Bounded at 200 iterations — overkill for reducible CFGs, + * but the sort of paranoia that keeps compilers honest. 
*/ + int changed = 1; + uint32_t guard = 200; + + while (changed && guard-- > 0) { + changed = 0; + /* Process in reverse RPO for faster convergence */ + for (int ri = (int)nb - 1; ri >= 0; ri--) { + uint16_t b = rs_rord[ri]; + uint32_t *lin = &rs_lin[(uint32_t)b * bv_words]; + uint32_t *lout = &rs_lout[(uint32_t)b * bv_words]; + uint32_t *def = &rs_bdef[(uint32_t)b * bv_words]; + uint32_t *use = &rs_buse[(uint32_t)b * bv_words]; + + /* live_out = union of successor live_ins */ + for (uint8_t si = 0; si < rs_nsuc[b]; si++) { + uint16_t s = rs_succ[b * 2 + si]; + if (s >= nb) continue; + const uint32_t *sin = &rs_lin[(uint32_t)s * bv_words]; + for (uint32_t w = 0; w < bv_words; w++) + lout[w] |= sin[w]; + } + + /* live_in = use | (live_out - def) */ + for (uint32_t w = 0; w < bv_words; w++) { + uint32_t new_in = use[w] | (lout[w] & ~def[w]); + if (new_in != lin[w]) { + lin[w] = new_in; + changed = 1; + } + } + } + } + + /* Pass 3: exec-mask region extension. + * Values alive across a saveexec→restore pair must survive + * the entire masked region. Without this, the allocator + * sees the last use inside the mask, frees the register, + * and exec restores to find its value reupholstered. */ + { + struct { uint32_t save; uint16_t sblk; uint32_t rest; uint16_t rblk; } + eregion[64]; + uint32_t n_er = 0; + uint32_t estack[32]; + uint16_t eblk[32]; + uint32_t esp = 0; + + for (uint16_t bi = 0; bi < nb && n_er < 64; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + uint32_t mi_idx = MB->first_inst + ii; + const minst_t *mi = &A->minsts[mi_idx]; + + if (mi->op == AMD_S_AND_SAVEEXEC_B64 || + mi->op == AMD_S_AND_SAVEEXEC_B32) { + if (esp < 32) { + estack[esp] = mi_idx; + eblk[esp] = bi; + esp++; + } + continue; + } + + if ((mi->op == AMD_S_OR_B64 || mi->op == AMD_S_OR_B32 || + mi->op == AMD_S_XOR_B64 || mi->op == AMD_S_XOR_B32) && + mi->num_defs > 0 && + mi->operands[0].kind == MOP_SPECIAL && + mi->operands[0].imm == AMD_SPEC_EXEC) { + if (esp > 0 && n_er < 64) { + esp--; + eregion[n_er].save = estack[esp]; + eregion[n_er].sblk = eblk[esp]; + eregion[n_er].rest = mi_idx; + eregion[n_er].rblk = bi; + n_er++; + } + } + } + } + + /* For each exec region, any vreg live-out at the saveexec + * block must also be live-in at the restore block and all + * blocks in between. We approximate by extending live-out + * of saveexec block into live-in of all blocks from sblk+1 + * to rblk inclusive. */ + for (uint32_t e = 0; e < n_er; e++) { + uint16_t sb = eregion[e].sblk; + uint16_t rb = eregion[e].rblk; + const uint32_t *slout = &rs_lout[(uint32_t)sb * bv_words]; + + for (uint16_t bi = (uint16_t)(sb + 1); bi <= rb && bi < nb; bi++) { + uint32_t *lin = &rs_lin[(uint32_t)bi * bv_words]; + uint32_t *lout2 = &rs_lout[(uint32_t)bi * bv_words]; + for (uint32_t w = 0; w < bv_words; w++) { + lin[w] |= slout[w]; + lout2[w] |= slout[w]; + } + } + } + } + + /* Pass 4: prologue SGPR pinning. + * System SGPRs defined in block 0 that escape must survive + * to the function's end. Same as emit.c:398-433. 
 */
+    if (nb > 1) {
+        const uint32_t *def0 = &rs_bdef[0];
+        for (uint16_t v = 0; v < nv; v++) {
+            if (A->reg_file[v] != 0) continue;  /* SGPRs only */
+            if (!bv_tst(def0, v)) continue;
+            /* If live-out of block 0, pin to live everywhere */
+            if (bv_tst(&rs_lout[0], v)) {
+                for (uint16_t bi = 1; bi < nb; bi++) {
+                    bv_set(&rs_lin[(uint32_t)bi * bv_words], v);
+                    bv_set(&rs_lout[(uint32_t)bi * bv_words], v);
+                }
+            }
+        }
+    }
+}
+
+/* ---- Phase 5: Divergence-Aware Spill Cost ---- */
+
+/* Power-of-10 depth weight, clamped at depth 8.
+ * Inner loops are exponentially more expensive to spill into.
+ * Braun & Hack (2009) use similar exponential weighting. */
+static uint32_t rs_dwgt(uint16_t depth)
+{
+    static const uint32_t tbl[] = {
+        1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000
+    };
+    if (depth > 8) depth = 8;
+    return tbl[depth];
+}
+
+/* Compute divergence-aware spill cost per vreg.
+ * Sampaio et al. (2013) §6: multiply by wave_width for divergent,
+ * 1 for uniform. The 64:1 ratio means we'll strongly prefer
+ * spilling the boring uniform pointers over the precious per-lane
+ * particle state. Physics appreciates this. */
+static void rs_dcst(const amd_module_t *A, const mfunc_t *F,
+                    uint16_t nb, uint16_t nv)
+{
+    memset(rs_cost, 0, (size_t)nv * sizeof(uint32_t));
+
+    uint32_t wave_w = (F->exec_w) ? 64u : 32u;
+
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        const mblock_t *MB = &A->mblocks[F->first_block + bi];
+        uint32_t dw = rs_dwgt(rs_ldep[bi]);
+
+        for (uint32_t ii = 0; ii < MB->num_insts; ii++) {
+            const minst_t *mi = &A->minsts[MB->first_inst + ii];
+            uint8_t total = mi->num_defs + mi->num_uses;
+            if (total > MINST_MAX_OPS) total = MINST_MAX_OPS;
+
+            for (uint8_t k = 0; k < total; k++) {
+                uint16_t vr = op_vreg(&mi->operands[k]);
+                if (vr >= nv) continue;
+
+                uint32_t div_w = 1;
+                if (A->reg_file[vr] == 1 && vr_div(A, vr))
+                    div_w = wave_w;
+
+                /* Saturating add — 32-bit overflow protection */
+                uint32_t add = dw * div_w;
+                if (rs_cost[vr] + add < rs_cost[vr])
+                    rs_cost[vr] = 0xFFFFFFFF;
+                else
+                    rs_cost[vr] += add;
+            }
+        }
+    }
+}
+
+/* Detect rematerialisable vregs — constants and simple scalar ops.
+ * Remat is free: no scratch, no memory traffic, just re-emit the
+ * instruction. These get cost 0, making them preferred spill victims.
+ * Like asking "who wants to go to the scratch car park?" and the
+ * constants raise their hands because they know the way back. */
+static void rs_rdet(const amd_module_t *A, const mfunc_t *F,
+                    uint16_t nb, uint16_t nv)
+{
+    memset(rs_rmat, 0, (size_t)nv * sizeof(rs_rmat_t));
+
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        const mblock_t *MB = &A->mblocks[F->first_block + bi];
+        for (uint32_t ii = 0; ii < MB->num_insts; ii++) {
+            const minst_t *mi = &A->minsts[MB->first_inst + ii];
+            if (mi->num_defs != 1) continue;
+
+            uint16_t vr = op_vreg(&mi->operands[0]);
+            if (vr >= nv) continue;
+
+            /* s_mov_b32 vr, <imm> */
+            if (mi->op == AMD_S_MOV_B32 && mi->num_uses == 1 &&
+                mi->operands[1].kind == MOP_IMM) {
+                rs_rmat[vr].op = AMD_S_MOV_B32;
+                rs_rmat[vr].imm = mi->operands[1].imm;
+                rs_cost[vr] = 0;
+                continue;
+            }
+
+            /* v_mov_b32 vr, <imm> */
+            if (mi->op == AMD_V_MOV_B32 && mi->num_uses == 1 &&
+                mi->operands[1].kind == MOP_IMM) {
+                rs_rmat[vr].op = AMD_V_MOV_B32;
+                rs_rmat[vr].imm = mi->operands[1].imm;
+                rs_cost[vr] = 0;
+                continue;
+            }
+
+            /* Uniform VGPR defined by v_mov_b32 vr, sN — remat from SGPR.
+             * Sampaio et al. §6.2: if the SGPR is still live at the reload
+             * point, just re-emit v_mov_b32. We record it optimistically;
+             * spill codegen checks availability. */
+            if (mi->op == AMD_V_MOV_B32 && mi->num_uses == 1 &&
+                (mi->operands[1].kind == MOP_VREG_S ||
+                 mi->operands[1].kind == MOP_SGPR) &&
+                !vr_div(A, vr)) {
+                rs_rmat[vr].op = AMD_V_MOV_B32;
+                rs_rmat[vr].imm = (int32_t)mi->operands[1].reg_num;
+                rs_cost[vr] = 0;
+            }
+        }
+    }
+}
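+
+/* Convenience view of the table above (a sketch; nothing else in this
+ * patch calls it): a vreg reloads for free iff rs_rdet recorded a
+ * defining move for it. */
+static inline int rs_is_remat(uint16_t vr)
+{
+    return vr < RS_MAX_VR && rs_rmat[vr].op != 0;
+}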
+
+/* ---- Phase 6: SSA Coloring with Divergence-Aware Spilling ---- */
+
+/* Build dominator tree children lists and compute preorder.
+ * Preorder = parents before children. In SSA, defs dominate
+ * all uses, so processing a block after its dominator guarantees
+ * all live-in values (from ancestor defs) are already colored.
+ * Postorder gets this backwards and everything lands on v0
+ * like commuters on the same train seat. */
+static uint16_t rs_bdpo(uint16_t nb)
+{
+    memset(rs_dcnt, 0, nb);
+    memset(rs_dcoff, 0, nb * sizeof(uint16_t));
+
+    /* Count children per node */
+    for (uint16_t b = 1; b < nb; b++) {
+        uint16_t p = rs_idom[b];
+        if (p < nb && rs_dcnt[p] < 8)
+            rs_dcnt[p]++;
+    }
+
+    /* Compute offsets */
+    uint16_t off = 0;
+    for (uint16_t b = 0; b < nb; b++) {
+        rs_dcoff[b] = off;
+        off += rs_dcnt[b];
+        if (off > RS_MAX_BLK * 8) off = RS_MAX_BLK * 8;
+    }
+
+    /* Fill children (re-zero counts as fill index) */
+    memset(rs_dcnt, 0, nb);
+    for (uint16_t b = 1; b < nb; b++) {
+        uint16_t p = rs_idom[b];
+        if (p >= nb) continue;
+        uint16_t ci = (uint16_t)(rs_dcoff[p] + rs_dcnt[p]);
+        if (ci < RS_MAX_BLK * 8) {
+            rs_dchld[ci] = b;
+            rs_dcnt[p]++;
+        }
+    }
+
+    /* Iterative preorder of domtree (parents first, then children) */
+    static uint16_t stk[RS_MAX_BLK];
+    uint16_t top = 0, n = 0;
+
+    stk[top++] = 0;
+
+    uint32_t guard = nb * 4;
+    while (top > 0 && n < nb && guard-- > 0) {
+        uint16_t b = stk[--top];
+        rs_dpord[n++] = b;
+
+        /* Push children in reverse order so first child pops first */
+        for (int ci = (int)rs_dcnt[b] - 1; ci >= 0; ci--) {
+            uint16_t child = rs_dchld[rs_dcoff[b] + ci];
+            if (top < RS_MAX_BLK)
+                stk[top++] = child;
+        }
+    }
+    return n;
+}
+
+/* Greedy SSA coloring. Process blocks in dominator preorder,
+ * backward-scan maintaining a live set. When pressure exceeds
+ * the physical limit, spill the cheapest vreg per divergence cost.
+ *
+ * No IFG bitmatrix needed — interference from live set at each
+ * def point. Saves 8 MB vs ra_gc. The mathematics of not
+ * preallocating an O(n²) structure when n is already 8192. */
+static uint32_t rs_aloc(amd_module_t *A, mfunc_t *F,
+                        uint16_t nb, uint16_t nv)
+{
+    memset(rs_col, 0xFF, (size_t)nv * 2);  /* 0xFFFF = uncolored */
+    memset(rs_spd, 0, nv);
+
+    uint32_t bv_words = (uint32_t)((nv + 31) / 32);
+    if (bv_words > RS_BV_WDS) bv_words = RS_BV_WDS;
+
+    uint16_t n_dpo = rs_bdpo(nb);
+    uint32_t n_spill = 0;
+
+    uint16_t sgpr_floor = F->is_kernel ?
F->first_alloc_sgpr : 0; + if (sgpr_floor < AMD_KERN_MIN_RESERVED && F->is_kernel) + sgpr_floor = AMD_KERN_MIN_RESERVED; + + uint16_t vgpr_ceil = RS_VGPR_CEIL; + if (amd_max_vgpr > 0 && (uint16_t)amd_max_vgpr < vgpr_ceil) + vgpr_ceil = (uint16_t)amd_max_vgpr; + + /* Live set bitvector, reused per block */ + static uint32_t live[RS_BV_WDS]; + /* Color-in-use sets for finding free registers */ + static uint8_t s_used[AMD_MAX_SGPRS]; + static uint8_t v_used[AMD_MAX_VGPRS]; + + uint16_t max_sgpr = 0, max_vgpr = 0; + + for (uint16_t di = 0; di < n_dpo; di++) { + uint16_t b = rs_dpord[di]; + const mblock_t *MB = &A->mblocks[F->first_block + b]; + if (MB->num_insts == 0) continue; + + /* Initialize live set to live-out of this block */ + memcpy(live, &rs_lout[(uint32_t)b * bv_words], bv_words * 4); + + /* Backward scan. At each instruction we must: + * 1. Add uses to live set FIRST — they're alive at this point + * 2. Then process defs: color against current live set, remove + * This ensures the def interferes with its own instruction's + * uses. Getting this backwards puts everything on v0 because + * the live set is empty when we try to color. SSA's gift: + * each def is unique, so no use == def conflicts. */ + for (int ii = (int)MB->num_insts - 1; ii >= 0; ii--) { + uint32_t mi_idx = MB->first_inst + (uint32_t)ii; + const minst_t *mi = &A->minsts[mi_idx]; + + if (mi->op == AMD_S_NOP || mi->op == AMD_PSEUDO_DEF) + continue; + + /* Step 1: add uses to live set (they're alive here) */ + if (mi->op != AMD_PSEUDO_PHI) { + uint8_t total = mi->num_defs + mi->num_uses; + if (total > MINST_MAX_OPS) total = MINST_MAX_OPS; + for (uint8_t k = mi->num_defs; k < total; k++) { + uint16_t vr = op_vreg(&mi->operands[k]); + if (vr < nv) + bv_set(live, vr); + } + } + + /* Step 2: process defs — color against live set, remove */ + for (uint8_t d = 0; d < mi->num_defs; d++) { + uint16_t vr = op_vreg(&mi->operands[d]); + if (vr >= nv || rs_spd[vr]) continue; + + /* Remove def from live set (born here, doesn't exist before) */ + bv_clr(live, vr); + + /* Already colored (e.g., PHI processed from another block) */ + if (rs_col[vr] != 0xFFFF) continue; + + uint8_t file = A->reg_file[vr]; + + /* Collect colors of all interfering live vregs */ + memset(s_used, 0, AMD_MAX_SGPRS); + memset(v_used, 0, AMD_MAX_VGPRS); + + for (uint16_t w = 0; w < nv; w++) { + if (!bv_tst(live, w)) continue; + if (rs_spd[w]) continue; + if (A->reg_file[w] != file) continue; + if (rs_col[w] == 0xFFFF) { + /* Uncolored live vreg from an earlier def in + * this block — backward scan hasn't reached + * its def yet. Precolor it below. */ + } + /* Mark color as used (skip uncolored — handled below + * by precoloring after the main color scan) */ + if (rs_col[w] != 0xFFFF) { + if (file == 0 && rs_col[w] < AMD_MAX_SGPRS) + s_used[rs_col[w]] = 1; + else if (file == 1 && rs_col[w] < AMD_MAX_VGPRS) + v_used[rs_col[w]] = 1; + } + } + + /* Pre-color uncolored live vregs in same file. + * Greedy: assign each the lowest free color. + * Rebuild used-set incrementally after each assignment. 
 */
+                for (uint16_t w = 0; w < nv; w++) {
+                    if (!bv_tst(live, w)) continue;
+                    if (rs_spd[w]) continue;
+                    if (A->reg_file[w] != file) continue;
+                    if (rs_col[w] != 0xFFFF) continue;
+
+                    uint16_t pc = 0xFFFF;
+                    if (file == 0) {
+                        for (uint16_t r = sgpr_floor; r < AMD_MAX_SGPRS; r++) {
+                            if (r == RS_RELAY_S || r == RS_RELAY_S2) continue;
+                            if (!s_used[r]) { pc = r; break; }
+                        }
+                        if (pc != 0xFFFF) { rs_col[w] = pc; s_used[pc] = 1; }
+                    } else {
+                        for (uint16_t r = 0; r < vgpr_ceil; r++) {
+                            if (!v_used[r]) { pc = r; break; }
+                        }
+                        if (pc != 0xFFFF) { rs_col[w] = pc; v_used[pc] = 1; }
+                    }
+                    if (pc != 0xFFFF) {
+                        if (file == 0 && pc + 1 > max_sgpr) max_sgpr = pc + 1;
+                        else if (file == 1 && pc + 1 > max_vgpr) max_vgpr = pc + 1;
+                    }
+                }
+
+                /* Find lowest free color */
+                uint16_t color = 0xFFFF;
+                if (file == 0) {
+                    for (uint16_t r = sgpr_floor; r < AMD_MAX_SGPRS; r++) {
+                        if (r == RS_RELAY_S || r == RS_RELAY_S2) continue;
+                        if (!s_used[r]) { color = r; break; }
+                    }
+                } else {
+                    for (uint16_t r = 0; r < vgpr_ceil; r++) {
+                        if (!v_used[r]) { color = r; break; }
+                    }
+                }
+
+                if (color != 0xFFFF) {
+                    rs_col[vr] = color;
+                    if (file == 0 && color + 1 > max_sgpr)
+                        max_sgpr = color + 1;
+                    else if (file == 1 && color + 1 > max_vgpr)
+                        max_vgpr = color + 1;
+                } else {
+                    /* Pressure exceeded — spill cheapest.
+                     * Among def + all interfering live vregs in same file,
+                     * find the one with lowest cost. Remat first (cost 0),
+                     * then uniform VGPRs, then divergent VGPRs last. */
+                    uint16_t victim = vr;
+                    uint32_t vcost = rs_cost[vr];
+
+                    for (uint16_t w = 0; w < nv; w++) {
+                        if (!bv_tst(live, w)) continue;
+                        if (A->reg_file[w] != file) continue;
+                        if (rs_spd[w]) continue;
+                        if (rs_col[w] == 0xFFFF) continue;
+                        if (rs_cost[w] < vcost) {
+                            vcost = rs_cost[w];
+                            victim = w;
+                        }
+                    }
+
+                    if (victim != vr && rs_col[victim] != 0xFFFF) {
+                        /* Evict victim, steal its color */
+                        color = rs_col[victim];
+                        rs_col[victim] = 0xFFFF;
+                        rs_spd[victim] = 1;
+                        rs_col[vr] = color;
+                        n_spill++;
+                    } else {
+                        /* Spill ourselves — no victim cheaper */
+                        rs_spd[vr] = 1;
+                        n_spill++;
+                    }
+                }
+            }
+        }
+    }
+
+    F->num_sgprs = max_sgpr;
+    if (F->is_kernel && F->num_sgprs < F->first_alloc_sgpr)
+        F->num_sgprs = F->first_alloc_sgpr;
+    F->num_vgprs = max_vgpr;
+
+    return n_spill;
+}
+
+/* ---- Phase 7: Divergence-Aware Spill Codegen ---- */
+
+/* Insert spill/reload code for each spilled vreg.
+ * Four codegen paths depending on divergence:
+ *
+ * Path A (remat): re-emit the defining instruction before each use.
+ *   Cost: 0 bytes scratch, 1 instruction.
+ *
+ * Path B (uniform VGPR): v_readfirstlane to scalar, store 4 bytes.
+ *   Cost: 4 bytes scratch. 64× cheaper than divergent.
+ *
+ * Path C (divergent VGPR): full scratch store/load per lane.
+ *   Cost: wave_width × 4 bytes scratch. The expensive case.
+ *
+ * Path D (SGPR): v_mov to VGPR relay, store 4 bytes.
+ *   Cost: 4 bytes scratch.
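+ *
+ * Worked numbers (Wave64): one Path C store moves 64 × 4 = 256 bytes
+ * of scratch traffic, so a store+reload pair costs 512 bytes; the
+ * same value through Path B moves 8 bytes. That is the 64:1 asymmetry
+ * the victim selection in rs_aloc leans on.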
*/ +static void rs_spin(amd_module_t *A, mfunc_t *F, uint16_t nb, uint16_t nv) +{ + rs_nspill = 0; + + /* Assign scratch offsets to spilled vregs */ + memset(rs_soff_tbl, 0, (size_t)nv * sizeof(uint16_t)); + uint32_t scr_off = F->scratch_bytes; + for (uint16_t v = 0; v < nv; v++) { + if (!rs_spd[v]) continue; + if (rs_rmat[v].op != 0) continue; /* remat, no scratch needed */ + if (rs_nspill >= RS_MAX_SPILL) break; + rs_spill[rs_nspill].vreg = v; + rs_spill[rs_nspill].off = (uint16_t)scr_off; + rs_soff_tbl[v] = (uint16_t)scr_off; + scr_off += 4; + rs_nspill++; + } + F->scratch_bytes = scr_off; + + /* Find scratch FP SGPR — scan for existing scratch op */ + uint16_t scr_sgpr = 0; + for (uint16_t bi = 0; bi < nb && scr_sgpr == 0; bi++) { + const mblock_t *MB = &A->mblocks[F->first_block + bi]; + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + const minst_t *mi = &A->minsts[MB->first_inst + ii]; + if (mi->op == AMD_SCRATCH_LOAD_DWORD || + mi->op == AMD_SCRATCH_STORE_DWORD) { + /* SADDR operand */ + for (uint8_t k = 0; k < mi->num_defs + mi->num_uses && k < MINST_MAX_OPS; k++) { + if (mi->operands[k].kind == MOP_VREG_S || + mi->operands[k].kind == MOP_SGPR) { + scr_sgpr = mi->operands[k].reg_num; + break; + } + } + if (scr_sgpr) break; + } + } + } + + /* Process each block: expand into rs_ebuf, copy back */ + for (uint16_t bi = 0; bi < nb; bi++) { + mblock_t *MB = &A->mblocks[F->first_block + bi]; + uint32_t en = 0; /* expansion count */ + int any_spill = 0; + + /* Check if this block has any spilled vreg references */ + for (uint32_t ii = 0; ii < MB->num_insts && !any_spill; ii++) { + const minst_t *mi = &A->minsts[MB->first_inst + ii]; + uint8_t total = mi->num_defs + mi->num_uses; + if (total > MINST_MAX_OPS) total = MINST_MAX_OPS; + for (uint8_t k = 0; k < total; k++) { + uint16_t vr = op_vreg(&mi->operands[k]); + if (vr < nv && rs_spd[vr]) { any_spill = 1; break; } + } + } + if (!any_spill) continue; + + /* Expand block instructions into buffer */ + for (uint32_t ii = 0; ii < MB->num_insts; ii++) { + minst_t *mi = &A->minsts[MB->first_inst + ii]; + + if (mi->op == AMD_S_NOP || mi->op == AMD_PSEUDO_DEF) { + if (en < RS_EXPBUF) rs_ebuf[en++] = *mi; + continue; + } + + uint8_t total = mi->num_defs + mi->num_uses; + if (total > MINST_MAX_OPS) total = MINST_MAX_OPS; + + /* Pre-instruction: reload spilled uses */ + for (uint8_t k = mi->num_defs; k < total; k++) { + uint16_t vr = op_vreg(&mi->operands[k]); + if (vr >= nv || !rs_spd[vr]) continue; + + if (rs_rmat[vr].op != 0) { + /* Path A: rematerialise */ + uint16_t nv2 = (uint16_t)A->vreg_count; + if (nv2 < AMD_MAX_VREGS - 1) { + A->vreg_count = nv2 + 1; + A->reg_file[nv2] = A->reg_file[vr]; + } + if (en < RS_EXPBUF) { + minst_t *rm = &rs_ebuf[en++]; + memset(rm, 0, sizeof(minst_t)); + rm->op = rs_rmat[vr].op; + rm->num_defs = 1; + rm->num_uses = 1; + rm->operands[0].kind = A->reg_file[vr] ? 
MOP_VREG_V : MOP_VREG_S;
+                        rm->operands[0].reg_num = nv2;
+                        rm->operands[1].kind = MOP_IMM;
+                        rm->operands[1].imm = rs_rmat[vr].imm;
+                    }
+                    mi->operands[k].reg_num = nv2;
+                } else {
+                    /* Path B/C/D: scratch reload */
+                    uint16_t relay = (uint16_t)(RS_RELAY_V0 + (k % 3));
+                    uint16_t soff = rs_soff_tbl[vr];
+
+                    /* scratch_load_dword vRelay, sScrFP, offset */
+                    if (en < RS_EXPBUF) {
+                        minst_t *ld = &rs_ebuf[en++];
+                        memset(ld, 0, sizeof(minst_t));
+                        ld->op = AMD_SCRATCH_LOAD_DWORD;
+                        ld->num_defs = 1;
+                        ld->num_uses = 2;
+                        ld->operands[0].kind = MOP_VGPR;
+                        ld->operands[0].reg_num = relay;
+                        ld->operands[1].kind = MOP_SGPR;
+                        ld->operands[1].reg_num = scr_sgpr;
+                        ld->operands[2].kind = MOP_IMM;
+                        ld->operands[2].imm = (int32_t)soff;
+                    }
+                    /* s_waitcnt vmcnt(0) */
+                    if (en < RS_EXPBUF) {
+                        minst_t *wt = &rs_ebuf[en++];
+                        memset(wt, 0, sizeof(minst_t));
+                        wt->op = AMD_S_WAITCNT;
+                        wt->flags = AMD_WAIT_VMCNT0;
+                    }
+
+                    if (A->reg_file[vr] == 0) {
+                        /* Path D: SGPR — readfirstlane from relay */
+                        uint16_t sr = (k % 2) ? RS_RELAY_S2 : RS_RELAY_S;
+                        if (en < RS_EXPBUF) {
+                            minst_t *rf = &rs_ebuf[en++];
+                            memset(rf, 0, sizeof(minst_t));
+                            rf->op = AMD_V_READFIRSTLANE_B32;
+                            rf->num_defs = 1;
+                            rf->num_uses = 1;
+                            rf->operands[0].kind = MOP_SGPR;
+                            rf->operands[0].reg_num = sr;
+                            rf->operands[1].kind = MOP_VGPR;
+                            rf->operands[1].reg_num = relay;
+                        }
+                        /* Patch the use to read the SGPR relay directly */
+                        mi->operands[k].kind = MOP_SGPR;
+                        mi->operands[k].reg_num = sr;
+                    } else {
+                        /* Path B/C: VGPR — use relay directly */
+                        mi->operands[k].kind = MOP_VGPR;
+                        mi->operands[k].reg_num = relay;
+                    }
+                }
+            }
+
+            /* Emit the instruction itself */
+            if (en < RS_EXPBUF) rs_ebuf[en++] = *mi;
+
+            /* Post-instruction: store spilled defs */
+            for (uint8_t d = 0; d < mi->num_defs; d++) {
+                uint16_t vr = op_vreg(&mi->operands[d]);
+                if (vr >= nv || !rs_spd[vr]) continue;
+                if (rs_rmat[vr].op != 0) continue;  /* remat, no store */
+
+                uint16_t relay = (uint16_t)(RS_RELAY_V0 + (d % 3));
+                uint16_t soff = rs_soff_tbl[vr];
+
+                if (A->reg_file[vr] == 0) {
+                    /* Path D: SGPR def — move to VGPR relay first */
+                    uint16_t sr = (d % 2) ?
RS_RELAY_S2 : RS_RELAY_S;
+                    /* Retarget the emitted copy of the instruction to
+                     * def the SGPR relay. Patching mi itself would be
+                     * lost when the expansion buffer is copied back. */
+                    rs_ebuf[en - 1].operands[d].kind = MOP_SGPR;
+                    rs_ebuf[en - 1].operands[d].reg_num = sr;
+                    if (en < RS_EXPBUF) {
+                        minst_t *vm = &rs_ebuf[en++];
+                        memset(vm, 0, sizeof(minst_t));
+                        vm->op = AMD_V_MOV_B32;
+                        vm->num_defs = 1;
+                        vm->num_uses = 1;
+                        vm->operands[0].kind = MOP_VGPR;
+                        vm->operands[0].reg_num = relay;
+                        vm->operands[1].kind = MOP_SGPR;
+                        vm->operands[1].reg_num = sr;
+                    }
+                } else {
+                    /* Path B/C: VGPR def — redirect to relay */
+                    /* Patch the instruction's def to write relay */
+                    rs_ebuf[en - 1].operands[d].kind = MOP_VGPR;
+                    rs_ebuf[en - 1].operands[d].reg_num = relay;
+                }
+
+                /* scratch_store_dword vRelay, sScrFP, offset */
+                if (en < RS_EXPBUF) {
+                    minst_t *st = &rs_ebuf[en++];
+                    memset(st, 0, sizeof(minst_t));
+                    st->op = AMD_SCRATCH_STORE_DWORD;
+                    st->num_defs = 0;
+                    st->num_uses = 3;
+                    st->operands[0].kind = MOP_VGPR;
+                    st->operands[0].reg_num = relay;
+                    st->operands[1].kind = MOP_SGPR;
+                    st->operands[1].reg_num = scr_sgpr;
+                    st->operands[2].kind = MOP_IMM;
+                    st->operands[2].imm = (int32_t)soff;
+                }
+
+                /* Fence */
+                if (en < RS_EXPBUF) {
+                    minst_t *wt = &rs_ebuf[en++];
+                    memset(wt, 0, sizeof(minst_t));
+                    wt->op = AMD_S_WAITCNT;
+                    wt->flags = AMD_WAIT_VMCNT0;
+                }
+            }
+        }
+
+        if (en == 0) continue;
+        if (en > RS_EXPBUF) en = RS_EXPBUF;
+
+        /* Copy expanded block back. If it grew, shift subsequent insts. */
+        uint32_t old_ninst = MB->num_insts;
+        int32_t delta = (int32_t)en - (int32_t)old_ninst;
+
+        if (delta > 0) {
+            /* Grow: shift tail right */
+            uint32_t tail_start = MB->first_inst + old_ninst;
+            uint32_t tail_len = A->num_minsts - tail_start;
+            if (A->num_minsts + (uint32_t)delta > AMD_MAX_MINSTS) continue;
+            memmove(&A->minsts[tail_start + (uint32_t)delta],
+                    &A->minsts[tail_start],
+                    tail_len * sizeof(minst_t));
+            A->num_minsts += (uint32_t)delta;
+            for (uint16_t lb = (uint16_t)(bi + 1); lb < nb; lb++)
+                A->mblocks[F->first_block + lb].first_inst += (uint32_t)delta;
+        } else if (delta < 0) {
+            /* Shrink: shift tail left */
+            uint32_t shrink = (uint32_t)(-delta);
+            uint32_t tail_start = MB->first_inst + old_ninst;
+            uint32_t tail_len = A->num_minsts - tail_start;
+            memmove(&A->minsts[tail_start - shrink],
+                    &A->minsts[tail_start],
+                    tail_len * sizeof(minst_t));
+            A->num_minsts -= shrink;
+            for (uint16_t lb = (uint16_t)(bi + 1); lb < nb; lb++)
+                A->mblocks[F->first_block + lb].first_inst -= shrink;
+        }
+
+        /* Copy expanded instructions into place */
+        memcpy(&A->minsts[MB->first_inst], rs_ebuf, en * sizeof(minst_t));
+        MB->num_insts = en;
+    }
+}
+
+/* ---- Phase 8: Post-RA Phi Elimination ---- */
+
+/* Eliminate PHIs after register allocation. The key advantage:
+ * PHI sources and dests with the same color need no copy at all.
+ * Free coalescing — the SSA allocator's party trick.
+ *
+ * Cycle caveat: PHI copies on one edge can form a permutation
+ * cycle (A→B, B→A); breaking one needs a relay temporary and a
+ * bounded scan of pending copies per edge, which rs_phie does
+ * not yet attempt.
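+ *
+ * Example: given v12 = PHI [v7, B1], [v9, B2], if rs_col[v7] ==
+ * rs_col[v12] then the B1 edge needs no copy at all; only the B2
+ * edge pays a v_mov/s_mov into v12's color.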
+static void rs_phie(amd_module_t *A, mfunc_t *F, uint16_t nb)
+{
+    /* Collect copies from PHIs */
+    #define RS_PHI_MAX 4096
+    typedef struct {
+        uint32_t pred_mb;
+        uint16_t dst_col;    /* physical reg of dst */
+        uint16_t src_vr;     /* source vreg (to look up color) */
+        uint8_t  file;       /* 0=SGPR, 1=VGPR */
+        moperand_t src_op;   /* original source operand */
+    } rs_phi_t;
+
+    static rs_phi_t rs_phis[RS_PHI_MAX];
+    uint32_t np = 0;
+
+    for (uint16_t bi = 0; bi < nb; bi++) {
+        mblock_t *MB = &A->mblocks[F->first_block + bi];
+        for (uint32_t ii = 0; ii < MB->num_insts; ii++) {
+            minst_t *mi = &A->minsts[MB->first_inst + ii];
+            if (mi->op != AMD_PSEUDO_PHI) continue;
+
+            uint16_t dst_vr = op_vreg(&mi->operands[0]);
+            if (dst_vr == 0xFFFF) goto nop_phi;
+            /* TODO: a spilled phi dest needs a scratch store, not a
+             * register copy; skip rather than copy to a bogus color */
+            if (rs_spd[dst_vr]) goto nop_phi;
+
+            uint16_t dst_col = rs_col[dst_vr];
+            uint8_t file = A->reg_file[dst_vr];
+
+            for (uint8_t p = 0; p + 1 < mi->num_uses && np < RS_PHI_MAX; p += 2) {
+                uint8_t off = mi->num_defs + p;
+                if (off + 1 >= MINST_MAX_OPS) break;
+                if (mi->operands[off].kind != MOP_LABEL) continue;
+
+                uint32_t pred_mb = (uint32_t)mi->operands[off].imm;
+                uint16_t src_vr = op_vreg(&mi->operands[off + 1]);
+
+                /* If same color → free coalesce, no copy needed */
+                if (src_vr != 0xFFFF && rs_col[src_vr] == dst_col &&
+                    !rs_spd[src_vr])
+                    continue;
+
+                rs_phis[np].pred_mb = pred_mb;
+                rs_phis[np].dst_col = dst_col;
+                rs_phis[np].src_vr = src_vr;
+                rs_phis[np].file = file;
+                rs_phis[np].src_op = mi->operands[off + 1];
+                np++;
+            }
+
+nop_phi:
+            mi->op = AMD_S_NOP;
+            mi->num_defs = 0;
+            mi->num_uses = 0;
+        }
+    }
+
+    if (np == 0) return;
+
+    /* Count copies per predecessor block */
+    static uint32_t cpb[RS_MAX_BLK];
+    memset(cpb, 0, nb * sizeof(uint32_t));
+    for (uint32_t i = 0; i < np; i++) {
+        uint32_t pred = rs_phis[i].pred_mb;
+        if (pred >= F->first_block && pred < F->first_block + nb) {
+            uint32_t rel = pred - F->first_block;
+            cpb[rel]++;
+        }
+    }
+
+    /* Insert copies before terminators, processing blocks in reverse
+     * so the shifts cannot disturb blocks not yet processed.
+     * Same memmove ballet as amdgpu_phi_elim(). */
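+    /* Worked example of the shift (illustrative numbers): a block
+     * [I0 I1 BR] at first_inst = 10 with two pending copies:
+     *
+     *   before :  10:I0  11:I1  12:BR  13:NEXT ...
+     *   memmove:  10:I0  11:I1  12:??  13:??  14:BR  15:NEXT ...
+     *   insert :  10:I0  11:I1  12:C0  13:C1  14:BR  15:NEXT ...
+     *
+     * The scan below finds insert_rel = 2 (insert_abs = 12, the
+     * first trailing terminator); the tail slides right by `copies`,
+     * and first_inst of every later block moves the same amount. */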
+    for (int mb = (int)nb - 1; mb >= 0; mb--) {
+        uint16_t b = (uint16_t)mb;
+        uint32_t copies = cpb[b];
+        if (copies == 0) continue;
+        /* Out of space: drop this edge's copies (same overflow
+         * policy as the spill expansion above) */
+        if (A->num_minsts + copies > AMD_MAX_MINSTS) continue;
+
+        mblock_t *MB = &A->mblocks[F->first_block + b];
+
+        /* Find insertion point: before trailing terminators */
+        uint32_t insert_rel = MB->num_insts;
+        for (uint32_t ti = MB->num_insts; ti > 0; ti--) {
+            if (rs_term(A->minsts[MB->first_inst + ti - 1].op))
+                insert_rel = ti - 1;
+            else
+                break;
+        }
+        uint32_t insert_abs = MB->first_inst + insert_rel;
+
+        /* Shift tail */
+        uint32_t tail_len = A->num_minsts - insert_abs;
+        memmove(&A->minsts[insert_abs + copies],
+                &A->minsts[insert_abs],
+                tail_len * sizeof(minst_t));
+
+        /* Insert copies */
+        uint32_t ci = 0;
+        uint32_t pred_abs = F->first_block + b;
+        for (uint32_t i = 0; i < np && ci < copies; i++) {
+            if (rs_phis[i].pred_mb != pred_abs) continue;
+
+            minst_t *copy = &A->minsts[insert_abs + ci];
+            memset(copy, 0, sizeof(minst_t));
+
+            uint16_t dst_col = rs_phis[i].dst_col;
+            uint16_t src_col = 0xFFFF;
+            uint16_t src_vr = rs_phis[i].src_vr;
+
+            if (src_vr != 0xFFFF && !rs_spd[src_vr])
+                src_col = rs_col[src_vr];
+
+            /* Build copy instruction */
+            if (rs_phis[i].file == 1) {
+                copy->op = AMD_V_MOV_B32;
+                copy->num_defs = 1;
+                copy->num_uses = 1;
+                copy->operands[0].kind = MOP_VGPR;
+                copy->operands[0].reg_num = dst_col;
+                if (src_col != 0xFFFF) {
+                    copy->operands[1].kind = MOP_VGPR;
+                    copy->operands[1].reg_num = src_col;
+                } else {
+                    /* Source is an immediate: use the original
+                     * operand. (A spilled source would need a reload
+                     * here -- TODO; rw_ops maps it to 0xFFFF.) */
+                    copy->operands[1] = rs_phis[i].src_op;
+                }
+            } else {
+                copy->op = AMD_S_MOV_B32;
+                copy->num_defs = 1;
+                copy->num_uses = 1;
+                copy->operands[0].kind = MOP_SGPR;
+                copy->operands[0].reg_num = dst_col;
+                if (src_col != 0xFFFF) {
+                    copy->operands[1].kind = MOP_SGPR;
+                    copy->operands[1].reg_num = src_col;
+                } else {
+                    copy->operands[1] = rs_phis[i].src_op;
+                }
+            }
+            ci++;
+        }
+
+        A->num_minsts += copies;
+        MB->num_insts += copies;
+        for (uint16_t later = (uint16_t)(b + 1); later < nb; later++)
+            A->mblocks[F->first_block + later].first_inst += copies;
+    }
+
+    #undef RS_PHI_MAX
+}
+
+/* ---- Phase 9: Integration ---- */
+
+/* Write allocated colors to reg_map for rw_ops() consumption.
+ * rw_ops then rewrites operands from virtual to physical and turns
+ * PSEUDO_COPY into real MOVs; this function only builds the map. */
+static void rs_wmap(amd_module_t *A, uint16_t nv)
+{
+    for (uint16_t v = 0; v < nv; v++) {
+        if (rs_spd[v]) {
+            A->reg_map[v] = 0xFFFF;
+        } else if (rs_col[v] == 0xFFFF) {
+            /* Uncolored, unspilled vreg — dead code in orphan blocks.
+             * Map to reg 0 (v0/s0) so rw_ops doesn't choke on 0xFFFF;
+             * the instruction is unreachable anyway. */
+            A->reg_map[v] = 0;
+        } else {
+            A->reg_map[v] = rs_col[v];
+        }
+    }
+}
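+
+/* Sketch only, not wired in: a cheap post-map invariant check.
+ * Assumes <assert.h>; the relay constants must never appear as
+ * colors, since rs_spin hands them out unconditionally. */
+#if 0
+static void rs_wmap_check(const amd_module_t *A, uint16_t nv)
+{
+    for (uint16_t v = 0; v < nv; v++) {
+        uint16_t r = A->reg_map[v];
+        if (r == 0xFFFF) continue;                     /* spilled */
+        if (A->reg_file[v] == 1)
+            assert(r < RS_RELAY_V0);                   /* below v250 relays */
+        else
+            assert(r != RS_RELAY_S && r != RS_RELAY_S2);
+    }
+}
+#endif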
+
+void ra_ssa(amd_module_t *A, uint32_t mf_idx)
+{
+    mfunc_t *F = &A->mfuncs[mf_idx];
+    uint16_t nb = F->num_blocks;
+
+    if (nb > RS_MAX_BLK || A->vreg_count > RS_MAX_VR) {
+        /* Fallback: too large for the static pools. Test vreg_count
+         * itself -- clamping it first would make this check
+         * unfailable and silently truncate the allocation. */
+        fprintf(stderr, "  ra_ssa: %u blocks/%u vregs exceeds limits, fallback\n",
+                nb, (unsigned)A->vreg_count);
+        /* Linear scan needs phi elimination done first */
+        amdgpu_phi_elim(A);
+        /* ra_lin is static in emit.c — call through regalloc with flag */
+        amd_ra_ssa = 0;
+        amd_ra_lin = 1;
+        amdgpu_regalloc(A);
+        amd_ra_ssa = 1;
+        amd_ra_lin = 0;
+        return;
+    }
+
+    uint16_t nv = (uint16_t)A->vreg_count;
+
+    /* Phase 2: CFG + dominator tree */
+    rs_cfg(A, F);
+    rs_bld_rpo(nb);
+    rs_dom(nb);
+
+    /* Phase 3: loop nesting depth */
+    rs_loop(nb);
+
+    /* Phase 4: SSA liveness */
+    rs_live(A, F, nb, nv);
+
+    /* Phase 5: divergence-aware spill cost + remat */
+    rs_dcst(A, F, nb, nv);
+    rs_rdet(A, F, nb, nv);
+
+    /* Phase 6: SSA coloring */
+    uint32_t ns = rs_aloc(A, F, nb, nv);
+
+    if (ns > 0) {
+        /* Phase 7: divergence-aware spill codegen */
+        rs_spin(A, F, nb, nv);
+    }
+
+    /* Phase 8: post-RA phi elimination */
+    rs_phie(A, F, nb);
+
+    /* Write reg_map for rw_ops */
+    rs_wmap(A, nv);
+
+    /* Finalize: launch_bounds caps, minimum registers */
+    fin_regs(A, F);
+
+    /* Rewrite virtual→physical operands */
+    rw_ops(A, F);
+
+    /* Kill self-copies (same phys reg both sides) */
+    dce_copy(A, F);
+}
diff --git a/src/amdgpu/ra_ssa.h b/src/amdgpu/ra_ssa.h
new file mode 100644
index 0000000..51a02e1
--- /dev/null
+++ b/src/amdgpu/ra_ssa.h
@@ -0,0 +1,30 @@
+#ifndef BARRACUDA_RA_SSA_H
+#define BARRACUDA_RA_SSA_H
+
+#include "amdgpu.h"
+
+/*
+ * Divergence-aware SSA register allocator.
+ * Spills uniform VGPRs cheaply (readfirstlane, one 4-byte scratch
+ * slot) and prefers to keep divergent VGPRs in registers (spilling
+ * one costs wave_size x 4 = 256 bytes of scratch on Wave64).
+ * Operates on SSA form before phi elimination.
+ *
+ * References:
+ *   Sampaio et al. (2013) "Divergence Analysis", ACM TOPLAS 35(4)
+ *   Cooper, Harvey & Torczon (2001) "A Simple, Fast Dominance Algorithm"
+ *   Braun & Hack (2009) "Register Spilling and Live-Range Splitting for SSA-Form Programs"
+ */
+
+void ra_ssa(amd_module_t *A, uint32_t mf_idx);
+
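+/* Illustrative only (not called anywhere): the scratch-byte
+ * asymmetry behind the divergence-aware cost model. A uniform value
+ * is identical across lanes, so one lane's copy suffices; a
+ * divergent value needs all wave_size lanes preserved. */
+static inline uint32_t ra_ssa_spill_bytes(int divergent, uint32_t wave_size)
+{
+    return divergent ? wave_size * 4u : 4u;
+}
+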
+#endif /* BARRACUDA_RA_SSA_H */
diff --git a/src/main.c b/src/main.c
index a110859..b4439a5 100644
--- a/src/main.c
+++ b/src/main.c
@@ -80,6 +80,7 @@ static void usage(const char *prog)
            "  --gfx1030        Target RDNA 2 (gfx1030)\n"
            "  --gfx1200        Target RDNA 4 (gfx1200)\n"
            "  --no-graphcolor  Force linear scan register allocation\n"
+           "  --ssa-ra         Divergence-aware SSA register allocation\n"
            "  --max-vgprs N    Cap VGPR count for regalloc (forces spills)\n"
            "  --tensix         Compile to TT-Metalium C++ (Tensix SFPU)\n"
            "  -o               Output file (for --amdgpu-bin, --tensix)\n"
@@ -206,6 +207,8 @@ int main(int argc, char *argv[])
             no_sched = 1;
         else if (strcmp(argv[i], "--no-graphcolor") == 0)
             amd_ra_lin = 1;
+        else if (strcmp(argv[i], "--ssa-ra") == 0)
+            amd_ra_ssa = 1;
         else if (strcmp(argv[i], "--max-vgprs") == 0 && i + 1 < argc)
             amd_max_vgpr = atoi(argv[++i]);
         else if (strcmp(argv[i], "--snap") == 0)
diff --git a/src/runtime/bc_runtime.c b/src/runtime/bc_runtime.c
index 22cbdd0..a540af8 100644
--- a/src/runtime/bc_runtime.c
+++ b/src/runtime/bc_runtime.c
@@ -31,7 +31,7 @@ typedef int64_t hsa_signal_value_t;
 #define HSA_AMD_SEGMENT_GLOBAL 0
 #define HSA_AMD_MEMORY_POOL_INFO_SEGMENT 0
 #define HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS 1
-#define HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG 1
+#define HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG 8 /* XXX: hsa_ext_amd.h has KERNARG_INIT = 1; 8 is EXTENDED_SCOPE_FINE_GRAINED */
 #define HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE 2
 
 /* HSA queue — layout must match ABI exactly */
@@ -155,6 +155,8 @@ typedef hsa_status_t (*pfn_amd_iterate_pools_t)(
     hsa_agent_t, hsa_status_t(*)(hsa_amd_memory_pool_t, void*), void*);
 typedef hsa_status_t (*pfn_amd_pool_get_info_t)(
     hsa_amd_memory_pool_t, uint32_t, void*);
+typedef hsa_status_t (*pfn_amd_allow_access_t)(
+    uint32_t, const hsa_agent_t*, const uint32_t*, const void*);
 
 /* ---- Internal Device Structure ---- */
 
@@ -194,6 +196,7 @@ typedef struct {
     pfn_amd_pool_free_t amd_pfree;
     pfn_amd_iterate_pools_t amd_ipools;
     pfn_amd_pool_get_info_t amd_pinfo;
+    pfn_amd_allow_access_t amd_allow;
 
     /* Device state */
     hsa_agent_t gpu_agent;
@@ -282,7 +285,7 @@ static hsa_status_t find_kpool_cb(hsa_amd_memory_pool_t pool, void *data)
     if (segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;
     uint32_t flags = 0;
     D->amd_pinfo(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
-    if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG) {
+    if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE) {
         D->kernarg_pool = pool;
         return HSA_STATUS_INFO_BREAK;
     }
@@ -363,6 +366,7 @@ int bc_device_init(bc_device_t *dev)
     LOAD(amd_pfree, "hsa_amd_memory_pool_free");
     LOAD(amd_ipools, "hsa_amd_agent_iterate_memory_pools");
     LOAD(amd_pinfo, "hsa_amd_memory_pool_get_info");
+    LOAD(amd_allow, "hsa_amd_agents_allow_access");
 
     hsa_status_t st = D->hsa_init();
     if (st != HSA_STATUS_SUCCESS) {
@@ -645,6 +649,12 @@ int bc_dispatch(bc_device_t *dev, const bc_kernel_t *kern,
         return BC_RT_ERR_HSA;
     }
 
+    /* Grant the GPU agent access to the kernarg buffer. Without this
+     * the IOMMU never maps the pages and s_load_dword reads back
+     * zeros, silently. Documented only in hsa_ext_amd.h. Thanks AMD. */
+    if (D->amd_allow(1, &D->gpu_agent, NULL, kernarg_buf) != HSA_STATUS_SUCCESS)
+        return BC_RT_ERR_HSA;
+
     memset(kernarg_buf, 0, alloc_sz);
     memcpy(kernarg_buf, args, args_size);
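+
+    /* Sketch, not wired in: allocating the kernarg buffer from the
+     * pool that find_kpool_cb discovered, instead of the generic
+     * allocator. `amd_palloc` is hypothetical here -- a pfn loaded
+     * from "hsa_amd_memory_pool_allocate" alongside amd_pfree above;
+     * its hsa_ext_amd.h signature is (pool, size, flags, &ptr). */
+#if 0
+    void *kb = NULL;
+    if (D->amd_palloc(D->kernarg_pool, alloc_sz, 0, &kb) != HSA_STATUS_SUCCESS)
+        return BC_RT_ERR_HSA;
+    /* Access still has to be granted per-agent, same as above */
+    if (D->amd_allow(1, &D->gpu_agent, NULL, kb) != HSA_STATUS_SUCCESS) {
+        D->amd_pfree(kb);
+        return BC_RT_ERR_HSA;
+    }
+#endif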