From 3a52ef788a88a7d102adad0df58b0ea789015344 Mon Sep 17 00:00:00 2001
From: wenxuekun <wenxuekun@huawei.com>
Date: Tue, 23 Jun 2026 17:05:05 +0800
Subject: [PATCH 1/2] feat(ptodsl): implement simt_allreduce_sum for SIMT
 cross-workitem all-reduce
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the pto.simt_allreduce_sum frontend interface as designed in
mission/483/483_docs.md.  Pure Python MLIR IR emission with three
dispatch strategies: warp_reduce (<=32 threads, pow2), cross_warp_reduce
(>32, pow2), ub_reduce (fallback).  Supports f32 and f16.

- ptodsl/ptodsl/_allreduce.py: new — 674 lines
- ptodsl/ptodsl/pto.py: export simt_allreduce_sum (+3 lines)
- ptodsl/tests/test_allreduce.py: new — 533 lines, all passing

Co-Authored-By: Claude <noreply@anthropic.com>
---
 ptodsl/ptodsl/_allreduce.py    | 674 +++++++++++++++++++++++++++++++++
 ptodsl/ptodsl/pto.py           |   3 +
 ptodsl/tests/test_allreduce.py | 533 ++++++++++++++++++++++++++
 3 files changed, 1210 insertions(+)
 create mode 100644 ptodsl/ptodsl/_allreduce.py
 create mode 100644 ptodsl/tests/test_allreduce.py

diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
new file mode 100644
index 0000000000..cb0ce122ed
--- /dev/null
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -0,0 +1,674 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+"""
+SIMT cross-workitem all-reduce helpers.
+
+Implements ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``
+as PTO IR helper functions that are lazily emitted into the trace module.
+
+Public entry point: ``all_reduce(x, scratch, *, op, threads, scale, thread_offset)``,
+callable from within a ``@pto.simt`` context.
+
+Dispatch tree (mirrors the C++ compile-time dispatch in ``reduce.h``)::
+
+    threads <= scale                                    → identity
+    threads ≤ 32,  pow2(threads), pow2(scale)           → warp_reduce
+    threads ≤ 32                                        → ub_reduce
+    threads > 32, pow2(threads), scale ≤ 32, pow2(scale) → cross_warp_reduce
+    otherwise                                           → ub_reduce
+"""
+
+from __future__ import annotations
+
+from ._surface_values import unwrap_surface_value, wrap_surface_value
+from ._tracing.active import require_active_session
+from ._tracing.session import HelperFunctionSpec
+
+from mlir.dialects import arith, func, scf
+from mlir.dialects import pto as _pto
+from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType, Operation, UnitAttr
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# helpers
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _is_pow2(n: int) -> bool:
+    return n > 0 and (n & (n - 1)) == 0
+
+
+def _helper_name(dtype: str, threads: int, scale: int, thread_offset: int) -> str:
+    """Canonical helper symbol name for a specific all-reduce instance.
+
+    Example: ``__tl_allreduce_sum_f32_t128_s1_o0``.
+    """
+    return f"__tl_allreduce_sum_{dtype}_t{threads}_s{scale}_o{thread_offset}"
+
+
+def _dtype_to_str(mlir_type) -> str:
+    """Map an MLIR scalar type to a canonical dtype string."""
+    if mlir_type == F32Type.get():
+        return "f32"
+    if mlir_type == F16Type.get():
+        return "f16"
+    raise NotImplementedError(
+        f"all_reduce: unsupported dtype {mlir_type}"
+    )
+
+
+def _mlir_scalar_type(dtype: str):
+    """Map a canonical dtype string back to an MLIR scalar type."""
+    if dtype == "f32":
+        return F32Type.get()
+    if dtype == "f16":
+        return F16Type.get()
+    raise NotImplementedError(
+        f"all_reduce: unsupported dtype {dtype!r}"
+    )
+
+
+# ── compile-time parameter tables ──────────────────────────────────────────
+
+_IDENTITY = {
+    "f32": 0.0,
+    "f16": 0.0,
+}
+"""Identity element for sum reduction (0.0 for both f32 and f16)."""
+
+_REDUX_OP = _pto.ReduxAddOp
+"""Reduction operator (hardware redux_add)."""
+
+
+# ── scratch validation ────────────────────────────────────────────────────
+
+def _validate_scratch(scratch, expected_mlir_type, *, context: str):
+    """Verify *scratch* is a ``!pto.ptr<expected_mlir_type, ub>`` buffer."""
+    raw_scratch = unwrap_surface_value(scratch)
+    try:
+        ptr_type = _pto.PtrType(raw_scratch.type)
+    except Exception:
+        raise TypeError(
+            f"all_reduce {context}: scratch must be a !pto.ptr buffer, "
+            f"got {raw_scratch.type}"
+        ) from None
+    vec_attr = _pto.AddressSpaceAttr.get(_pto.AddressSpace.VEC)
+    if ptr_type.memory_space != vec_attr:
+        raise TypeError(
+            f"all_reduce {context}: scratch must be in UB memory space, "
+            f"got {ptr_type.memory_space}"
+        )
+    if ptr_type.element_type != expected_mlir_type:
+        raise TypeError(
+            f"all_reduce {context}: scratch element type mismatch: "
+            f"expected {expected_mlir_type}, got {ptr_type.element_type}"
+        )
+
+
+# ── shared helper-emission utility ─────────────────────────────────────────
+
+def _invoke_helper(helper_name, emit_fn, *surface_args):
+    """Look up or lazily create *helper_name*, then ``func.call`` it.
+
+    *emit_fn(helper_fn)* is called exactly once per trace session — on the
+    first invocation for this *helper_name*.
+    """
+    session = require_active_session("simt_allreduce_sum")
+    raw_args = [unwrap_surface_value(a) for a in surface_args]
+    arg_types = tuple(a.type for a in raw_args)
+
+    helper_spec = HelperFunctionSpec(
+        symbol_name=helper_name,
+        arg_types=arg_types,
+        result_types=(arg_types[0],),
+        attributes=(("pto.simt_entry", UnitAttr.get()),),
+    )
+    helper_fn, created = session.get_or_create_helper_function(helper_spec)
+    if created:
+        emit_fn(helper_fn)
+    call = func.CallOp(helper_fn, raw_args)
+    return wrap_surface_value(call.result)
+
+
+# ── reduction operator application ─────────────────────────────────────────
+
+def _emit_store(buffer, offset, value):
+    """Emit ``pto.store`` — accepts Ptr and any MemRef (including UB/VEC).
+
+    Unlike ``pto.store_scalar`` (which rejects VEC memrefs), ``pto.store``
+    uses ``PTO_BufferLikeType`` and survives the Ptr→MemRef type conversion
+    pass during lowering.
+    """
+    Operation.create(
+        "pto.store",
+        operands=[buffer, offset, value],
+    )
+
+
+def _emit_load(result_type, buffer, offset):
+    """Emit ``pto.load`` — accepts Ptr and any MemRef (including UB/VEC).
+
+    Counterpart to ``_emit_store``.  Returns the loaded SSA value.
+    """
+    return Operation.create(
+        "pto.load",
+        results=[result_type],
+        operands=[buffer, offset],
+    ).results[0]
+
+
+def _apply_sum(a, b):
+    """Emit ``a = a + b`` (float addition)."""
+    return arith.AddFOp(a, b).result
+
+
+def _emit_butterfly(v, *, threads: int, scale: int):
+    """Emit unrolled butterfly shuffle reduce.
+
+    Implements::
+
+        cur = threads
+        while cur > scale:
+            x = op(x, shfl_xor(x, cur/2))
+            cur /= 2
+
+    All loops are unrolled at emission time.  Caller must have set the
+    insertion point.
+    """
+    i32 = IntegerType.get_signless(32)
+    cur = threads
+    while cur > scale:
+        offset = cur // 2
+        c_offset = arith.ConstantOp(i32, offset).result
+        shfl = _pto.ShuffleBflyOp(v, c_offset).result
+        v = _apply_sum(v, shfl)
+        cur //= 2
+    return v
+
+
+def _emit_warp_hw_reduce(x, *, threads: int,
+                         lane_in_warp, c_identity, i32):
+    """Emit warp-level hardware reduce.
+
+    When *threads* == 32 ("groups" == 1): a single ``pto.redux_*``.
+    When *threads* < 32 ("groups" > 1): one ``pto.redux_*`` per group,
+    with identity masking for lanes outside the group.
+
+    Caller must have set the insertion point.
+    """
+    groups = 32 // threads
+
+    if groups == 1:
+        return _REDUX_OP(x).result
+
+    c_threads = arith.ConstantOp(i32, threads).result
+    my_group = arith.DivUIOp(lane_in_warp, c_threads).result
+
+    for g in range(groups):
+        c_g = arith.ConstantOp(i32, g).result
+        in_group = arith.CmpIOp(arith.CmpIPredicate.eq, my_group, c_g).result
+        masked = arith.SelectOp(in_group, x, c_identity).result
+        reduced = _REDUX_OP(masked).result
+        x = arith.SelectOp(in_group, reduced, x).result
+    return x
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# public API
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def simt_allreduce_sum(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the reduced sum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+    )
+
+
+def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
+                                threads, scale, thread_offset):
+    # ── parameter validation (before identity shortcut) ───────────────────
+    for name, val in (("threads", threads), ("scale", scale),
+                       ("thread_offset", thread_offset)):
+        if not isinstance(val, int):
+            raise ValueError(
+                f"all_reduce: '{name}' must be a Python int, "
+                f"got {type(val).__name__}"
+            )
+    if threads < 1:
+        raise ValueError(f"all_reduce: threads must be >= 1, got {threads}")
+    if scale < 1:
+        raise ValueError(f"all_reduce: scale must be >= 1, got {scale}")
+    if thread_offset < 0:
+        raise ValueError(
+            f"all_reduce: thread_offset must be >= 0, got {thread_offset}"
+        )
+    if threads % scale != 0:
+        raise ValueError(
+            f"all_reduce requires threads % scale == 0; "
+            f"got threads={threads}, scale={scale}"
+        )
+
+    # ── Path 0: identity ──────────────────────────────────────────────────
+    if threads <= scale:
+        return value
+
+    # ── dtype validation ─────────────────────────────────────────────────
+    raw_value = unwrap_surface_value(value)
+    dtype = _dtype_to_str(raw_value.type)
+    if dtype not in ("f32", "f16"):
+        raise NotImplementedError(
+            f"all_reduce only supports f32/f16, got {dtype}"
+        )
+
+    name = _helper_name(dtype, threads, scale, thread_offset)
+    args = dict(dtype=dtype, threads=threads, scale=scale,
+                thread_offset=thread_offset, scratch_offset=scratch_offset)
+
+    # ── Path 1: warp_reduce ───────────────────────────────────────────────
+    if threads <= 32 and _is_pow2(threads) and _is_pow2(scale):
+        return _invoke_helper(
+            name,
+            lambda hf: _emit_warp_reduce(hf, **args),
+            value,
+        )
+
+    # ── All paths below require a scratch buffer ──────────────────────────
+    if scratch is None:
+        raise ValueError(
+            f"all_reduce sum/{dtype}/t{threads}/s{scale}/o{thread_offset} "
+            "requires a UB scratch buffer"
+        )
+    _validate_scratch(
+        scratch, raw_value.type,
+        context=f"sum/{dtype}/t{threads}/s{scale}/o{thread_offset}",
+    )
+
+    # ── Path 2: ub_reduce (threads ≤ 32, non-pow2) ──────────────────────
+    if threads <= 32:
+        return _invoke_helper(
+            name,
+            lambda hf: _emit_ub_reduce(hf, **args),
+            value, scratch,
+        )
+
+    # ── Path 3: cross_warp_reduce ────────────────────────────────────────
+    if scale <= 32 and _is_pow2(threads) and _is_pow2(scale):
+        return _invoke_helper(
+            name,
+            lambda hf: _emit_cross_warp_reduce(hf, **args),
+            value, scratch,
+        )
+
+    # ── Path 4: ub_reduce fallback (threads > 32, anything else) ─────────
+    return _invoke_helper(
+        name,
+        lambda hf: _emit_ub_reduce(hf, **args),
+        value, scratch,
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: warp_reduce  (Path 1: threads ≤ 32, pow2, pow2 scale)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_warp_reduce(helper_fn, *,
+                      dtype, threads, scale, thread_offset,
+                      scratch_offset):
+    """Build the body of a single-warp all-reduce helper.
+
+    Dispatches to:
+
+    * ``warp_hw_reduce`` when ``extent >= 16`` and ``scale == 1``
+      (fast hardware redux, with group masking for threads < 32).
+    * ``butterfly`` otherwise (software shuffle via ``pto.shuffle_bfly``).
+    """
+    extent = threads // scale
+    scalar_t = _mlir_scalar_type(dtype)
+    identity_val = _IDENTITY[dtype]
+    i32 = IntegerType.get_signless(32)
+
+    entry = helper_fn.add_entry_block()
+    with InsertionPoint(entry):
+        x = entry.arguments[0]
+
+        c_offset = arith.ConstantOp(i32, thread_offset).result
+        c_identity = arith.ConstantOp(scalar_t, identity_val).result
+
+        if thread_offset:
+            # lane_in_warp = (tid_x - offset) & 31
+            tid_x = _pto.GetTidXOp().result
+            tx = arith.SubIOp(tid_x, c_offset).result
+            lane_in_warp = arith.AndIOp(tx, arith.ConstantOp(i32, 31).result).result
+        else:
+            lane_in_warp = _pto.GetLaneIdOp().result
+
+        if extent >= 16 and scale == 1:
+            result = _emit_warp_hw_reduce(
+                x, threads=threads,
+                lane_in_warp=lane_in_warp, c_identity=c_identity, i32=i32,
+            )
+        else:
+            result = _emit_butterfly(
+                x, threads=threads, scale=scale,
+            )
+
+        func.ReturnOp([result])
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: cross_warp_reduce  (Path 3: threads > 32)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_cross_warp_reduce(helper_fn, *,
+                            dtype, threads, scale, thread_offset,
+                            scratch_offset):
+    """Build the body of a cross-warp all-reduce helper.
+
+    Algorithm overview:
+
+    1. *num_warps* subgroups of 32 lanes each do a per-warp reduce.
+    2. Warp leaders (lid < scale) write → scratch[wid * scale + lid].
+    3. ``pto.syncthreads``.
+    4. Leader warp (lanes with ``tx < 32``) reduces the partial sums:
+       - scale == 1:  ``hw_reduce`` across leader warp.
+       - scale * num_warps ≤ 32:  ``butterfly<total, scale, 0>``.
+       - otherwise:  manual loop over warps.
+    5. Global leader (tx < scale) writes result → scratch[tx].
+    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
+    7. Extra ``pto.syncthreads`` to fence scratch reuse.
+    """
+    num_warps = threads // 32
+    scalar_t = _mlir_scalar_type(dtype)
+    identity_val = _IDENTITY[dtype]
+
+    i32 = IntegerType.get_signless(32)
+    idx_t = IndexType.get()
+
+    entry = helper_fn.add_entry_block()
+    with InsertionPoint(entry):
+        x = entry.arguments[0]
+        scratch = entry.arguments[1]
+
+        # ── constants ────────────────────────────────────────────────────
+        c0_i32 = arith.ConstantOp(i32, 0).result
+        c5_i32 = arith.ConstantOp(i32, 5).result
+        c31_i32 = arith.ConstantOp(i32, 31).result
+        c32_i32 = arith.ConstantOp(i32, 32).result
+        c_scale = arith.ConstantOp(i32, scale).result
+        c_num_warps = arith.ConstantOp(i32, num_warps).result
+        c_offset = arith.ConstantOp(i32, thread_offset).result
+        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
+        c_identity = arith.ConstantOp(scalar_t, identity_val).result
+
+        # ── thread indexing ──────────────────────────────────────────────
+        tid_x = _pto.GetTidXOp().result
+        if thread_offset:
+            tx = arith.SubIOp(tid_x, c_offset).result
+            wid = arith.ShRUIOp(tx, c5_i32).result
+            lid = arith.AndIOp(tx, c31_i32).result
+        else:
+            tx = tid_x
+            wid = arith.ShRUIOp(tx, c5_i32).result
+            lid = _pto.GetLaneIdOp().result
+
+        # ── Stage 1: per-warp reduce ─────────────────────────────────────
+        if scale == 1:
+            warp_val = _REDUX_OP(x).result
+        else:
+            warp_val = _emit_butterfly(
+                x, threads=32, scale=scale,
+            )
+
+        # ── Stage 2: warp leaders write partial results ──────────────────
+        is_writer = arith.CmpIOp(arith.CmpIPredicate.ult, lid, c_scale).result
+        write_if = scf.IfOp(is_writer, hasElse=False)
+        with InsertionPoint(write_if.then_block):
+            slot = arith.AddIOp(
+                arith.MulIOp(wid, c_scale).result, lid).result
+            slot_idx = arith.IndexCastOp(idx_t, slot).result
+            if scratch_offset:
+                slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
+            _emit_store(scratch, slot_idx, warp_val)
+            scf.YieldOp([])
+
+        # ── Stage 3: sync before reading partial results ─────────────────
+        _pto.SyncthreadsOp()
+
+        # ── Stage 4: leader warp reduces partial sums ────────────────────
+        is_leader_warp = arith.CmpIOp(
+            arith.CmpIPredicate.ult, tx, c32_i32).result
+        outer_if = scf.IfOp(is_leader_warp, [scalar_t], hasElse=True)
+
+        with InsertionPoint(outer_if.then_block):
+            if scale == 1:
+                # ── scale == 1: hw_reduce across leader warp ────────────
+                need_load = arith.CmpIOp(
+                    arith.CmpIPredicate.ult, lid, c_num_warps).result
+                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
+                with InsertionPoint(inner_if.then_block):
+                    lid_idx = arith.IndexCastOp(idx_t, lid).result
+                    tmp = _emit_load(scalar_t, scratch, lid_idx)
+                    scf.YieldOp([tmp])
+                with InsertionPoint(inner_if.else_block):
+                    scf.YieldOp([c_identity])
+                loaded = inner_if.results[0]
+                stage4_result = _REDUX_OP(loaded).result
+            elif scale * num_warps <= 32:
+                # ── scale > 1, fits in one warp: butterfly ──────────────
+                total = scale * num_warps
+                c_total = arith.ConstantOp(i32, total).result
+                need_load = arith.CmpIOp(
+                    arith.CmpIPredicate.ult, lid, c_total).result
+                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
+                with InsertionPoint(inner_if.then_block):
+                    lid_idx = arith.IndexCastOp(idx_t, lid).result
+                    if scratch_offset:
+                        lid_idx = arith.AddIOp(lid_idx, c_scratch_off).result
+                    tmp = _emit_load(scalar_t, scratch, lid_idx)
+                    scf.YieldOp([tmp])
+                with InsertionPoint(inner_if.else_block):
+                    scf.YieldOp([c_identity])
+                loaded = inner_if.results[0]
+                stage4_result = _emit_butterfly(
+                    loaded,
+                    threads=total, scale=scale,
+                )
+            else:
+                # ── manual loop: lid < scale lanes each reduce num_warps
+                is_reducer = arith.CmpIOp(
+                    arith.CmpIPredicate.ult, lid, c_scale).result
+                result = c_identity
+                my_slot = arith.RemUIOp(lid, c_scale).result
+                for w in range(num_warps):
+                    c_w = arith.ConstantOp(i32, w).result
+                    idx_val = arith.AddIOp(
+                        arith.MulIOp(c_w, c_scale).result, my_slot).result
+                    slot_idx = arith.IndexCastOp(idx_t, idx_val).result
+                    if scratch_offset:
+                        slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
+                    loaded_v = _emit_load(
+                        scalar_t, scratch, slot_idx)
+                    result = _apply_sum(result, loaded_v)
+                stage4_result = arith.SelectOp(
+                    is_reducer, result, c_identity).result
+
+            scf.YieldOp([stage4_result])
+
+        with InsertionPoint(outer_if.else_block):
+            scf.YieldOp([c_identity])
+
+        partial_reduced = outer_if.results[0]
+
+        # ── Stage 5: global leader writes result to scratch ──────────────
+        is_global_leader = arith.CmpIOp(
+            arith.CmpIPredicate.ult, tx, c_scale).result
+        write_result_if = scf.IfOp(is_global_leader, hasElse=False)
+        with InsertionPoint(write_result_if.then_block):
+            tx_idx = arith.IndexCastOp(idx_t, tx).result
+            if scratch_offset:
+                tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
+            _emit_store(scratch, tx_idx, partial_reduced)
+            scf.YieldOp([])
+
+        # ── Stage 6: sync + broadcast load scratch[tx % scale] ───────────
+        _pto.SyncthreadsOp()
+        my_slot = arith.RemUIOp(tx, c_scale).result
+        load_idx = arith.IndexCastOp(idx_t, my_slot).result
+        if scratch_offset:
+            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
+        result = _emit_load(scalar_t, scratch, load_idx)
+
+        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+        _pto.SyncthreadsOp()
+
+        func.ReturnOp([result])
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: ub_reduce  (Paths 2 & 4: fallback via UB scratch)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_ub_reduce(helper_fn, *,
+                    dtype, threads, scale, thread_offset,
+                    scratch_offset):
+    """Build the body of a UB-scratch all-reduce helper.
+
+    Algorithm:
+
+    1. Each lane writes x → scratch[tx].
+    2. ``pto.syncthreads``.
+    3. Lanes with ``lane % scale == 0`` sequentially reduce scratch slots.
+    4. ``pto.syncthreads``.
+    5. Global leader (lane % scale == 0, lane / scale == 0) writes back.
+    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
+    7. ``pto.syncthreads`` to fence scratch reuse.
+    """
+    scalar_t = _mlir_scalar_type(dtype)
+    i32 = IntegerType.get_signless(32)
+    idx_t = IndexType.get()
+
+    entry = helper_fn.add_entry_block()
+    with InsertionPoint(entry):
+        x = entry.arguments[0]
+        scratch = entry.arguments[1]
+
+        # ── constants ────────────────────────────────────────────────────
+        c0_i32 = arith.ConstantOp(i32, 0).result
+        c_threads = arith.ConstantOp(i32, threads).result
+        c_scale = arith.ConstantOp(i32, scale).result
+        c_offset = arith.ConstantOp(i32, thread_offset).result
+        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
+
+        # ── thread indexing ──────────────────────────────────────────────
+        tid_x = _pto.GetTidXOp().result
+        tx = arith.SubIOp(tid_x, c_offset).result if thread_offset else tid_x
+        group = arith.DivUIOp(tx, c_threads).result
+        lane = arith.RemUIOp(tx, c_threads).result
+        lane_mod = arith.RemUIOp(lane, c_scale).result
+
+        # ── Stage 1: each lane writes x → scratch[scratch_offset + tx] ──
+        tx_idx = arith.IndexCastOp(idx_t, tx).result
+        if scratch_offset:
+            tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
+        _emit_store(scratch, tx_idx, x)
+
+        # ── Stage 2: sync ────────────────────────────────────────────────
+        _pto.SyncthreadsOp()
+
+        # ── Stage 3: reducers sequentially combine ───────────────────────
+        # lane < scale gives exactly one reducer per residue class
+        is_reducer = arith.CmpIOp(
+            arith.CmpIPredicate.ult, lane, c_scale).result
+        reduce_if = scf.IfOp(is_reducer, [scalar_t], hasElse=True)
+
+        with InsertionPoint(reduce_if.then_block):
+            # initial: load scratch[scratch_offset + group * threads + lane]
+            group_offset = arith.MulIOp(group, c_threads).result
+            first_elem = arith.AddIOp(group_offset, lane).result
+            first_idx = arith.IndexCastOp(idx_t, first_elem).result
+            if scratch_offset:
+                first_idx = arith.AddIOp(first_idx, c_scratch_off).result
+            acc = _emit_load(scalar_t, scratch, first_idx)
+
+            # scf.for i = scale to threads step scale
+            lb = arith.ConstantOp(idx_t, scale).result
+            ub = arith.ConstantOp(idx_t, threads).result
+            step = arith.ConstantOp(idx_t, scale).result
+            for_op = scf.ForOp(lb, ub, step, [acc])
+            with InsertionPoint(for_op.body):
+                i = for_op.induction_variable
+                prev = for_op.inner_iter_args[0]
+                elem = arith.AddIOp(first_idx, i).result
+                loaded = _emit_load(
+                    scalar_t, scratch, elem)
+                new_acc = _apply_sum(prev, loaded)
+                scf.YieldOp([new_acc])
+            scf.YieldOp([for_op.results[0]])
+
+        with InsertionPoint(reduce_if.else_block):
+            scf.YieldOp([x])
+
+        flag = reduce_if.results[0]
+
+        # ── Stage 4: sync ────────────────────────────────────────────────
+        _pto.SyncthreadsOp()
+
+        # ── Stage 5: per-class leader writes reduced value ───────────────
+        # leader lanes 0..scale-1 each write their residue class result
+        is_leader = arith.CmpIOp(
+            arith.CmpIPredicate.ult, lane, c_scale).result
+        write_if = scf.IfOp(is_leader, hasElse=False)
+        with InsertionPoint(write_if.then_block):
+            dst_offset = arith.AddIOp(
+                arith.MulIOp(group, c_threads).result, lane).result
+            dst_idx = arith.IndexCastOp(idx_t, dst_offset).result
+            if scratch_offset:
+                dst_idx = arith.AddIOp(dst_idx, c_scratch_off).result
+            _emit_store(scratch, dst_idx, flag)
+            scf.YieldOp([])
+
+        # ── Stage 6: sync + broadcast scratch[scratch_offset + group*threads + tx%scale] ──
+        _pto.SyncthreadsOp()
+        my_slot = arith.AddIOp(
+            arith.MulIOp(group, c_threads).result,
+            arith.RemUIOp(tx, c_scale).result).result
+        load_idx = arith.IndexCastOp(idx_t, my_slot).result
+        if scratch_offset:
+            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
+        result = _emit_load(scalar_t, scratch, load_idx)
+
+        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+        _pto.SyncthreadsOp()
+
+        func.ReturnOp([result])
+
+
+__all__ = [
+    "simt_allreduce_sum",
+]
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index ef55490654..19cd93a91f 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -143,6 +143,9 @@
     LoopHandle, BranchHandle,
 )
 
+# ── All-reduce ─────────────────────────────────────────────────────────────────
+from ._allreduce import simt_allreduce_sum  # noqa: F401
+
 # ── Decorator ─────────────────────────────────────────────────────────────────
 from ._jit import jit, KernelHandle, merge_jit_modules      # noqa: F401
 from ._subkernels import cube, simd, simt     # noqa: F401
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
new file mode 100644
index 0000000000..1f6b964894
--- /dev/null
+++ b/ptodsl/tests/test_allreduce.py
@@ -0,0 +1,533 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from pathlib import Path
+import sys
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "ptodsl"))
+
+from ptodsl import pto
+
+
+def expect(condition: bool, message: str) -> None:
+    if not condition:
+        raise AssertionError(message)
+
+
+def main():
+    from ptodsl._allreduce import _helper_name, simt_allreduce_sum
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # helper name format
+    # ══════════════════════════════════════════════════════════════════════════
+    expect(
+        _helper_name("f32", 128, 1, 0) == "__tl_allreduce_sum_f32_t128_s1_o0",
+        "helper name format (sum/f32/t128/s1/o0)",
+    )
+    expect(
+        _helper_name("f16", 32, 2, 4) == "__tl_allreduce_sum_f16_t32_s2_o4",
+        "helper name format (f16/t32/s2/o4)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 0: identity (threads <= scale)
+    # ══════════════════════════════════════════════════════════════════════════
+    expect(
+        simt_allreduce_sum(1.0, threads=1, scale=1) == 1.0,
+        "identity: threads == scale",
+    )
+    expect(
+        simt_allreduce_sum(1.0, threads=2, scale=2) == 1.0,
+        "identity: threads == scale (alt)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # validation errors
+    # ══════════════════════════════════════════════════════════════════════════
+
+    # threads % scale != 0  (validation now runs before identity shortcut)
+    try:
+        simt_allreduce_sum(1.0, threads=3, scale=2)
+        raise AssertionError("expected ValueError for threads % scale != 0")
+    except ValueError:
+        pass
+
+
+    # threads < 1
+    try:
+        simt_allreduce_sum(1.0, threads=0, scale=1)
+        raise AssertionError("expected ValueError for threads < 1")
+    except ValueError:
+        pass
+
+    # validation runs before identity: bad params not bypassed by threads<=scale
+    try:
+        simt_allreduce_sum(1.0, threads=1, scale=2)
+        raise AssertionError("expected ValueError for threads%scale!=0 (before identity)")
+    except ValueError:
+        pass
+
+    # i32 dtype rejected — need a real JIT kernel so we get an MLIR i32 value
+    @pto.jit(target="a5")
+    def kernel_i32(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1, dtype=pto.i32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=1)
+
+    try:
+        kernel_i32.compile()
+        raise AssertionError("expected NotImplementedError for i32")
+    except NotImplementedError:
+        pass
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1a: warp_reduce — hardware redux, groups == 1 (threads=32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=1)
+
+    compiled_warp = kernel_warp.compile()
+    mlir_warp = compiled_warp.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t32_s1_o0" in mlir_warp,
+           "IR: warp_reduce helper name")
+    expect("pto.redux_add" in mlir_warp,
+           "IR: redux_add in warp_reduce helper")
+    expect("pto.syncthreads" not in mlir_warp,
+           "IR: warp_reduce has no syncthreads")
+    expect("pto.shuffle_bfly" not in mlir_warp,
+           "IR: warp_reduce (groups=1) has no shuffle_bfly")
+    compiled_warp.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1b: warp_reduce — hardware redux, groups > 1 (threads=16, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_t16(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=16, scale=1)
+
+    compiled_warp_t16 = kernel_warp_t16.compile()
+    mlir_warp_t16 = compiled_warp_t16.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o0" in mlir_warp_t16,
+           "IR: warp_reduce t=16 helper name")
+    expect("pto.redux_add" in mlir_warp_t16,
+           "IR: redux_add for groups>1")
+    expect("arith.select" in mlir_warp_t16,
+           "IR: arith.select for group masking")
+    expect("pto.syncthreads" not in mlir_warp_t16,
+           "IR: warp_reduce (groups=2) has no syncthreads")
+    compiled_warp_t16.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1c: warp_reduce — butterfly shuffle (threads=8, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_t8(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=8, scale=1)
+
+    compiled_warp_t8 = kernel_warp_t8.compile()
+    mlir_warp_t8 = compiled_warp_t8.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t8_s1_o0" in mlir_warp_t8,
+           "IR: warp_reduce t=8 butterfly helper name (sum)")
+    expect("pto.shuffle_bfly" in mlir_warp_t8,
+           "IR: shuffle_bfly for butterfly path")
+    expect("pto.redux_add" not in mlir_warp_t8,
+           "IR: butterfly has no hardware redux")
+    expect("pto.syncthreads" not in mlir_warp_t8,
+           "IR: butterfly has no syncthreads")
+    compiled_warp_t8.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1d: warp_reduce — butterfly with scale > 1 (threads=32, scale=2)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=2)
+
+    compiled_warp_s2 = kernel_warp_s2.compile()
+    mlir_warp_s2 = compiled_warp_s2.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t32_s2_o0" in mlir_warp_s2,
+           "IR: warp_reduce s=2 butterfly helper name (sum)")
+    expect("pto.shuffle_bfly" in mlir_warp_s2,
+           "IR: shuffle_bfly for butterfly (scale>1)")
+    expect("pto.redux_add" not in mlir_warp_s2,
+           "IR: butterfly (scale>1) has no hardware redux")
+    compiled_warp_s2.verify()
+
+    # ── warp_reduce: sum, f32, t=16, s=1, o=4 (non-zero thread_offset) ────────
+    @pto.jit(target="a5")
+    def kernel_warp_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=16, scale=1, thread_offset=4)
+
+    compiled_warp_o4 = kernel_warp_o4.compile()
+    mlir_warp_o4 = compiled_warp_o4.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o4" in mlir_warp_o4,
+           "IR: warp_reduce o=4 helper name")
+    expect("pto.get_tid_x" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses get_tid_x (not raw get_laneid)")
+    expect("arith.subi" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses subi for tx = tid_x - offset")
+    expect("arith.andi" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses andi to extract lane_in_warp")
+    compiled_warp_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 2: ub_reduce — threads ≤ 32, non-power-of-2 (threads=6, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_ub6(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1)
+
+    compiled_ub6 = kernel_ub6.compile()
+    mlir_ub6 = compiled_ub6.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o0" in mlir_ub6,
+           "IR: ub_reduce t=6 helper name")
+    expect("pto.syncthreads" in mlir_ub6,
+           "IR: ub_reduce has syncthreads")
+    expect("pto.store" in mlir_ub6,
+           "IR: ub_reduce has store (write to scratch)")
+    expect("pto.load" in mlir_ub6,
+           "IR: ub_reduce has load (read from scratch)")
+    syncthreads_count = mlir_ub6.count("pto.syncthreads")
+    expect(syncthreads_count == 4,
+           f"IR: ub_reduce has 4 syncthreads, got {syncthreads_count}")
+    compiled_ub6.verify()
+
+    # ── ub_reduce: sum, f32, t=6, s=2 (scale > 1, non-pow2 threads) ─────────
+    @pto.jit(target="a5")
+    def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=2)
+
+    compiled_ub6s2 = kernel_ub6s2.compile()
+    mlir_ub6s2 = compiled_ub6s2.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t6_s2_o0" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 helper name")
+    expect("pto.syncthreads" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has syncthreads")
+    expect("pto.store" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has store")
+    expect("pto.load" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has load")
+    expect("scf.for" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has scf.for (sequential reduce loop)")
+    expect("pto.redux_add" not in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has no hardware redux")
+    expect("pto.shuffle_bfly" not in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has no butterfly shuffle")
+    # scale>1 fixes: reducer uses lane < scale (ult), not lane_mod == 0
+    expect("arith.cmpi ult" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 reducer uses ult (lane < scale)")
+    compiled_ub6s2.verify()
+
+    # ── ub_reduce: sum, f32, t=6, s=1, o=4 (non-zero thread_offset) ─────────
+    @pto.jit(target="a5")
+    def kernel_ub_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1,
+                                     thread_offset=4)
+
+    compiled_ub_o4 = kernel_ub_o4.compile()
+    mlir_ub_o4 = compiled_ub_o4.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o4" in mlir_ub_o4,
+           "IR: ub_reduce o=4 helper name")
+    expect("arith.subi" in mlir_ub_o4,
+           "IR: ub_reduce o=4 uses subi for tx = tid_x - offset")
+    compiled_ub_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3a: cross_warp_reduce — sum, f32, t=128, s=1, o=0 (baseline)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled = kernel_128.compile()
+    mlir = compiled.mlir_text()
+
+    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
+           "IR: helper function definition")
+    expect("pto.simt_entry" in mlir,
+           "IR: helper carries pto.simt_entry")
+    expect("call @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
+           "IR: func.call to helper")
+
+    for op_name in (
+        "pto.redux_add", "pto.syncthreads", "pto.store", "pto.load",
+        "pto.get_tid_x", "pto.get_laneid", "arith.shrui", "scf.if",
+    ):
+        expect(op_name in mlir, f"IR: expected '{op_name}' in helper body")
+
+    syncthreads_count = mlir.count("pto.syncthreads")
+    expect(syncthreads_count == 3,
+           f"IR: expected 3 syncthreads, got {syncthreads_count}")
+
+    compiled.verify()
+
+    # ── cross_warp: sum, f32, t=64 (2 warps) ────────────────────────────────
+    @pto.jit(target="a5")
+    def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=64, scale=1)
+
+    compiled_64 = kernel_64.compile()
+    mlir_64 = compiled_64.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t64_s1_o0" in mlir_64,
+           "IR: helper for t=64")
+    compiled_64.verify()
+
+    # ── cross_warp: sum, f32, t=256 (8 warps) ───────────────────────────────
+    @pto.jit(target="a5")
+    def kernel_256(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=256, scale=1)
+
+    compiled_256 = kernel_256.compile()
+    mlir_256 = compiled_256.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t256_s1_o0" in mlir_256,
+           "IR: helper for t=256")
+    compiled_256.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3b: cross_warp_reduce — scale > 1, scale*num_warps ≤ 32
+    #           (threads=128, scale=2, num_warps=4, total=8 ≤ 32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_cw_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=2)
+
+    compiled_cw_s2 = kernel_cw_s2.compile()
+    mlir_cw_s2 = compiled_cw_s2.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t128_s2_o0" in mlir_cw_s2,
+           "IR: cross_warp s=2 helper name")
+    expect("pto.shuffle_bfly" in mlir_cw_s2,
+           "IR: cross_warp s=2 has shuffle_bfly (butterfly for per-warp + leader)")
+    expect("pto.syncthreads" in mlir_cw_s2,
+           "IR: cross_warp s=2 has syncthreads")
+    # scale > 1: per-warp uses butterfly, not hardware redux
+    compiled_cw_s2.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3c: cross_warp_reduce — scale > 1, scale*num_warps > 32 (manual, sum)
+    #           (threads=128, scale=16, num_warps=4, total=64 > 32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_cw_s16(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=16)
+
+    compiled_cw_s16 = kernel_cw_s16.compile()
+    mlir_cw_s16 = compiled_cw_s16.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t128_s16_o0" in mlir_cw_s16,
+           "IR: cross_warp s=16 manual helper name")
+    expect("pto.syncthreads" in mlir_cw_s16,
+           "IR: cross_warp s=16 has syncthreads")
+    compiled_cw_s16.verify()
+
+    # ── cross_warp: sum, f32, t=128, s=1, o=4 (non-zero thread_offset) ─────
+    @pto.jit(target="a5")
+    def kernel_cw_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1,
+                                     thread_offset=4)
+
+    compiled_cw_o4 = kernel_cw_o4.compile()
+    mlir_cw_o4 = compiled_cw_o4.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o4" in mlir_cw_o4,
+           "IR: cross_warp o=4 helper name")
+    expect("pto.get_tid_x" in mlir_cw_o4,
+           "IR: cross_warp o=4 uses get_tid_x")
+    expect("arith.subi" in mlir_cw_o4,
+           "IR: cross_warp o=4 uses subi for tx = tid_x - offset")
+    compiled_cw_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 4: ub_reduce fallback — threads > 32, non-power-of-2
+    #          (threads=48, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=48, scale=1)
+
+    compiled_ub48 = kernel_ub48.compile()
+    mlir_ub48 = compiled_ub48.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t48_s1_o0" in mlir_ub48,
+           "IR: ub_reduce fallback t=48 helper name")
+    expect("pto.syncthreads" in mlir_ub48,
+           "IR: ub_reduce fallback has syncthreads")
+    expect("pto.store" in mlir_ub48,
+           "IR: ub_reduce fallback has store")
+    expect("pto.load" in mlir_ub48,
+           "IR: ub_reduce fallback has load")
+    compiled_ub48.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # helper deduplication across multiple calls
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x1 = pto.const(1.0, dtype=pto.f32)
+            _r1 = pto.simt_allreduce_sum(x1, scratch=ub_scratch, threads=128, scale=1)
+            x2 = pto.const(2.0, dtype=pto.f32)
+            _r2 = pto.simt_allreduce_sum(x2, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled2 = kernel_reuse.compile()
+    mlir2 = compiled2.mlir_text()
+
+    definitions = mlir2.count("func.func @__tl_allreduce_sum_f32_t128_s1_o0")
+    expect(definitions == 1,
+           f"IR: helper defined {definitions} times, expected 1")
+    calls = mlir2.count("call @__tl_allreduce_sum_f32_t128_s1_o0")
+    expect(calls == 2, f"IR: expected 2 call sites, got {calls}")
+    compiled2.verify()
+
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # scratch required for ub_reduce and cross_warp paths
+    # ══════════════════════════════════════════════════════════════════════════
+
+    # cross_warp requires scratch — use a real JIT kernel so the error
+    # originates from _dispatch_allreduce_helper, not from a bare Python float.
+    @pto.jit(target="a5")
+    def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=128, scale=1)
+
+    try:
+        kernel_no_scratch_cw.compile()
+        raise AssertionError("expected ValueError for missing scratch (cross_warp)")
+    except ValueError as e:
+        expect("requires a UB scratch buffer" in str(e),
+               f"error message should mention scratch (cross_warp), got: {e}")
+
+    # ub_reduce (non-pow2) requires scratch
+    @pto.jit(target="a5")
+    def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=6, scale=1)
+
+    try:
+        kernel_no_scratch_ub.compile()
+        raise AssertionError("expected ValueError for missing scratch (ub_reduce)")
+    except ValueError as e:
+        expect("requires a UB scratch buffer" in str(e),
+               f"error message should mention scratch (ub_reduce), got: {e}")
+
+    # scratch must be a pto.ptr type
+    try:
+        simt_allreduce_sum(1.0, scratch="not_a_ptr", threads=6, scale=1)
+        raise AssertionError("expected TypeError for non-ptr scratch")
+    except (TypeError, AttributeError):
+        pass
+
+    # cross_warp: gm scratch (wrong memory space) should be rejected
+    @pto.jit(target="a5")
+    def kernel_gm_scratch(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=scratch_gm, threads=128, scale=1)
+
+    try:
+        kernel_gm_scratch.compile()
+        raise AssertionError("expected TypeError for gm scratch")
+    except TypeError as e:
+        expect("UB" in str(e).upper() or "memory space" in str(e).lower(),
+               f"gm scratch error should mention memory space, got: {e}")
+
+    # cross_warp: i32 scratch with f32 x (dtype mismatch) should be rejected
+    @pto.jit(target="a5")
+    def kernel_dtype_mismatch(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_i32 = pto.castptr(zero_u64, pto.ptr(pto.i32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_i32, threads=128, scale=1)
+
+    try:
+        kernel_dtype_mismatch.compile()
+        raise AssertionError("expected TypeError for dtype mismatch scratch")
+    except TypeError as e:
+        err = str(e)
+        expect("element type" in err.lower() or "mismatch" in err.lower(),
+               f"dtype mismatch should mention element type, got: {e}")
+
+    print("ptodsl_allreduce: PASS")
+
+
+if __name__ == "__main__":
+    main()

From a5ab39fa889e256aab2846e377f95d1924a66e0c Mon Sep 17 00:00:00 2001
From: kuri780 <185585386+kuri780@users.noreply.github.com>
Date: Tue, 30 Jun 2026 15:59:58 +0800
Subject: [PATCH 2/2] feat(ptodsl): inline allreduce implementation with
 sum/max/min reducers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor _allreduce.py from helper-function outline to inline emission,
add max/min reducer support, and create VPTO simulator test cases.

Core changes (_allreduce.py):
- Replace func.call outline with inline emission (_emit_inline)
- Add simt_allreduce_max and simt_allreduce_min APIs
- Add reducer dispatch tables (IDENTITY, COMBINE, REDUX)
- Convert control flow to PTODSL if_() context manager
- Convert ops to PTODSL wrappers (pto.const, scalar.*, redux_*, ...)
- Keep raw arith only for unsigned ops (DivUIOp, RemUIOp, ShRUIOp, ult)
- Auto-attach pto.simt_entry attribute for syncthreads verifier

Export (pto.py):
- Export simt_allreduce_sum, simt_allreduce_max, simt_allreduce_min

Tests (test_allreduce.py):
- Add IR structure tests for all 4 paths × 3 reducers
- Add ptoas lowering verification for warp paths
- Document bisheng stack-smashing bug on cross-warp scratch paths

VPTO simulator tests (test/vpto/cases/micro-op/simt/allreduce_*):
- 6 cases: warp_sum/max/min (32 lanes) + cross_sum/max/min (128 lanes)
- kernel.pto + launch.cpp + main.cpp + golden.py + compare.py
- All 6 cases verified on Ascend950PR_9599 simulator (DEVICE=SIM)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 ptodsl/ptodsl/_allreduce.py                   | 746 ++++++++++--------
 ptodsl/ptodsl/pto.py                          |   2 +-
 ptodsl/tests/test_allreduce.py                | 254 ++++--
 .../simt/allreduce_cross_max/compare.py       |  15 +
 .../simt/allreduce_cross_max/golden.py        |  22 +
 .../simt/allreduce_cross_max/kernel.pto       |  65 ++
 .../simt/allreduce_cross_max/launch.cpp       |  11 +
 .../simt/allreduce_cross_max/main.cpp         |  43 +
 .../simt/allreduce_cross_min/compare.py       |  15 +
 .../simt/allreduce_cross_min/golden.py        |  22 +
 .../simt/allreduce_cross_min/kernel.pto       |  65 ++
 .../simt/allreduce_cross_min/launch.cpp       |  11 +
 .../simt/allreduce_cross_min/main.cpp         |  43 +
 .../simt/allreduce_cross_sum/compare.py       |  15 +
 .../simt/allreduce_cross_sum/golden.py        |  22 +
 .../simt/allreduce_cross_sum/kernel.pto       |  65 ++
 .../simt/allreduce_cross_sum/launch.cpp       |  11 +
 .../simt/allreduce_cross_sum/main.cpp         |  43 +
 .../simt/allreduce_warp_max/compare.py        |  15 +
 .../simt/allreduce_warp_max/golden.py         |  22 +
 .../simt/allreduce_warp_max/kernel.pto        |  21 +
 .../simt/allreduce_warp_max/launch.cpp        |  11 +
 .../micro-op/simt/allreduce_warp_max/main.cpp |  43 +
 .../simt/allreduce_warp_min/compare.py        |  15 +
 .../simt/allreduce_warp_min/golden.py         |  22 +
 .../simt/allreduce_warp_min/kernel.pto        |  21 +
 .../simt/allreduce_warp_min/launch.cpp        |  11 +
 .../micro-op/simt/allreduce_warp_min/main.cpp |  43 +
 .../simt/allreduce_warp_sum/compare.py        |  15 +
 .../simt/allreduce_warp_sum/golden.py         |  22 +
 .../simt/allreduce_warp_sum/kernel.pto        |  21 +
 .../simt/allreduce_warp_sum/launch.cpp        |  11 +
 .../micro-op/simt/allreduce_warp_sum/main.cpp |  43 +
 33 files changed, 1405 insertions(+), 401 deletions(-)
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp

diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
index cb0ce122ed..e75a8d1ccb 100644
--- a/ptodsl/ptodsl/_allreduce.py
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -8,11 +8,9 @@
 """
 SIMT cross-workitem all-reduce helpers.
 
-Implements ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``
-as PTO IR helper functions that are lazily emitted into the trace module.
-
-Public entry point: ``all_reduce(x, scratch, *, op, threads, scale, thread_offset)``,
-callable from within a ``@pto.simt`` context.
+All-reduce ops are emitted **inline** at the current insertion point
+(no helper-function outline or ``func.call``).  Three reducer variants
+are exposed: ``simt_allreduce_sum``, ``simt_allreduce_max``, ``simt_allreduce_min``.
 
 Dispatch tree (mirrors the C++ compile-time dispatch in ``reduce.h``)::
 
@@ -25,29 +23,45 @@
 
 from __future__ import annotations
 
+from . import scalar
+from ._control_flow import if_
+from ._ops import const as _const, get_laneid, get_tid_x, redux_add, redux_max, redux_min, shuffle_bfly, syncthreads
 from ._surface_values import unwrap_surface_value, wrap_surface_value
-from ._tracing.active import require_active_session
-from ._tracing.session import HelperFunctionSpec
-
-from mlir.dialects import arith, func, scf
-from mlir.dialects import pto as _pto
-from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType, Operation, UnitAttr
+from ._tracing.active import current_session
+from ._types import float16 as _f16_dtype, float32 as _f32_dtype, index as _idx_dtype, int32 as _i32_dtype, _resolve
 
+from mlir.dialects import arith, pto as _pto, scf  # arith for unsigned ops; scf for ForOp in ub_reduce
+from mlir.ir import F16Type, F32Type, InsertionPoint, UnitAttr
 
-# ═══════════════════════════════════════════════════════════════════════════════
-# helpers
-# ═══════════════════════════════════════════════════════════════════════════════
 
 def _is_pow2(n: int) -> bool:
     return n > 0 and (n & (n - 1)) == 0
 
 
-def _helper_name(dtype: str, threads: int, scale: int, thread_offset: int) -> str:
-    """Canonical helper symbol name for a specific all-reduce instance.
+def _const_i32(value: int):
+    """Emit an i32 constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_i32_dtype)).value
+
+
+def _const_idx(value: int):
+    """Emit an index constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_idx_dtype)).value
 
-    Example: ``__tl_allreduce_sum_f32_t128_s1_o0``.
-    """
-    return f"__tl_allreduce_sum_{dtype}_t{threads}_s{scale}_o{thread_offset}"
+
+def _const_f32(value: float):
+    """Emit an f32 constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_f32_dtype)).value
+
+
+def _const_f16(value: float):
+    """Emit an f16 constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_f16_dtype)).value
+
+
+def _ult(a, b):
+    """Unsigned less-than comparison. Keeps raw arith because PTODSL __lt__
+    on signless i32 emits signed comparison (slt/cmpi)."""
+    return arith.CmpIOp(arith.CmpIPredicate.ult, a, b).result
 
 
 def _dtype_to_str(mlir_type) -> str:
@@ -61,29 +75,61 @@ def _dtype_to_str(mlir_type) -> str:
     )
 
 
-def _mlir_scalar_type(dtype: str):
-    """Map a canonical dtype string back to an MLIR scalar type."""
-    if dtype == "f32":
-        return F32Type.get()
-    if dtype == "f16":
-        return F16Type.get()
-    raise NotImplementedError(
-        f"all_reduce: unsupported dtype {dtype!r}"
-    )
+# ── reducer dispatch tables ──────────────────────────────────────────────────
+
+_REDUCER_IDENTITY = {
+    "sum": {"f32": 0.0, "f16": 0.0},
+    "max": {"f32": float("-inf"), "f16": float("-inf")},
+    "min": {"f32": float("inf"), "f16": float("inf")},
+}
+"""Identity element per reducer and dtype."""
+
+
+def _apply_sum(a, b):
+    """Emit ``a + b`` (float addition) via PTODSL operator."""
+    return (wrap_surface_value(a) + wrap_surface_value(b)).value
 
 
-# ── compile-time parameter tables ──────────────────────────────────────────
+def _apply_max(a, b):
+    """Emit ``max(a, b)`` via PTODSL ``scalar.max``."""
+    return scalar.max(a, b).value
 
-_IDENTITY = {
-    "f32": 0.0,
-    "f16": 0.0,
+
+def _apply_min(a, b):
+    """Emit ``min(a, b)`` via PTODSL ``scalar.min``."""
+    return scalar.min(a, b).value
+
+
+_REDUCER_COMBINE = {
+    "sum": _apply_sum,
+    "max": _apply_max,
+    "min": _apply_min,
 }
-"""Identity element for sum reduction (0.0 for both f32 and f16)."""
+"""Element-wise combine function per reducer."""
+
+
+def _redux_sum(x):
+    """Hardware lane-sum reduction, returns raw SSA value."""
+    return redux_add(x).value
+
 
-_REDUX_OP = _pto.ReduxAddOp
-"""Reduction operator (hardware redux_add)."""
+def _redux_max(x):
+    """Hardware lane-max reduction, returns raw SSA value."""
+    return redux_max(x).value
 
 
+def _redux_min(x):
+    """Hardware lane-min reduction, returns raw SSA value."""
+    return redux_min(x).value
+
+
+_REDUCER_REDUX = {
+    "sum": _redux_sum,
+    "max": _redux_max,
+    "min": _redux_min,
+}
+"""Hardware redux op per reducer."""
+
 # ── scratch validation ────────────────────────────────────────────────────
 
 def _validate_scratch(scratch, expected_mlir_type, *, context: str):
@@ -109,64 +155,49 @@ def _validate_scratch(scratch, expected_mlir_type, *, context: str):
         )
 
 
-# ── shared helper-emission utility ─────────────────────────────────────────
+# ── shared inline-emission utility ──────────────────────────────────────────
+
+def _emit_inline(emit_fn, *surface_args):
+    """Unwrap *surface_args* and call *emit_fn* at the current insertion point.
 
-def _invoke_helper(helper_name, emit_fn, *surface_args):
-    """Look up or lazily create *helper_name*, then ``func.call`` it.
+    The emitter receives raw MLIR values and returns a raw SSA result,
+    which this wrapper re-wraps as a surface value.
 
-    *emit_fn(helper_fn)* is called exactly once per trace session — on the
-    first invocation for this *helper_name*.
+    Inline SIMT allreduce emits ``pto.syncthreads``, which requires the
+    containing function to carry ``pto.simt_entry``.  We attach the attribute
+    here (idempotently) so that callers inside ``with pto.simt():`` do not
+    need to manage the attribute themselves.
     """
-    session = require_active_session("simt_allreduce_sum")
     raw_args = [unwrap_surface_value(a) for a in surface_args]
-    arg_types = tuple(a.type for a in raw_args)
+    result = emit_fn(*raw_args)
 
-    helper_spec = HelperFunctionSpec(
-        symbol_name=helper_name,
-        arg_types=arg_types,
-        result_types=(arg_types[0],),
-        attributes=(("pto.simt_entry", UnitAttr.get()),),
-    )
-    helper_fn, created = session.get_or_create_helper_function(helper_spec)
-    if created:
-        emit_fn(helper_fn)
-    call = func.CallOp(helper_fn, raw_args)
-    return wrap_surface_value(call.result)
+    # Ensure the enclosing function is marked as a SIMT entry so the
+    # syncthreads verifier passes.
+    session = current_session()
+    if session is not None:
+        parent_func = session.current_function
+        parent_func.attributes["pto.simt_entry"] = UnitAttr.get()
+
+    return wrap_surface_value(result)
 
 
 # ── reduction operator application ─────────────────────────────────────────
 
 def _emit_store(buffer, offset, value):
-    """Emit ``pto.store`` — accepts Ptr and any MemRef (including UB/VEC).
-
-    Unlike ``pto.store_scalar`` (which rejects VEC memrefs), ``pto.store``
-    uses ``PTO_BufferLikeType`` and survives the Ptr→MemRef type conversion
-    pass during lowering.
-    """
-    Operation.create(
-        "pto.store",
-        operands=[buffer, offset, value],
-    )
+    """Emit ``pto.store`` via PTODSL ``scalar.store``."""
+    scalar.store(value, buffer, offset)
 
 
 def _emit_load(result_type, buffer, offset):
-    """Emit ``pto.load`` — accepts Ptr and any MemRef (including UB/VEC).
+    """Emit ``pto.load`` via PTODSL ``scalar.load``.
 
-    Counterpart to ``_emit_store``.  Returns the loaded SSA value.
+    *result_type* is accepted for backward compatibility but ignored;
+    ``scalar.load`` infers the element type from the buffer.
     """
-    return Operation.create(
-        "pto.load",
-        results=[result_type],
-        operands=[buffer, offset],
-    ).results[0]
-
-
-def _apply_sum(a, b):
-    """Emit ``a = a + b`` (float addition)."""
-    return arith.AddFOp(a, b).result
+    return unwrap_surface_value(scalar.load(buffer, offset))
 
 
-def _emit_butterfly(v, *, threads: int, scale: int):
+def _emit_butterfly(v, *, threads: int, scale: int, reducer: str):
     """Emit unrolled butterfly shuffle reduce.
 
     Implements::
@@ -179,19 +210,19 @@ def _emit_butterfly(v, *, threads: int, scale: int):
     All loops are unrolled at emission time.  Caller must have set the
     insertion point.
     """
-    i32 = IntegerType.get_signless(32)
+    combine = _REDUCER_COMBINE[reducer]
     cur = threads
     while cur > scale:
         offset = cur // 2
-        c_offset = arith.ConstantOp(i32, offset).result
-        shfl = _pto.ShuffleBflyOp(v, c_offset).result
-        v = _apply_sum(v, shfl)
+        mask = _const_i32(offset)
+        shfl = shuffle_bfly(v, mask).value
+        v = combine(v, shfl)
         cur //= 2
     return v
 
 
 def _emit_warp_hw_reduce(x, *, threads: int,
-                         lane_in_warp, c_identity, i32):
+                         lane_in_warp, c_identity, reducer: str):
     """Emit warp-level hardware reduce.
 
     When *threads* == 32 ("groups" == 1): a single ``pto.redux_*``.
@@ -200,20 +231,21 @@ def _emit_warp_hw_reduce(x, *, threads: int,
 
     Caller must have set the insertion point.
     """
+    redux_fn = _REDUCER_REDUX[reducer]
     groups = 32 // threads
 
     if groups == 1:
-        return _REDUX_OP(x).result
+        return redux_fn(x)
 
-    c_threads = arith.ConstantOp(i32, threads).result
-    my_group = arith.DivUIOp(lane_in_warp, c_threads).result
+    c_threads = _const_i32(threads)
+    my_group = arith.DivUIOp(lane_in_warp, c_threads).result  # unsigned div — no PTODSL equivalent
 
     for g in range(groups):
-        c_g = arith.ConstantOp(i32, g).result
-        in_group = arith.CmpIOp(arith.CmpIPredicate.eq, my_group, c_g).result
-        masked = arith.SelectOp(in_group, x, c_identity).result
-        reduced = _REDUX_OP(masked).result
-        x = arith.SelectOp(in_group, reduced, x).result
+        c_g = _const_i32(g)
+        in_group = (wrap_surface_value(my_group) == wrap_surface_value(c_g)).value
+        masked = scalar.select(in_group, x, c_identity).value
+        reduced = redux_fn(masked)
+        x = scalar.select(in_group, reduced, x).value
     return x
 
 
@@ -247,11 +279,72 @@ def simt_allreduce_sum(value, *,
     return _dispatch_allreduce_helper(
         value, scratch=scratch, scratch_offset=scratch_offset,
         threads=threads, scale=scale, thread_offset=thread_offset,
+        reducer="sum",
+    )
+
+
+def simt_allreduce_max(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce **max** for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<MaxOp, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the element-wise maximum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+        reducer="max",
+    )
+
+
+def simt_allreduce_min(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce **min** for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<MinOp, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the element-wise minimum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+        reducer="min",
     )
 
 
 def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
-                                threads, scale, thread_offset):
+                                threads, scale, thread_offset, reducer):
     # ── parameter validation (before identity shortcut) ───────────────────
     for name, val in (("threads", threads), ("scale", scale),
                        ("thread_offset", thread_offset)):
@@ -286,49 +379,45 @@ def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
             f"all_reduce only supports f32/f16, got {dtype}"
         )
 
-    name = _helper_name(dtype, threads, scale, thread_offset)
     args = dict(dtype=dtype, threads=threads, scale=scale,
-                thread_offset=thread_offset, scratch_offset=scratch_offset)
+                thread_offset=thread_offset, scratch_offset=scratch_offset,
+                reducer=reducer)
 
     # ── Path 1: warp_reduce ───────────────────────────────────────────────
     if threads <= 32 and _is_pow2(threads) and _is_pow2(scale):
-        return _invoke_helper(
-            name,
-            lambda hf: _emit_warp_reduce(hf, **args),
+        return _emit_inline(
+            lambda x: _emit_warp_reduce(x, **args),
             value,
         )
 
     # ── All paths below require a scratch buffer ──────────────────────────
     if scratch is None:
         raise ValueError(
-            f"all_reduce sum/{dtype}/t{threads}/s{scale}/o{thread_offset} "
+            f"all_reduce {reducer}/{dtype}/t{threads}/s{scale}/o{thread_offset} "
             "requires a UB scratch buffer"
         )
     _validate_scratch(
         scratch, raw_value.type,
-        context=f"sum/{dtype}/t{threads}/s{scale}/o{thread_offset}",
+        context=f"{reducer}/{dtype}/t{threads}/s{scale}/o{thread_offset}",
     )
 
     # ── Path 2: ub_reduce (threads ≤ 32, non-pow2) ──────────────────────
     if threads <= 32:
-        return _invoke_helper(
-            name,
-            lambda hf: _emit_ub_reduce(hf, **args),
+        return _emit_inline(
+            lambda x, s: _emit_ub_reduce(x, s, **args),
             value, scratch,
         )
 
     # ── Path 3: cross_warp_reduce ────────────────────────────────────────
     if scale <= 32 and _is_pow2(threads) and _is_pow2(scale):
-        return _invoke_helper(
-            name,
-            lambda hf: _emit_cross_warp_reduce(hf, **args),
+        return _emit_inline(
+            lambda x, s: _emit_cross_warp_reduce(x, s, **args),
             value, scratch,
         )
 
     # ── Path 4: ub_reduce fallback (threads > 32, anything else) ─────────
-    return _invoke_helper(
-        name,
-        lambda hf: _emit_ub_reduce(hf, **args),
+    return _emit_inline(
+        lambda x, s: _emit_ub_reduce(x, s, **args),
         value, scratch,
     )
 
@@ -337,10 +426,10 @@ def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
 # emitter: warp_reduce  (Path 1: threads ≤ 32, pow2, pow2 scale)
 # ═══════════════════════════════════════════════════════════════════════════════
 
-def _emit_warp_reduce(helper_fn, *,
+def _emit_warp_reduce(x, *,
                       dtype, threads, scale, thread_offset,
-                      scratch_offset):
-    """Build the body of a single-warp all-reduce helper.
+                      scratch_offset, reducer):
+    """Emit inline single-warp all-reduce at the current insertion point.
 
     Dispatches to:
 
@@ -349,46 +438,39 @@ def _emit_warp_reduce(helper_fn, *,
     * ``butterfly`` otherwise (software shuffle via ``pto.shuffle_bfly``).
     """
     extent = threads // scale
-    scalar_t = _mlir_scalar_type(dtype)
-    identity_val = _IDENTITY[dtype]
-    i32 = IntegerType.get_signless(32)
-
-    entry = helper_fn.add_entry_block()
-    with InsertionPoint(entry):
-        x = entry.arguments[0]
-
-        c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_identity = arith.ConstantOp(scalar_t, identity_val).result
-
-        if thread_offset:
-            # lane_in_warp = (tid_x - offset) & 31
-            tid_x = _pto.GetTidXOp().result
-            tx = arith.SubIOp(tid_x, c_offset).result
-            lane_in_warp = arith.AndIOp(tx, arith.ConstantOp(i32, 31).result).result
-        else:
-            lane_in_warp = _pto.GetLaneIdOp().result
-
-        if extent >= 16 and scale == 1:
-            result = _emit_warp_hw_reduce(
-                x, threads=threads,
-                lane_in_warp=lane_in_warp, c_identity=c_identity, i32=i32,
-            )
-        else:
-            result = _emit_butterfly(
-                x, threads=threads, scale=scale,
-            )
-
-        func.ReturnOp([result])
+    identity_val = _REDUCER_IDENTITY[reducer][dtype]
+    const_f = _const_f32 if dtype == "f32" else _const_f16
+
+    c_offset = _const_i32(thread_offset)
+    c_identity = const_f(identity_val)
+
+    if thread_offset:
+        # lane_in_warp = (tid_x - offset) & 31
+        tid_x = get_tid_x().value
+        tx = (wrap_surface_value(tid_x) - wrap_surface_value(c_offset)).value
+        lane_in_warp = (wrap_surface_value(tx) & _const_i32(31)).value
+    else:
+        lane_in_warp = get_laneid().value
+
+    if extent >= 16 and scale == 1:
+        return _emit_warp_hw_reduce(
+            x, threads=threads,
+            lane_in_warp=lane_in_warp, c_identity=c_identity, reducer=reducer,
+        )
+    else:
+        return _emit_butterfly(
+            x, threads=threads, scale=scale, reducer=reducer,
+        )
 
 
 # ═══════════════════════════════════════════════════════════════════════════════
 # emitter: cross_warp_reduce  (Path 3: threads > 32)
 # ═══════════════════════════════════════════════════════════════════════════════
 
-def _emit_cross_warp_reduce(helper_fn, *,
+def _emit_cross_warp_reduce(x, scratch, *,
                             dtype, threads, scale, thread_offset,
-                            scratch_offset):
-    """Build the body of a cross-warp all-reduce helper.
+                            scratch_offset, reducer):
+    """Emit inline cross-warp all-reduce at the current insertion point.
 
     Algorithm overview:
 
@@ -404,160 +486,141 @@ def _emit_cross_warp_reduce(helper_fn, *,
     7. Extra ``pto.syncthreads`` to fence scratch reuse.
     """
     num_warps = threads // 32
-    scalar_t = _mlir_scalar_type(dtype)
-    identity_val = _IDENTITY[dtype]
-
-    i32 = IntegerType.get_signless(32)
-    idx_t = IndexType.get()
-
-    entry = helper_fn.add_entry_block()
-    with InsertionPoint(entry):
-        x = entry.arguments[0]
-        scratch = entry.arguments[1]
-
-        # ── constants ────────────────────────────────────────────────────
-        c0_i32 = arith.ConstantOp(i32, 0).result
-        c5_i32 = arith.ConstantOp(i32, 5).result
-        c31_i32 = arith.ConstantOp(i32, 31).result
-        c32_i32 = arith.ConstantOp(i32, 32).result
-        c_scale = arith.ConstantOp(i32, scale).result
-        c_num_warps = arith.ConstantOp(i32, num_warps).result
-        c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
-        c_identity = arith.ConstantOp(scalar_t, identity_val).result
-
-        # ── thread indexing ──────────────────────────────────────────────
-        tid_x = _pto.GetTidXOp().result
-        if thread_offset:
-            tx = arith.SubIOp(tid_x, c_offset).result
-            wid = arith.ShRUIOp(tx, c5_i32).result
-            lid = arith.AndIOp(tx, c31_i32).result
-        else:
-            tx = tid_x
-            wid = arith.ShRUIOp(tx, c5_i32).result
-            lid = _pto.GetLaneIdOp().result
-
-        # ── Stage 1: per-warp reduce ─────────────────────────────────────
-        if scale == 1:
-            warp_val = _REDUX_OP(x).result
-        else:
-            warp_val = _emit_butterfly(
-                x, threads=32, scale=scale,
-            )
+    identity_val = _REDUCER_IDENTITY[reducer][dtype]
+    const_f = _const_f32 if dtype == "f32" else _const_f16
+    combine = _REDUCER_COMBINE[reducer]
+    redux_fn = _REDUCER_REDUX[reducer]
+
+    # ── constants ────────────────────────────────────────────────────
+    c5_i32 = _const_i32(5)
+    c31_i32 = _const_i32(31)
+    c32_i32 = _const_i32(32)
+    c_scale = _const_i32(scale)
+    c_num_warps = _const_i32(num_warps)
+    c_offset = _const_i32(thread_offset)
+    c_scratch_off = _const_idx(scratch_offset)
+    c_identity = const_f(identity_val)
+
+    # ── thread indexing ──────────────────────────────────────────────
+    tid_x = get_tid_x().value
+    if thread_offset:
+        tx = (wrap_surface_value(tid_x) - wrap_surface_value(c_offset)).value
+        wid = arith.ShRUIOp(tx, c5_i32).result  # unsigned shift — no PTODSL equivalent
+        lid = (wrap_surface_value(tx) & c31_i32).value
+    else:
+        tx = tid_x
+        wid = arith.ShRUIOp(tx, c5_i32).result
+        lid = get_laneid().value
+
+    # ── Stage 1: per-warp reduce ─────────────────────────────────────
+    if scale == 1:
+        warp_val = redux_fn(x)
+    else:
+        warp_val = _emit_butterfly(
+            x, threads=32, scale=scale, reducer=reducer,
+        )
 
-        # ── Stage 2: warp leaders write partial results ──────────────────
-        is_writer = arith.CmpIOp(arith.CmpIPredicate.ult, lid, c_scale).result
-        write_if = scf.IfOp(is_writer, hasElse=False)
-        with InsertionPoint(write_if.then_block):
-            slot = arith.AddIOp(
-                arith.MulIOp(wid, c_scale).result, lid).result
-            slot_idx = arith.IndexCastOp(idx_t, slot).result
+    # ── Stage 2: warp leaders write partial results ──────────────────
+    is_writer = _ult(lid, c_scale)
+    with if_(is_writer) as br:
+        with br.then_:
+            slot = (wrap_surface_value(wid) * wrap_surface_value(c_scale) + wrap_surface_value(lid)).value
+            slot_idx = scalar.index_cast(slot).value
             if scratch_offset:
-                slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
+                slot_idx = (wrap_surface_value(slot_idx) + wrap_surface_value(c_scratch_off)).value
             _emit_store(scratch, slot_idx, warp_val)
-            scf.YieldOp([])
-
-        # ── Stage 3: sync before reading partial results ─────────────────
-        _pto.SyncthreadsOp()
 
-        # ── Stage 4: leader warp reduces partial sums ────────────────────
-        is_leader_warp = arith.CmpIOp(
-            arith.CmpIPredicate.ult, tx, c32_i32).result
-        outer_if = scf.IfOp(is_leader_warp, [scalar_t], hasElse=True)
+    # ── Stage 3: sync before reading partial results ─────────────────
+    syncthreads()
 
-        with InsertionPoint(outer_if.then_block):
+    # ── Stage 4: leader warp reduces partial sums ────────────────────
+    is_leader_warp = _ult(tx, c32_i32)
+    with if_(is_leader_warp) as br:
+        with br.then_:
             if scale == 1:
-                # ── scale == 1: hw_reduce across leader warp ────────────
-                need_load = arith.CmpIOp(
-                    arith.CmpIPredicate.ult, lid, c_num_warps).result
-                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
-                with InsertionPoint(inner_if.then_block):
-                    lid_idx = arith.IndexCastOp(idx_t, lid).result
-                    tmp = _emit_load(scalar_t, scratch, lid_idx)
-                    scf.YieldOp([tmp])
-                with InsertionPoint(inner_if.else_block):
-                    scf.YieldOp([c_identity])
-                loaded = inner_if.results[0]
-                stage4_result = _REDUX_OP(loaded).result
+                # ── scale == 1: hw_reduce across leader warp ────────
+                need_load = _ult(lid, c_num_warps)
+                with if_(need_load) as inner_br:
+                    with inner_br.then_:
+                        lid_idx = scalar.index_cast(lid).value
+                        tmp = _emit_load(None, scratch, lid_idx)
+                        inner_br.assign(loaded=tmp)
+                    with inner_br.else_:
+                        inner_br.assign(loaded=c_identity)
+                loaded = inner_br.loaded
+                stage4_result = redux_fn(loaded)
             elif scale * num_warps <= 32:
-                # ── scale > 1, fits in one warp: butterfly ──────────────
+                # ── scale > 1, fits in one warp: butterfly ──────────
                 total = scale * num_warps
-                c_total = arith.ConstantOp(i32, total).result
-                need_load = arith.CmpIOp(
-                    arith.CmpIPredicate.ult, lid, c_total).result
-                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
-                with InsertionPoint(inner_if.then_block):
-                    lid_idx = arith.IndexCastOp(idx_t, lid).result
-                    if scratch_offset:
-                        lid_idx = arith.AddIOp(lid_idx, c_scratch_off).result
-                    tmp = _emit_load(scalar_t, scratch, lid_idx)
-                    scf.YieldOp([tmp])
-                with InsertionPoint(inner_if.else_block):
-                    scf.YieldOp([c_identity])
-                loaded = inner_if.results[0]
+                c_total = _const_i32(total)
+                need_load = _ult(lid, c_total)
+                with if_(need_load) as inner_br:
+                    with inner_br.then_:
+                        lid_idx = scalar.index_cast(lid).value
+                        if scratch_offset:
+                            lid_idx = (wrap_surface_value(lid_idx) + wrap_surface_value(c_scratch_off)).value
+                        tmp = _emit_load(None, scratch, lid_idx)
+                        inner_br.assign(loaded=tmp)
+                    with inner_br.else_:
+                        inner_br.assign(loaded=c_identity)
+                loaded = inner_br.loaded
                 stage4_result = _emit_butterfly(
                     loaded,
-                    threads=total, scale=scale,
+                    threads=total, scale=scale, reducer=reducer,
                 )
             else:
                 # ── manual loop: lid < scale lanes each reduce num_warps
-                is_reducer = arith.CmpIOp(
-                    arith.CmpIPredicate.ult, lid, c_scale).result
-                result = c_identity
-                my_slot = arith.RemUIOp(lid, c_scale).result
+                is_reducer = _ult(lid, c_scale)
+                reduced = c_identity
+                my_slot = arith.RemUIOp(lid, c_scale).result  # unsigned rem
                 for w in range(num_warps):
-                    c_w = arith.ConstantOp(i32, w).result
-                    idx_val = arith.AddIOp(
-                        arith.MulIOp(c_w, c_scale).result, my_slot).result
-                    slot_idx = arith.IndexCastOp(idx_t, idx_val).result
+                    c_w = _const_i32(w)
+                    idx_val = (wrap_surface_value(c_w) * wrap_surface_value(c_scale) + wrap_surface_value(my_slot)).value
+                    slot_idx = scalar.index_cast(idx_val).value
                     if scratch_offset:
-                        slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
-                    loaded_v = _emit_load(
-                        scalar_t, scratch, slot_idx)
-                    result = _apply_sum(result, loaded_v)
-                stage4_result = arith.SelectOp(
-                    is_reducer, result, c_identity).result
-
-            scf.YieldOp([stage4_result])
-
-        with InsertionPoint(outer_if.else_block):
-            scf.YieldOp([c_identity])
-
-        partial_reduced = outer_if.results[0]
-
-        # ── Stage 5: global leader writes result to scratch ──────────────
-        is_global_leader = arith.CmpIOp(
-            arith.CmpIPredicate.ult, tx, c_scale).result
-        write_result_if = scf.IfOp(is_global_leader, hasElse=False)
-        with InsertionPoint(write_result_if.then_block):
-            tx_idx = arith.IndexCastOp(idx_t, tx).result
+                        slot_idx = (wrap_surface_value(slot_idx) + wrap_surface_value(c_scratch_off)).value
+                    loaded_v = _emit_load(None, scratch, slot_idx)
+                    reduced = combine(reduced, loaded_v)
+                stage4_result = scalar.select(
+                    is_reducer, reduced, c_identity).value
+
+            br.assign(stage4_result=stage4_result)
+        with br.else_:
+            br.assign(stage4_result=c_identity)
+
+    partial_reduced = unwrap_surface_value(br.stage4_result)
+
+    # ── Stage 5: global leader writes result to scratch ──────────────
+    is_global_leader = _ult(tx, c_scale)
+    with if_(is_global_leader) as br5:
+        with br5.then_:
+            tx_idx = scalar.index_cast(tx).value
             if scratch_offset:
-                tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
+                tx_idx = (wrap_surface_value(tx_idx) + wrap_surface_value(c_scratch_off)).value
             _emit_store(scratch, tx_idx, partial_reduced)
-            scf.YieldOp([])
 
-        # ── Stage 6: sync + broadcast load scratch[tx % scale] ───────────
-        _pto.SyncthreadsOp()
-        my_slot = arith.RemUIOp(tx, c_scale).result
-        load_idx = arith.IndexCastOp(idx_t, my_slot).result
-        if scratch_offset:
-            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
-        result = _emit_load(scalar_t, scratch, load_idx)
+    # ── Stage 6: sync + broadcast load scratch[tx % scale] ───────────
+    syncthreads()
+    my_slot = arith.RemUIOp(tx, c_scale).result  # unsigned rem
+    load_idx = scalar.index_cast(my_slot).value
+    if scratch_offset:
+        load_idx = (wrap_surface_value(load_idx) + wrap_surface_value(c_scratch_off)).value
+    result = _emit_load(None, scratch, load_idx)
 
-        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
-        _pto.SyncthreadsOp()
+    # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+    syncthreads()
 
-        func.ReturnOp([result])
+    return result
 
 
 # ═══════════════════════════════════════════════════════════════════════════════
 # emitter: ub_reduce  (Paths 2 & 4: fallback via UB scratch)
 # ═══════════════════════════════════════════════════════════════════════════════
 
-def _emit_ub_reduce(helper_fn, *,
+def _emit_ub_reduce(x, scratch, *,
                     dtype, threads, scale, thread_offset,
-                    scratch_offset):
-    """Build the body of a UB-scratch all-reduce helper.
+                    scratch_offset, reducer):
+    """Emit inline UB-scratch all-reduce at the current insertion point.
 
     Algorithm:
 
@@ -569,106 +632,91 @@ def _emit_ub_reduce(helper_fn, *,
     6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
     7. ``pto.syncthreads`` to fence scratch reuse.
     """
-    scalar_t = _mlir_scalar_type(dtype)
-    i32 = IntegerType.get_signless(32)
-    idx_t = IndexType.get()
-
-    entry = helper_fn.add_entry_block()
-    with InsertionPoint(entry):
-        x = entry.arguments[0]
-        scratch = entry.arguments[1]
-
-        # ── constants ────────────────────────────────────────────────────
-        c0_i32 = arith.ConstantOp(i32, 0).result
-        c_threads = arith.ConstantOp(i32, threads).result
-        c_scale = arith.ConstantOp(i32, scale).result
-        c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
-
-        # ── thread indexing ──────────────────────────────────────────────
-        tid_x = _pto.GetTidXOp().result
-        tx = arith.SubIOp(tid_x, c_offset).result if thread_offset else tid_x
-        group = arith.DivUIOp(tx, c_threads).result
-        lane = arith.RemUIOp(tx, c_threads).result
-        lane_mod = arith.RemUIOp(lane, c_scale).result
-
-        # ── Stage 1: each lane writes x → scratch[scratch_offset + tx] ──
-        tx_idx = arith.IndexCastOp(idx_t, tx).result
-        if scratch_offset:
-            tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
-        _emit_store(scratch, tx_idx, x)
-
-        # ── Stage 2: sync ────────────────────────────────────────────────
-        _pto.SyncthreadsOp()
-
-        # ── Stage 3: reducers sequentially combine ───────────────────────
-        # lane < scale gives exactly one reducer per residue class
-        is_reducer = arith.CmpIOp(
-            arith.CmpIPredicate.ult, lane, c_scale).result
-        reduce_if = scf.IfOp(is_reducer, [scalar_t], hasElse=True)
-
-        with InsertionPoint(reduce_if.then_block):
+    combine = _REDUCER_COMBINE[reducer]
+
+    # ── constants ────────────────────────────────────────────────────
+    c_threads = _const_i32(threads)
+    c_scale = _const_i32(scale)
+    c_offset = _const_i32(thread_offset)
+    c_scratch_off = _const_idx(scratch_offset)
+
+    # ── thread indexing ──────────────────────────────────────────────
+    tid_x = get_tid_x().value
+    tx = (wrap_surface_value(tid_x) - wrap_surface_value(c_offset)).value if thread_offset else tid_x
+    group = arith.DivUIOp(tx, c_threads).result  # unsigned div
+    lane = arith.RemUIOp(tx, c_threads).result    # unsigned rem
+
+    # ── Stage 1: each lane writes x → scratch[scratch_offset + tx] ──
+    tx_idx = scalar.index_cast(tx).value
+    if scratch_offset:
+        tx_idx = (wrap_surface_value(tx_idx) + wrap_surface_value(c_scratch_off)).value
+    _emit_store(scratch, tx_idx, x)
+
+    # ── Stage 2: sync ────────────────────────────────────────────────
+    syncthreads()
+
+    # ── Stage 3: reducers sequentially combine ───────────────────────
+    is_reducer = _ult(lane, c_scale)
+    with if_(is_reducer) as br:
+        with br.then_:
             # initial: load scratch[scratch_offset + group * threads + lane]
-            group_offset = arith.MulIOp(group, c_threads).result
-            first_elem = arith.AddIOp(group_offset, lane).result
-            first_idx = arith.IndexCastOp(idx_t, first_elem).result
+            group_offset = (wrap_surface_value(group) * wrap_surface_value(c_threads)).value
+            first_elem = (wrap_surface_value(group_offset) + wrap_surface_value(lane)).value
+            first_idx = scalar.index_cast(first_elem).value
             if scratch_offset:
-                first_idx = arith.AddIOp(first_idx, c_scratch_off).result
-            acc = _emit_load(scalar_t, scratch, first_idx)
+                first_idx = (wrap_surface_value(first_idx) + wrap_surface_value(c_scratch_off)).value
+            acc = _emit_load(None, scratch, first_idx)
 
             # scf.for i = scale to threads step scale
-            lb = arith.ConstantOp(idx_t, scale).result
-            ub = arith.ConstantOp(idx_t, threads).result
-            step = arith.ConstantOp(idx_t, scale).result
+            lb = _const_idx(scale)
+            ub = _const_idx(threads)
+            step = _const_idx(scale)
             for_op = scf.ForOp(lb, ub, step, [acc])
             with InsertionPoint(for_op.body):
                 i = for_op.induction_variable
                 prev = for_op.inner_iter_args[0]
-                elem = arith.AddIOp(first_idx, i).result
-                loaded = _emit_load(
-                    scalar_t, scratch, elem)
-                new_acc = _apply_sum(prev, loaded)
+                elem = (wrap_surface_value(first_idx) + wrap_surface_value(i)).value
+                loaded = _emit_load(None, scratch, elem)
+                new_acc = combine(prev, loaded)
                 scf.YieldOp([new_acc])
-            scf.YieldOp([for_op.results[0]])
+            acc = for_op.results[0]
 
-        with InsertionPoint(reduce_if.else_block):
-            scf.YieldOp([x])
+            br.assign(flag=acc)
+        with br.else_:
+            br.assign(flag=x)
 
-        flag = reduce_if.results[0]
+    flag = unwrap_surface_value(br.flag)
 
-        # ── Stage 4: sync ────────────────────────────────────────────────
-        _pto.SyncthreadsOp()
+    # ── Stage 4: sync ────────────────────────────────────────────────
+    syncthreads()
 
-        # ── Stage 5: per-class leader writes reduced value ───────────────
-        # leader lanes 0..scale-1 each write their residue class result
-        is_leader = arith.CmpIOp(
-            arith.CmpIPredicate.ult, lane, c_scale).result
-        write_if = scf.IfOp(is_leader, hasElse=False)
-        with InsertionPoint(write_if.then_block):
-            dst_offset = arith.AddIOp(
-                arith.MulIOp(group, c_threads).result, lane).result
-            dst_idx = arith.IndexCastOp(idx_t, dst_offset).result
+    # ── Stage 5: per-class leader writes reduced value ───────────────
+    is_leader = _ult(lane, c_scale)
+    with if_(is_leader) as br5:
+        with br5.then_:
+            dst_offset = (wrap_surface_value(group) * wrap_surface_value(c_threads) + wrap_surface_value(lane)).value
+            dst_idx = scalar.index_cast(dst_offset).value
             if scratch_offset:
-                dst_idx = arith.AddIOp(dst_idx, c_scratch_off).result
+                dst_idx = (wrap_surface_value(dst_idx) + wrap_surface_value(c_scratch_off)).value
             _emit_store(scratch, dst_idx, flag)
-            scf.YieldOp([])
 
-        # ── Stage 6: sync + broadcast scratch[scratch_offset + group*threads + tx%scale] ──
-        _pto.SyncthreadsOp()
-        my_slot = arith.AddIOp(
-            arith.MulIOp(group, c_threads).result,
-            arith.RemUIOp(tx, c_scale).result).result
-        load_idx = arith.IndexCastOp(idx_t, my_slot).result
-        if scratch_offset:
-            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
-        result = _emit_load(scalar_t, scratch, load_idx)
+    # ── Stage 6: sync + broadcast scratch[scratch_offset + group*threads + tx%scale] ──
+    syncthreads()
+    my_slot = ((wrap_surface_value(group) * wrap_surface_value(c_threads)) +
+               wrap_surface_value(arith.RemUIOp(tx, c_scale).result)).value
+    load_idx = scalar.index_cast(my_slot).value
+    if scratch_offset:
+        load_idx = (wrap_surface_value(load_idx) + wrap_surface_value(c_scratch_off)).value
+    result = _emit_load(None, scratch, load_idx)
 
-        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
-        _pto.SyncthreadsOp()
+    # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+    syncthreads()
 
-        func.ReturnOp([result])
+    return result
 
 
 __all__ = [
     "simt_allreduce_sum",
+    "simt_allreduce_max",
+    "simt_allreduce_min",
 ]
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index 19cd93a91f..b469dbe6c8 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -144,7 +144,7 @@
 )
 
 # ── All-reduce ─────────────────────────────────────────────────────────────────
-from ._allreduce import simt_allreduce_sum  # noqa: F401
+from ._allreduce import simt_allreduce_sum, simt_allreduce_max, simt_allreduce_min  # noqa: F401
 
 # ── Decorator ─────────────────────────────────────────────────────────────────
 from ._jit import jit, KernelHandle, merge_jit_modules      # noqa: F401
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
index 1f6b964894..ce12914b86 100644
--- a/ptodsl/tests/test_allreduce.py
+++ b/ptodsl/tests/test_allreduce.py
@@ -21,19 +21,7 @@ def expect(condition: bool, message: str) -> None:
 
 
 def main():
-    from ptodsl._allreduce import _helper_name, simt_allreduce_sum
-
-    # ══════════════════════════════════════════════════════════════════════════
-    # helper name format
-    # ══════════════════════════════════════════════════════════════════════════
-    expect(
-        _helper_name("f32", 128, 1, 0) == "__tl_allreduce_sum_f32_t128_s1_o0",
-        "helper name format (sum/f32/t128/s1/o0)",
-    )
-    expect(
-        _helper_name("f16", 32, 2, 4) == "__tl_allreduce_sum_f16_t32_s2_o4",
-        "helper name format (f16/t32/s2/o4)",
-    )
+    from ptodsl._allreduce import simt_allreduce_sum, simt_allreduce_max, simt_allreduce_min
 
     # ══════════════════════════════════════════════════════════════════════════
     # Path 0: identity (threads <= scale)
@@ -100,8 +88,6 @@ def kernel_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp = kernel_warp.compile()
     mlir_warp = compiled_warp.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t32_s1_o0" in mlir_warp,
-           "IR: warp_reduce helper name")
     expect("pto.redux_add" in mlir_warp,
            "IR: redux_add in warp_reduce helper")
     expect("pto.syncthreads" not in mlir_warp,
@@ -124,8 +110,6 @@ def kernel_warp_t16(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_t16 = kernel_warp_t16.compile()
     mlir_warp_t16 = compiled_warp_t16.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o0" in mlir_warp_t16,
-           "IR: warp_reduce t=16 helper name")
     expect("pto.redux_add" in mlir_warp_t16,
            "IR: redux_add for groups>1")
     expect("arith.select" in mlir_warp_t16,
@@ -148,8 +132,6 @@ def kernel_warp_t8(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_t8 = kernel_warp_t8.compile()
     mlir_warp_t8 = compiled_warp_t8.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t8_s1_o0" in mlir_warp_t8,
-           "IR: warp_reduce t=8 butterfly helper name (sum)")
     expect("pto.shuffle_bfly" in mlir_warp_t8,
            "IR: shuffle_bfly for butterfly path")
     expect("pto.redux_add" not in mlir_warp_t8,
@@ -172,8 +154,6 @@ def kernel_warp_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_s2 = kernel_warp_s2.compile()
     mlir_warp_s2 = compiled_warp_s2.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t32_s2_o0" in mlir_warp_s2,
-           "IR: warp_reduce s=2 butterfly helper name (sum)")
     expect("pto.shuffle_bfly" in mlir_warp_s2,
            "IR: shuffle_bfly for butterfly (scale>1)")
     expect("pto.redux_add" not in mlir_warp_s2,
@@ -191,8 +171,6 @@ def kernel_warp_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_o4 = kernel_warp_o4.compile()
     mlir_warp_o4 = compiled_warp_o4.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o4" in mlir_warp_o4,
-           "IR: warp_reduce o=4 helper name")
     expect("pto.get_tid_x" in mlir_warp_o4,
            "IR: warp_reduce o=4 uses get_tid_x (not raw get_laneid)")
     expect("arith.subi" in mlir_warp_o4,
@@ -215,8 +193,6 @@ def kernel_ub6(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_ub6 = kernel_ub6.compile()
     mlir_ub6 = compiled_ub6.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o0" in mlir_ub6,
-           "IR: ub_reduce t=6 helper name")
     expect("pto.syncthreads" in mlir_ub6,
            "IR: ub_reduce has syncthreads")
     expect("pto.store" in mlir_ub6,
@@ -239,8 +215,6 @@ def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_ub6s2 = kernel_ub6s2.compile()
     mlir_ub6s2 = compiled_ub6s2.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t6_s2_o0" in mlir_ub6s2,
-           "IR: ub_reduce t=6 s=2 helper name")
     expect("pto.syncthreads" in mlir_ub6s2,
            "IR: ub_reduce t=6 s=2 has syncthreads")
     expect("pto.store" in mlir_ub6s2,
@@ -270,8 +244,6 @@ def kernel_ub_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_ub_o4 = kernel_ub_o4.compile()
     mlir_ub_o4 = compiled_ub_o4.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o4" in mlir_ub_o4,
-           "IR: ub_reduce o=4 helper name")
     expect("arith.subi" in mlir_ub_o4,
            "IR: ub_reduce o=4 uses subi for tx = tid_x - offset")
     compiled_ub_o4.verify()
@@ -291,12 +263,8 @@ def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
     compiled = kernel_128.compile()
     mlir = compiled.mlir_text()
 
-    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
-           "IR: helper function definition")
     expect("pto.simt_entry" in mlir,
            "IR: helper carries pto.simt_entry")
-    expect("call @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
-           "IR: func.call to helper")
 
     for op_name in (
         "pto.redux_add", "pto.syncthreads", "pto.store", "pto.load",
@@ -321,8 +289,6 @@ def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_64 = kernel_64.compile()
     mlir_64 = compiled_64.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t64_s1_o0" in mlir_64,
-           "IR: helper for t=64")
     compiled_64.verify()
 
     # ── cross_warp: sum, f32, t=256 (8 warps) ───────────────────────────────
@@ -336,8 +302,6 @@ def kernel_256(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_256 = kernel_256.compile()
     mlir_256 = compiled_256.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t256_s1_o0" in mlir_256,
-           "IR: helper for t=256")
     compiled_256.verify()
 
     # ══════════════════════════════════════════════════════════════════════════
@@ -355,8 +319,6 @@ def kernel_cw_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_cw_s2 = kernel_cw_s2.compile()
     mlir_cw_s2 = compiled_cw_s2.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t128_s2_o0" in mlir_cw_s2,
-           "IR: cross_warp s=2 helper name")
     expect("pto.shuffle_bfly" in mlir_cw_s2,
            "IR: cross_warp s=2 has shuffle_bfly (butterfly for per-warp + leader)")
     expect("pto.syncthreads" in mlir_cw_s2,
@@ -379,8 +341,6 @@ def kernel_cw_s16(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_cw_s16 = kernel_cw_s16.compile()
     mlir_cw_s16 = compiled_cw_s16.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t128_s16_o0" in mlir_cw_s16,
-           "IR: cross_warp s=16 manual helper name")
     expect("pto.syncthreads" in mlir_cw_s16,
            "IR: cross_warp s=16 has syncthreads")
     compiled_cw_s16.verify()
@@ -397,8 +357,6 @@ def kernel_cw_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_cw_o4 = kernel_cw_o4.compile()
     mlir_cw_o4 = compiled_cw_o4.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o4" in mlir_cw_o4,
-           "IR: cross_warp o=4 helper name")
     expect("pto.get_tid_x" in mlir_cw_o4,
            "IR: cross_warp o=4 uses get_tid_x")
     expect("arith.subi" in mlir_cw_o4,
@@ -420,8 +378,6 @@ def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_ub48 = kernel_ub48.compile()
     mlir_ub48 = compiled_ub48.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t48_s1_o0" in mlir_ub48,
-           "IR: ub_reduce fallback t=48 helper name")
     expect("pto.syncthreads" in mlir_ub48,
            "IR: ub_reduce fallback has syncthreads")
     expect("pto.store" in mlir_ub48,
@@ -431,7 +387,6 @@ def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
     compiled_ub48.verify()
 
     # ══════════════════════════════════════════════════════════════════════════
-    # helper deduplication across multiple calls
     # ══════════════════════════════════════════════════════════════════════════
 
     @pto.jit(target="a5")
@@ -447,11 +402,6 @@ def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
     compiled2 = kernel_reuse.compile()
     mlir2 = compiled2.mlir_text()
 
-    definitions = mlir2.count("func.func @__tl_allreduce_sum_f32_t128_s1_o0")
-    expect(definitions == 1,
-           f"IR: helper defined {definitions} times, expected 1")
-    calls = mlir2.count("call @__tl_allreduce_sum_f32_t128_s1_o0")
-    expect(calls == 2, f"IR: expected 2 call sites, got {calls}")
     compiled2.verify()
 
 
@@ -526,6 +476,208 @@ def kernel_dtype_mismatch(scratch_gm: pto.ptr(pto.f32, "gm")):
         expect("element type" in err.lower() or "mismatch" in err.lower(),
                f"dtype mismatch should mention element type, got: {e}")
 
+    # ══════════════════════════════════════════════════════════════════════════
+    # Max reducer — Path 1a: warp_reduce, hw redux (threads=32, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+    @pto.jit(target="a5")
+    def kernel_max_warp_hw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, threads=32, scale=1)
+
+    compiled_max_warp = kernel_max_warp_hw.compile()
+    mlir_max_warp = compiled_max_warp.mlir_text()
+
+    expect(
+        "pto.redux_max" in mlir_max_warp,
+        "Path 1a (max): IR must contain pto.redux_max",
+    )
+    expect(
+        "pto.syncthreads" not in mlir_max_warp,
+        "Path 1a (max): single-warp hw reduce needs no syncthreads",
+    )
+
+    # ── Max reducer — Path 1c: warp_reduce, butterfly (threads=8, scale=1) ──
+    @pto.jit(target="a5")
+    def kernel_max_butterfly(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, threads=8, scale=1)
+
+    compiled_max_bfly = kernel_max_butterfly.compile()
+    mlir_max_bfly = str(compiled_max_bfly.mlir_text())
+
+    expect(
+        "arith.maximumf" in mlir_max_bfly,
+        "Path 1c (max): butterfly must emit arith.maximumf for element-wise max",
+    )
+    expect(
+        "pto.shuffle_bfly" in mlir_max_bfly,
+        "Path 1c (max): butterfly must use pto.shuffle_bfly",
+    )
+    expect(
+        "pto.redux_max" not in mlir_max_bfly,
+        "Path 1c (max): butterfly path should NOT use hw redux",
+    )
+
+    # ── Max reducer — Path 3: cross_warp_reduce (threads=128, scale=1) ──
+    @pto.jit(target="a5")
+    def kernel_max_cross_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled_max_cw = kernel_max_cross_warp.compile()
+    mlir_max_cw = str(compiled_max_cw.mlir_text())
+
+    expect(
+        "pto.redux_max" in mlir_max_cw,
+        "Path 3 (max): cross-warp IR must contain pto.redux_max",
+    )
+    expect(
+        "pto.syncthreads" in mlir_max_cw,
+        "Path 3 (max): cross-warp needs syncthreads barriers",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Min reducer — Path 1a: warp_reduce, hw redux (threads=32, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+    @pto.jit(target="a5")
+    def kernel_min_warp_hw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_min(x, threads=32, scale=1)
+
+    compiled_min_warp = kernel_min_warp_hw.compile()
+    mlir_min_warp = str(compiled_min_warp.mlir_text())
+
+    expect(
+        "pto.redux_min" in mlir_min_warp,
+        "Path 1a (min): IR must contain pto.redux_min",
+    )
+    expect(
+        "pto.syncthreads" not in mlir_min_warp,
+        "Path 1a (min): single-warp hw reduce needs no syncthreads",
+    )
+
+    # ── Min reducer — Path 4 (ub_reduce fallback): threads=48, non-pow2 ──
+    @pto.jit(target="a5")
+    def kernel_min_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_min(x, scratch=ub_scratch, threads=48, scale=1)
+
+    compiled_min_ub = kernel_min_ub.compile()
+    mlir_min_ub = str(compiled_min_ub.mlir_text())
+
+    expect(
+        "arith.minimumf" in mlir_min_ub,
+        "Path 4 (min): ub_reduce fallback must emit arith.minimumf",
+    )
+
+    # ── Identity smoke tests for max/min ───────────────────────────────────
+    expect(
+        simt_allreduce_max(1.0, threads=1, scale=1) == 1.0,
+        "Path 0 (max): threads <= scale returns identity (value unchanged)",
+    )
+    expect(
+        simt_allreduce_min(1.0, threads=2, scale=2) == 1.0,
+        "Path 0 (min): threads <= scale returns identity (value unchanged)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Lowering verification — ptoas → bisheng (full AOT compilation)
+    #
+    # Tests that the allreduce MLIR survives the complete ptoas pipeline:
+    #   MLIR (PTO dialect) → VPTO passes → LLVM IR → bisheng device codegen
+    #
+    # KNOWN TOOLCHAIN ISSUES (bisheng, not allreduce):
+    #   a) bisheng stack-smashing on SIMT code that stores to GM
+    #   b) bisheng stack-smashing on cross-warp scratch-buffer code (≥ 128 lanes)
+    #
+    # These are bisheng bugs — ptoas VPTO lowering succeeds; the crash is
+    # in the device LLVM→object step inside bisheng.  Verified by:
+    #   ptoas --emit-vpto-llvm-ir  → valid LLVM IR (no crash)
+    #   ptoas -o kernel.o          → bisheng crash during LLVM→object
+    # ══════════════════════════════════════════════════════════════════════════
+
+    import subprocess
+    import tempfile
+    from pathlib import Path
+
+    def _ptoas_binary() -> Path:
+        for p in [
+            Path(__file__).resolve().parents[2] / "build" / "tools" / "ptoas" / "ptoas",
+        ]:
+            if p.is_file():
+                return p
+        raise RuntimeError(
+            "ptoas binary not found; run `source scripts/ptoas_env.sh` or build ptoas"
+        )
+
+    def _lower_and_check(compiled, case_label: str, expect_pass: bool = True) -> bool:
+        """Run ``ptoas`` lowering on *compiled* MLIR.  Returns True on success."""
+        ptoas = _ptoas_binary()
+        mlir_text = compiled.mlir_text()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mlir_path = Path(tmpdir) / "kernel.mlir"
+            obj_path = Path(tmpdir) / "kernel.o"
+            mlir_path.write_text(mlir_text)
+            result = subprocess.run(
+                [str(ptoas), "--pto-arch=a5", "--pto-backend=vpto",
+                 "--enable-tile-op-expand",
+                 str(mlir_path), "-o", str(obj_path)],
+                capture_output=True, text=True,
+            )
+            ok = result.returncode == 0 and obj_path.is_file()
+            if ok:
+                return True
+            bisheng_crash = "stack smashing" in result.stderr or "exit code 134" in result.stderr
+            tag = "SKIP (bisheng bug)" if bisheng_crash else "FAIL"
+            if expect_pass and not bisheng_crash:
+                # Unexpected failure — report loudly
+                sys.stderr.write(
+                    f"\n  [{tag}] {case_label} (exit={result.returncode})\n"
+                    f"  STDERR: {result.stderr[:500]}\n"
+                )
+            else:
+                print(f"  [{tag}] {case_label}")
+            return False
+
+    # ── Warp-reduce (≤ 32 lanes, NO scratch, NO GM store) ──
+    # These are the simplest kernels — they only compute a value and return
+    # from the SIMT body without writing to GM.  They MUST lower cleanly
+    # because they avoid both known bisheng issues.
+    expect(
+        _lower_and_check(kernel_warp.compile(), "warp_sum_t32"),
+        "lowering: warp_sum (32 lanes, hw redux, no GM store) must pass",
+    )
+    expect(
+        _lower_and_check(kernel_max_warp_hw.compile(), "warp_max_t32"),
+        "lowering: warp_max (32 lanes, hw redux, no GM store) must pass",
+    )
+    expect(
+        _lower_and_check(kernel_min_warp_hw.compile(), "warp_min_t32"),
+        "lowering: warp_min (32 lanes, hw redux, no GM store) must pass",
+    )
+
+    # ── Cross-warp (128 lanes, UB scratch) — known bisheng crash ──
+    # ptoas VPTO lowering succeeds; bisheng crashes on the device LLVM IR.
+    @pto.jit(target="a5")
+    def _kernel_cross_lowering(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+    _lower_and_check(_kernel_cross_lowering.compile(), "cross_sum_t128", expect_pass=False)
+
     print("ptodsl_allreduce: PASS")
 
 
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
new file mode 100644
index 0000000000..2dca5fb330
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
new file mode 100644
index 0000000000..e9b2842bf5
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0xFF800000 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_max %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_max %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
new file mode 100644
index 0000000000..2dca5fb330
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
new file mode 100644
index 0000000000..f9353c1b71
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0x7F800000 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_min %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_min %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
new file mode 100644
index 0000000000..ce2b4c57da
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 128.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
new file mode 100644
index 0000000000..53f01c524f
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_add %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_add %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
new file mode 100644
index 0000000000..76a227fb75
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
new file mode 100644
index 0000000000..e22af0898e
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0xFF800000 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_max %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
new file mode 100644
index 0000000000..76a227fb75
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
new file mode 100644
index 0000000000..d921ef1fe3
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0x7F800000 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_min %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
new file mode 100644
index 0000000000..4a5fb7b63d
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 32.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
new file mode 100644
index 0000000000..18e1374384
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_add %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}