diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
new file mode 100644
index 0000000000..e75a8d1ccb
--- /dev/null
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -0,0 +1,722 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+"""
+SIMT cross-workitem all-reduce helpers.
+
+All-reduce ops are emitted **inline** at the current insertion point
+(no helper-function outline or ``func.call``).  Three reducer variants
+are exposed: ``simt_allreduce_sum``, ``simt_allreduce_max``, ``simt_allreduce_min``.
+
+Dispatch tree (mirrors the C++ compile-time dispatch in ``reduce.h``)::
+
+    threads <= scale                                    → identity
+    threads ≤ 32,  pow2(threads), pow2(scale)           → warp_reduce
+    threads ≤ 32                                        → ub_reduce
+    threads > 32, pow2(threads), scale ≤ 32, pow2(scale) → cross_warp_reduce
+    otherwise                                           → ub_reduce
+"""
+
+from __future__ import annotations
+
+from . import scalar
+from ._control_flow import if_
+from ._ops import const as _const, get_laneid, get_tid_x, redux_add, redux_max, redux_min, shuffle_bfly, syncthreads
+from ._surface_values import unwrap_surface_value, wrap_surface_value
+from ._tracing.active import current_session
+from ._types import float16 as _f16_dtype, float32 as _f32_dtype, index as _idx_dtype, int32 as _i32_dtype, _resolve
+
+from mlir.dialects import arith, pto as _pto, scf  # arith for unsigned ops; scf for ForOp in ub_reduce
+from mlir.ir import F16Type, F32Type, InsertionPoint, UnitAttr
+
+
+def _is_pow2(n: int) -> bool:
+    return n > 0 and (n & (n - 1)) == 0
+
+
+def _const_i32(value: int):
+    """Emit an i32 constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_i32_dtype)).value
+
+
+def _const_idx(value: int):
+    """Emit an index constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_idx_dtype)).value
+
+
+def _const_f32(value: float):
+    """Emit an f32 constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_f32_dtype)).value
+
+
+def _const_f16(value: float):
+    """Emit an f16 constant via PTODSL ``pto.const``, return raw SSA value."""
+    return _const(value, dtype=_resolve(_f16_dtype)).value
+
+
+def _ult(a, b):
+    """Unsigned less-than comparison. Keeps raw arith because PTODSL __lt__
+    on signless i32 emits signed comparison (slt/cmpi)."""
+    return arith.CmpIOp(arith.CmpIPredicate.ult, a, b).result
+
+
+def _dtype_to_str(mlir_type) -> str:
+    """Map an MLIR scalar type to a canonical dtype string."""
+    if mlir_type == F32Type.get():
+        return "f32"
+    if mlir_type == F16Type.get():
+        return "f16"
+    raise NotImplementedError(
+        f"all_reduce: unsupported dtype {mlir_type}"
+    )
+
+
+# ── reducer dispatch tables ──────────────────────────────────────────────────
+
+_REDUCER_IDENTITY = {
+    "sum": {"f32": 0.0, "f16": 0.0},
+    "max": {"f32": float("-inf"), "f16": float("-inf")},
+    "min": {"f32": float("inf"), "f16": float("inf")},
+}
+"""Identity element per reducer and dtype."""
+
+
+def _apply_sum(a, b):
+    """Emit ``a + b`` (float addition) via PTODSL operator."""
+    return (wrap_surface_value(a) + wrap_surface_value(b)).value
+
+
+def _apply_max(a, b):
+    """Emit ``max(a, b)`` via PTODSL ``scalar.max``."""
+    return scalar.max(a, b).value
+
+
+def _apply_min(a, b):
+    """Emit ``min(a, b)`` via PTODSL ``scalar.min``."""
+    return scalar.min(a, b).value
+
+
+_REDUCER_COMBINE = {
+    "sum": _apply_sum,
+    "max": _apply_max,
+    "min": _apply_min,
+}
+"""Element-wise combine function per reducer."""
+
+
+def _redux_sum(x):
+    """Hardware lane-sum reduction, returns raw SSA value."""
+    return redux_add(x).value
+
+
+def _redux_max(x):
+    """Hardware lane-max reduction, returns raw SSA value."""
+    return redux_max(x).value
+
+
+def _redux_min(x):
+    """Hardware lane-min reduction, returns raw SSA value."""
+    return redux_min(x).value
+
+
+_REDUCER_REDUX = {
+    "sum": _redux_sum,
+    "max": _redux_max,
+    "min": _redux_min,
+}
+"""Hardware redux op per reducer."""
+
+# ── scratch validation ────────────────────────────────────────────────────
+
+def _validate_scratch(scratch, expected_mlir_type, *, context: str):
+    """Verify *scratch* is a ``!pto.ptr<expected_mlir_type, ub>`` buffer."""
+    raw_scratch = unwrap_surface_value(scratch)
+    try:
+        ptr_type = _pto.PtrType(raw_scratch.type)
+    except Exception:
+        raise TypeError(
+            f"all_reduce {context}: scratch must be a !pto.ptr buffer, "
+            f"got {raw_scratch.type}"
+        ) from None
+    vec_attr = _pto.AddressSpaceAttr.get(_pto.AddressSpace.VEC)
+    if ptr_type.memory_space != vec_attr:
+        raise TypeError(
+            f"all_reduce {context}: scratch must be in UB memory space, "
+            f"got {ptr_type.memory_space}"
+        )
+    if ptr_type.element_type != expected_mlir_type:
+        raise TypeError(
+            f"all_reduce {context}: scratch element type mismatch: "
+            f"expected {expected_mlir_type}, got {ptr_type.element_type}"
+        )
+
+
+# ── shared inline-emission utility ──────────────────────────────────────────
+
+def _emit_inline(emit_fn, *surface_args):
+    """Unwrap *surface_args* and call *emit_fn* at the current insertion point.
+
+    The emitter receives raw MLIR values and returns a raw SSA result,
+    which this wrapper re-wraps as a surface value.
+
+    Inline SIMT allreduce emits ``pto.syncthreads``, which requires the
+    containing function to carry ``pto.simt_entry``.  We attach the attribute
+    here (idempotently) so that callers inside ``with pto.simt():`` do not
+    need to manage the attribute themselves.
+    """
+    raw_args = [unwrap_surface_value(a) for a in surface_args]
+    result = emit_fn(*raw_args)
+
+    # Ensure the enclosing function is marked as a SIMT entry so the
+    # syncthreads verifier passes.
+    session = current_session()
+    if session is not None:
+        parent_func = session.current_function
+        parent_func.attributes["pto.simt_entry"] = UnitAttr.get()
+
+    return wrap_surface_value(result)
+
+
+# ── reduction operator application ─────────────────────────────────────────
+
+def _emit_store(buffer, offset, value):
+    """Emit ``pto.store`` via PTODSL ``scalar.store``."""
+    scalar.store(value, buffer, offset)
+
+
+def _emit_load(result_type, buffer, offset):
+    """Emit ``pto.load`` via PTODSL ``scalar.load``.
+
+    *result_type* is accepted for backward compatibility but ignored;
+    ``scalar.load`` infers the element type from the buffer.
+    """
+    return unwrap_surface_value(scalar.load(buffer, offset))
+
+
+def _emit_butterfly(v, *, threads: int, scale: int, reducer: str):
+    """Emit unrolled butterfly shuffle reduce.
+
+    Implements::
+
+        cur = threads
+        while cur > scale:
+            x = op(x, shfl_xor(x, cur/2))
+            cur /= 2
+
+    All loops are unrolled at emission time.  Caller must have set the
+    insertion point.
+    """
+    combine = _REDUCER_COMBINE[reducer]
+    cur = threads
+    while cur > scale:
+        offset = cur // 2
+        mask = _const_i32(offset)
+        shfl = shuffle_bfly(v, mask).value
+        v = combine(v, shfl)
+        cur //= 2
+    return v
+
+
+def _emit_warp_hw_reduce(x, *, threads: int,
+                         lane_in_warp, c_identity, reducer: str):
+    """Emit warp-level hardware reduce.
+
+    When *threads* == 32 ("groups" == 1): a single ``pto.redux_*``.
+    When *threads* < 32 ("groups" > 1): one ``pto.redux_*`` per group,
+    with identity masking for lanes outside the group.
+
+    Caller must have set the insertion point.
+    """
+    redux_fn = _REDUCER_REDUX[reducer]
+    groups = 32 // threads
+
+    if groups == 1:
+        return redux_fn(x)
+
+    c_threads = _const_i32(threads)
+    my_group = arith.DivUIOp(lane_in_warp, c_threads).result  # unsigned div — no PTODSL equivalent
+
+    for g in range(groups):
+        c_g = _const_i32(g)
+        in_group = (wrap_surface_value(my_group) == wrap_surface_value(c_g)).value
+        masked = scalar.select(in_group, x, c_identity).value
+        reduced = redux_fn(masked)
+        x = scalar.select(in_group, reduced, x).value
+    return x
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# public API
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def simt_allreduce_sum(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the reduced sum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+        reducer="sum",
+    )
+
+
+def simt_allreduce_max(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce **max** for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<MaxOp, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the element-wise maximum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+        reducer="max",
+    )
+
+
+def simt_allreduce_min(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce **min** for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<MinOp, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the element-wise minimum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+        reducer="min",
+    )
+
+
+def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
+                                threads, scale, thread_offset, reducer):
+    # ── parameter validation (before identity shortcut) ───────────────────
+    for name, val in (("threads", threads), ("scale", scale),
+                       ("thread_offset", thread_offset)):
+        if not isinstance(val, int):
+            raise ValueError(
+                f"all_reduce: '{name}' must be a Python int, "
+                f"got {type(val).__name__}"
+            )
+    if threads < 1:
+        raise ValueError(f"all_reduce: threads must be >= 1, got {threads}")
+    if scale < 1:
+        raise ValueError(f"all_reduce: scale must be >= 1, got {scale}")
+    if thread_offset < 0:
+        raise ValueError(
+            f"all_reduce: thread_offset must be >= 0, got {thread_offset}"
+        )
+    if threads % scale != 0:
+        raise ValueError(
+            f"all_reduce requires threads % scale == 0; "
+            f"got threads={threads}, scale={scale}"
+        )
+
+    # ── Path 0: identity ──────────────────────────────────────────────────
+    if threads <= scale:
+        return value
+
+    # ── dtype validation ─────────────────────────────────────────────────
+    raw_value = unwrap_surface_value(value)
+    dtype = _dtype_to_str(raw_value.type)
+    if dtype not in ("f32", "f16"):
+        raise NotImplementedError(
+            f"all_reduce only supports f32/f16, got {dtype}"
+        )
+
+    args = dict(dtype=dtype, threads=threads, scale=scale,
+                thread_offset=thread_offset, scratch_offset=scratch_offset,
+                reducer=reducer)
+
+    # ── Path 1: warp_reduce ───────────────────────────────────────────────
+    if threads <= 32 and _is_pow2(threads) and _is_pow2(scale):
+        return _emit_inline(
+            lambda x: _emit_warp_reduce(x, **args),
+            value,
+        )
+
+    # ── All paths below require a scratch buffer ──────────────────────────
+    if scratch is None:
+        raise ValueError(
+            f"all_reduce {reducer}/{dtype}/t{threads}/s{scale}/o{thread_offset} "
+            "requires a UB scratch buffer"
+        )
+    _validate_scratch(
+        scratch, raw_value.type,
+        context=f"{reducer}/{dtype}/t{threads}/s{scale}/o{thread_offset}",
+    )
+
+    # ── Path 2: ub_reduce (threads ≤ 32, non-pow2) ──────────────────────
+    if threads <= 32:
+        return _emit_inline(
+            lambda x, s: _emit_ub_reduce(x, s, **args),
+            value, scratch,
+        )
+
+    # ── Path 3: cross_warp_reduce ────────────────────────────────────────
+    if scale <= 32 and _is_pow2(threads) and _is_pow2(scale):
+        return _emit_inline(
+            lambda x, s: _emit_cross_warp_reduce(x, s, **args),
+            value, scratch,
+        )
+
+    # ── Path 4: ub_reduce fallback (threads > 32, anything else) ─────────
+    return _emit_inline(
+        lambda x, s: _emit_ub_reduce(x, s, **args),
+        value, scratch,
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: warp_reduce  (Path 1: threads ≤ 32, pow2, pow2 scale)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_warp_reduce(x, *,
+                      dtype, threads, scale, thread_offset,
+                      scratch_offset, reducer):
+    """Emit inline single-warp all-reduce at the current insertion point.
+
+    Dispatches to:
+
+    * ``warp_hw_reduce`` when ``extent >= 16`` and ``scale == 1``
+      (fast hardware redux, with group masking for threads < 32).
+    * ``butterfly`` otherwise (software shuffle via ``pto.shuffle_bfly``).
+    """
+    extent = threads // scale
+    identity_val = _REDUCER_IDENTITY[reducer][dtype]
+    const_f = _const_f32 if dtype == "f32" else _const_f16
+
+    c_offset = _const_i32(thread_offset)
+    c_identity = const_f(identity_val)
+
+    if thread_offset:
+        # lane_in_warp = (tid_x - offset) & 31
+        tid_x = get_tid_x().value
+        tx = (wrap_surface_value(tid_x) - wrap_surface_value(c_offset)).value
+        lane_in_warp = (wrap_surface_value(tx) & _const_i32(31)).value
+    else:
+        lane_in_warp = get_laneid().value
+
+    if extent >= 16 and scale == 1:
+        return _emit_warp_hw_reduce(
+            x, threads=threads,
+            lane_in_warp=lane_in_warp, c_identity=c_identity, reducer=reducer,
+        )
+    else:
+        return _emit_butterfly(
+            x, threads=threads, scale=scale, reducer=reducer,
+        )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: cross_warp_reduce  (Path 3: threads > 32)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_cross_warp_reduce(x, scratch, *,
+                            dtype, threads, scale, thread_offset,
+                            scratch_offset, reducer):
+    """Emit inline cross-warp all-reduce at the current insertion point.
+
+    Algorithm overview:
+
+    1. *num_warps* subgroups of 32 lanes each do a per-warp reduce.
+    2. Warp leaders (lid < scale) write → scratch[wid * scale + lid].
+    3. ``pto.syncthreads``.
+    4. Leader warp (lanes with ``tx < 32``) reduces the partial sums:
+       - scale == 1:  ``hw_reduce`` across leader warp.
+       - scale * num_warps ≤ 32:  ``butterfly<total, scale, 0>``.
+       - otherwise:  manual loop over warps.
+    5. Global leader (tx < scale) writes result → scratch[tx].
+    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
+    7. Extra ``pto.syncthreads`` to fence scratch reuse.
+    """
+    num_warps = threads // 32
+    identity_val = _REDUCER_IDENTITY[reducer][dtype]
+    const_f = _const_f32 if dtype == "f32" else _const_f16
+    combine = _REDUCER_COMBINE[reducer]
+    redux_fn = _REDUCER_REDUX[reducer]
+
+    # ── constants ────────────────────────────────────────────────────
+    c5_i32 = _const_i32(5)
+    c31_i32 = _const_i32(31)
+    c32_i32 = _const_i32(32)
+    c_scale = _const_i32(scale)
+    c_num_warps = _const_i32(num_warps)
+    c_offset = _const_i32(thread_offset)
+    c_scratch_off = _const_idx(scratch_offset)
+    c_identity = const_f(identity_val)
+
+    # ── thread indexing ──────────────────────────────────────────────
+    tid_x = get_tid_x().value
+    if thread_offset:
+        tx = (wrap_surface_value(tid_x) - wrap_surface_value(c_offset)).value
+        wid = arith.ShRUIOp(tx, c5_i32).result  # unsigned shift — no PTODSL equivalent
+        lid = (wrap_surface_value(tx) & c31_i32).value
+    else:
+        tx = tid_x
+        wid = arith.ShRUIOp(tx, c5_i32).result
+        lid = get_laneid().value
+
+    # ── Stage 1: per-warp reduce ─────────────────────────────────────
+    if scale == 1:
+        warp_val = redux_fn(x)
+    else:
+        warp_val = _emit_butterfly(
+            x, threads=32, scale=scale, reducer=reducer,
+        )
+
+    # ── Stage 2: warp leaders write partial results ──────────────────
+    is_writer = _ult(lid, c_scale)
+    with if_(is_writer) as br:
+        with br.then_:
+            slot = (wrap_surface_value(wid) * wrap_surface_value(c_scale) + wrap_surface_value(lid)).value
+            slot_idx = scalar.index_cast(slot).value
+            if scratch_offset:
+                slot_idx = (wrap_surface_value(slot_idx) + wrap_surface_value(c_scratch_off)).value
+            _emit_store(scratch, slot_idx, warp_val)
+
+    # ── Stage 3: sync before reading partial results ─────────────────
+    syncthreads()
+
+    # ── Stage 4: leader warp reduces partial sums ────────────────────
+    is_leader_warp = _ult(tx, c32_i32)
+    with if_(is_leader_warp) as br:
+        with br.then_:
+            if scale == 1:
+                # ── scale == 1: hw_reduce across leader warp ────────
+                need_load = _ult(lid, c_num_warps)
+                with if_(need_load) as inner_br:
+                    with inner_br.then_:
+                        lid_idx = scalar.index_cast(lid).value
+                        tmp = _emit_load(None, scratch, lid_idx)
+                        inner_br.assign(loaded=tmp)
+                    with inner_br.else_:
+                        inner_br.assign(loaded=c_identity)
+                loaded = inner_br.loaded
+                stage4_result = redux_fn(loaded)
+            elif scale * num_warps <= 32:
+                # ── scale > 1, fits in one warp: butterfly ──────────
+                total = scale * num_warps
+                c_total = _const_i32(total)
+                need_load = _ult(lid, c_total)
+                with if_(need_load) as inner_br:
+                    with inner_br.then_:
+                        lid_idx = scalar.index_cast(lid).value
+                        if scratch_offset:
+                            lid_idx = (wrap_surface_value(lid_idx) + wrap_surface_value(c_scratch_off)).value
+                        tmp = _emit_load(None, scratch, lid_idx)
+                        inner_br.assign(loaded=tmp)
+                    with inner_br.else_:
+                        inner_br.assign(loaded=c_identity)
+                loaded = inner_br.loaded
+                stage4_result = _emit_butterfly(
+                    loaded,
+                    threads=total, scale=scale, reducer=reducer,
+                )
+            else:
+                # ── manual loop: lid < scale lanes each reduce num_warps
+                is_reducer = _ult(lid, c_scale)
+                reduced = c_identity
+                my_slot = arith.RemUIOp(lid, c_scale).result  # unsigned rem
+                for w in range(num_warps):
+                    c_w = _const_i32(w)
+                    idx_val = (wrap_surface_value(c_w) * wrap_surface_value(c_scale) + wrap_surface_value(my_slot)).value
+                    slot_idx = scalar.index_cast(idx_val).value
+                    if scratch_offset:
+                        slot_idx = (wrap_surface_value(slot_idx) + wrap_surface_value(c_scratch_off)).value
+                    loaded_v = _emit_load(None, scratch, slot_idx)
+                    reduced = combine(reduced, loaded_v)
+                stage4_result = scalar.select(
+                    is_reducer, reduced, c_identity).value
+
+            br.assign(stage4_result=stage4_result)
+        with br.else_:
+            br.assign(stage4_result=c_identity)
+
+    partial_reduced = unwrap_surface_value(br.stage4_result)
+
+    # ── Stage 5: global leader writes result to scratch ──────────────
+    is_global_leader = _ult(tx, c_scale)
+    with if_(is_global_leader) as br5:
+        with br5.then_:
+            tx_idx = scalar.index_cast(tx).value
+            if scratch_offset:
+                tx_idx = (wrap_surface_value(tx_idx) + wrap_surface_value(c_scratch_off)).value
+            _emit_store(scratch, tx_idx, partial_reduced)
+
+    # ── Stage 6: sync + broadcast load scratch[tx % scale] ───────────
+    syncthreads()
+    my_slot = arith.RemUIOp(tx, c_scale).result  # unsigned rem
+    load_idx = scalar.index_cast(my_slot).value
+    if scratch_offset:
+        load_idx = (wrap_surface_value(load_idx) + wrap_surface_value(c_scratch_off)).value
+    result = _emit_load(None, scratch, load_idx)
+
+    # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+    syncthreads()
+
+    return result
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: ub_reduce  (Paths 2 & 4: fallback via UB scratch)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_ub_reduce(x, scratch, *,
+                    dtype, threads, scale, thread_offset,
+                    scratch_offset, reducer):
+    """Emit inline UB-scratch all-reduce at the current insertion point.
+
+    Algorithm:
+
+    1. Each lane writes x → scratch[tx].
+    2. ``pto.syncthreads``.
+    3. Lanes with ``lane % scale == 0`` sequentially reduce scratch slots.
+    4. ``pto.syncthreads``.
+    5. Global leader (lane % scale == 0, lane / scale == 0) writes back.
+    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
+    7. ``pto.syncthreads`` to fence scratch reuse.
+    """
+    combine = _REDUCER_COMBINE[reducer]
+
+    # ── constants ────────────────────────────────────────────────────
+    c_threads = _const_i32(threads)
+    c_scale = _const_i32(scale)
+    c_offset = _const_i32(thread_offset)
+    c_scratch_off = _const_idx(scratch_offset)
+
+    # ── thread indexing ──────────────────────────────────────────────
+    tid_x = get_tid_x().value
+    tx = (wrap_surface_value(tid_x) - wrap_surface_value(c_offset)).value if thread_offset else tid_x
+    group = arith.DivUIOp(tx, c_threads).result  # unsigned div
+    lane = arith.RemUIOp(tx, c_threads).result    # unsigned rem
+
+    # ── Stage 1: each lane writes x → scratch[scratch_offset + tx] ──
+    tx_idx = scalar.index_cast(tx).value
+    if scratch_offset:
+        tx_idx = (wrap_surface_value(tx_idx) + wrap_surface_value(c_scratch_off)).value
+    _emit_store(scratch, tx_idx, x)
+
+    # ── Stage 2: sync ────────────────────────────────────────────────
+    syncthreads()
+
+    # ── Stage 3: reducers sequentially combine ───────────────────────
+    is_reducer = _ult(lane, c_scale)
+    with if_(is_reducer) as br:
+        with br.then_:
+            # initial: load scratch[scratch_offset + group * threads + lane]
+            group_offset = (wrap_surface_value(group) * wrap_surface_value(c_threads)).value
+            first_elem = (wrap_surface_value(group_offset) + wrap_surface_value(lane)).value
+            first_idx = scalar.index_cast(first_elem).value
+            if scratch_offset:
+                first_idx = (wrap_surface_value(first_idx) + wrap_surface_value(c_scratch_off)).value
+            acc = _emit_load(None, scratch, first_idx)
+
+            # scf.for i = scale to threads step scale
+            lb = _const_idx(scale)
+            ub = _const_idx(threads)
+            step = _const_idx(scale)
+            for_op = scf.ForOp(lb, ub, step, [acc])
+            with InsertionPoint(for_op.body):
+                i = for_op.induction_variable
+                prev = for_op.inner_iter_args[0]
+                elem = (wrap_surface_value(first_idx) + wrap_surface_value(i)).value
+                loaded = _emit_load(None, scratch, elem)
+                new_acc = combine(prev, loaded)
+                scf.YieldOp([new_acc])
+            acc = for_op.results[0]
+
+            br.assign(flag=acc)
+        with br.else_:
+            br.assign(flag=x)
+
+    flag = unwrap_surface_value(br.flag)
+
+    # ── Stage 4: sync ────────────────────────────────────────────────
+    syncthreads()
+
+    # ── Stage 5: per-class leader writes reduced value ───────────────
+    is_leader = _ult(lane, c_scale)
+    with if_(is_leader) as br5:
+        with br5.then_:
+            dst_offset = (wrap_surface_value(group) * wrap_surface_value(c_threads) + wrap_surface_value(lane)).value
+            dst_idx = scalar.index_cast(dst_offset).value
+            if scratch_offset:
+                dst_idx = (wrap_surface_value(dst_idx) + wrap_surface_value(c_scratch_off)).value
+            _emit_store(scratch, dst_idx, flag)
+
+    # ── Stage 6: sync + broadcast scratch[scratch_offset + group*threads + tx%scale] ──
+    syncthreads()
+    my_slot = ((wrap_surface_value(group) * wrap_surface_value(c_threads)) +
+               wrap_surface_value(arith.RemUIOp(tx, c_scale).result)).value
+    load_idx = scalar.index_cast(my_slot).value
+    if scratch_offset:
+        load_idx = (wrap_surface_value(load_idx) + wrap_surface_value(c_scratch_off)).value
+    result = _emit_load(None, scratch, load_idx)
+
+    # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+    syncthreads()
+
+    return result
+
+
+__all__ = [
+    "simt_allreduce_sum",
+    "simt_allreduce_max",
+    "simt_allreduce_min",
+]
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index ef55490654..b469dbe6c8 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -143,6 +143,9 @@
     LoopHandle, BranchHandle,
 )
 
+# ── All-reduce ─────────────────────────────────────────────────────────────────
+from ._allreduce import simt_allreduce_sum, simt_allreduce_max, simt_allreduce_min  # noqa: F401
+
 # ── Decorator ─────────────────────────────────────────────────────────────────
 from ._jit import jit, KernelHandle, merge_jit_modules      # noqa: F401
 from ._subkernels import cube, simd, simt     # noqa: F401
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
new file mode 100644
index 0000000000..ce12914b86
--- /dev/null
+++ b/ptodsl/tests/test_allreduce.py
@@ -0,0 +1,685 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from pathlib import Path
+import sys
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "ptodsl"))
+
+from ptodsl import pto
+
+
+def expect(condition: bool, message: str) -> None:
+    if not condition:
+        raise AssertionError(message)
+
+
+def main():
+    from ptodsl._allreduce import simt_allreduce_sum, simt_allreduce_max, simt_allreduce_min
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 0: identity (threads <= scale)
+    # ══════════════════════════════════════════════════════════════════════════
+    expect(
+        simt_allreduce_sum(1.0, threads=1, scale=1) == 1.0,
+        "identity: threads == scale",
+    )
+    expect(
+        simt_allreduce_sum(1.0, threads=2, scale=2) == 1.0,
+        "identity: threads == scale (alt)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # validation errors
+    # ══════════════════════════════════════════════════════════════════════════
+
+    # threads % scale != 0  (validation now runs before identity shortcut)
+    try:
+        simt_allreduce_sum(1.0, threads=3, scale=2)
+        raise AssertionError("expected ValueError for threads % scale != 0")
+    except ValueError:
+        pass
+
+
+    # threads < 1
+    try:
+        simt_allreduce_sum(1.0, threads=0, scale=1)
+        raise AssertionError("expected ValueError for threads < 1")
+    except ValueError:
+        pass
+
+    # validation runs before identity: bad params not bypassed by threads<=scale
+    try:
+        simt_allreduce_sum(1.0, threads=1, scale=2)
+        raise AssertionError("expected ValueError for threads%scale!=0 (before identity)")
+    except ValueError:
+        pass
+
+    # i32 dtype rejected — need a real JIT kernel so we get an MLIR i32 value
+    @pto.jit(target="a5")
+    def kernel_i32(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1, dtype=pto.i32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=1)
+
+    try:
+        kernel_i32.compile()
+        raise AssertionError("expected NotImplementedError for i32")
+    except NotImplementedError:
+        pass
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1a: warp_reduce — hardware redux, groups == 1 (threads=32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=1)
+
+    compiled_warp = kernel_warp.compile()
+    mlir_warp = compiled_warp.mlir_text()
+    expect("pto.redux_add" in mlir_warp,
+           "IR: redux_add in warp_reduce helper")
+    expect("pto.syncthreads" not in mlir_warp,
+           "IR: warp_reduce has no syncthreads")
+    expect("pto.shuffle_bfly" not in mlir_warp,
+           "IR: warp_reduce (groups=1) has no shuffle_bfly")
+    compiled_warp.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1b: warp_reduce — hardware redux, groups > 1 (threads=16, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_t16(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=16, scale=1)
+
+    compiled_warp_t16 = kernel_warp_t16.compile()
+    mlir_warp_t16 = compiled_warp_t16.mlir_text()
+    expect("pto.redux_add" in mlir_warp_t16,
+           "IR: redux_add for groups>1")
+    expect("arith.select" in mlir_warp_t16,
+           "IR: arith.select for group masking")
+    expect("pto.syncthreads" not in mlir_warp_t16,
+           "IR: warp_reduce (groups=2) has no syncthreads")
+    compiled_warp_t16.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1c: warp_reduce — butterfly shuffle (threads=8, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_t8(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=8, scale=1)
+
+    compiled_warp_t8 = kernel_warp_t8.compile()
+    mlir_warp_t8 = compiled_warp_t8.mlir_text()
+    expect("pto.shuffle_bfly" in mlir_warp_t8,
+           "IR: shuffle_bfly for butterfly path")
+    expect("pto.redux_add" not in mlir_warp_t8,
+           "IR: butterfly has no hardware redux")
+    expect("pto.syncthreads" not in mlir_warp_t8,
+           "IR: butterfly has no syncthreads")
+    compiled_warp_t8.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1d: warp_reduce — butterfly with scale > 1 (threads=32, scale=2)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=2)
+
+    compiled_warp_s2 = kernel_warp_s2.compile()
+    mlir_warp_s2 = compiled_warp_s2.mlir_text()
+    expect("pto.shuffle_bfly" in mlir_warp_s2,
+           "IR: shuffle_bfly for butterfly (scale>1)")
+    expect("pto.redux_add" not in mlir_warp_s2,
+           "IR: butterfly (scale>1) has no hardware redux")
+    compiled_warp_s2.verify()
+
+    # ── warp_reduce: sum, f32, t=16, s=1, o=4 (non-zero thread_offset) ────────
+    @pto.jit(target="a5")
+    def kernel_warp_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=16, scale=1, thread_offset=4)
+
+    compiled_warp_o4 = kernel_warp_o4.compile()
+    mlir_warp_o4 = compiled_warp_o4.mlir_text()
+    expect("pto.get_tid_x" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses get_tid_x (not raw get_laneid)")
+    expect("arith.subi" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses subi for tx = tid_x - offset")
+    expect("arith.andi" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses andi to extract lane_in_warp")
+    compiled_warp_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 2: ub_reduce — threads ≤ 32, non-power-of-2 (threads=6, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_ub6(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1)
+
+    compiled_ub6 = kernel_ub6.compile()
+    mlir_ub6 = compiled_ub6.mlir_text()
+    expect("pto.syncthreads" in mlir_ub6,
+           "IR: ub_reduce has syncthreads")
+    expect("pto.store" in mlir_ub6,
+           "IR: ub_reduce has store (write to scratch)")
+    expect("pto.load" in mlir_ub6,
+           "IR: ub_reduce has load (read from scratch)")
+    syncthreads_count = mlir_ub6.count("pto.syncthreads")
+    expect(syncthreads_count == 4,
+           f"IR: ub_reduce has 4 syncthreads, got {syncthreads_count}")
+    compiled_ub6.verify()
+
+    # ── ub_reduce: sum, f32, t=6, s=2 (scale > 1, non-pow2 threads) ─────────
+    @pto.jit(target="a5")
+    def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=2)
+
+    compiled_ub6s2 = kernel_ub6s2.compile()
+    mlir_ub6s2 = compiled_ub6s2.mlir_text()
+    expect("pto.syncthreads" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has syncthreads")
+    expect("pto.store" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has store")
+    expect("pto.load" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has load")
+    expect("scf.for" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has scf.for (sequential reduce loop)")
+    expect("pto.redux_add" not in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has no hardware redux")
+    expect("pto.shuffle_bfly" not in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has no butterfly shuffle")
+    # scale>1 fixes: reducer uses lane < scale (ult), not lane_mod == 0
+    expect("arith.cmpi ult" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 reducer uses ult (lane < scale)")
+    compiled_ub6s2.verify()
+
+    # ── ub_reduce: sum, f32, t=6, s=1, o=4 (non-zero thread_offset) ─────────
+    @pto.jit(target="a5")
+    def kernel_ub_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1,
+                                     thread_offset=4)
+
+    compiled_ub_o4 = kernel_ub_o4.compile()
+    mlir_ub_o4 = compiled_ub_o4.mlir_text()
+    expect("arith.subi" in mlir_ub_o4,
+           "IR: ub_reduce o=4 uses subi for tx = tid_x - offset")
+    compiled_ub_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3a: cross_warp_reduce — sum, f32, t=128, s=1, o=0 (baseline)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled = kernel_128.compile()
+    mlir = compiled.mlir_text()
+
+    expect("pto.simt_entry" in mlir,
+           "IR: helper carries pto.simt_entry")
+
+    for op_name in (
+        "pto.redux_add", "pto.syncthreads", "pto.store", "pto.load",
+        "pto.get_tid_x", "pto.get_laneid", "arith.shrui", "scf.if",
+    ):
+        expect(op_name in mlir, f"IR: expected '{op_name}' in helper body")
+
+    syncthreads_count = mlir.count("pto.syncthreads")
+    expect(syncthreads_count == 3,
+           f"IR: expected 3 syncthreads, got {syncthreads_count}")
+
+    compiled.verify()
+
+    # ── cross_warp: sum, f32, t=64 (2 warps) ────────────────────────────────
+    @pto.jit(target="a5")
+    def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=64, scale=1)
+
+    compiled_64 = kernel_64.compile()
+    mlir_64 = compiled_64.mlir_text()
+    compiled_64.verify()
+
+    # ── cross_warp: sum, f32, t=256 (8 warps) ───────────────────────────────
+    @pto.jit(target="a5")
+    def kernel_256(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=256, scale=1)
+
+    compiled_256 = kernel_256.compile()
+    mlir_256 = compiled_256.mlir_text()
+    compiled_256.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3b: cross_warp_reduce — scale > 1, scale*num_warps ≤ 32
+    #           (threads=128, scale=2, num_warps=4, total=8 ≤ 32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_cw_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=2)
+
+    compiled_cw_s2 = kernel_cw_s2.compile()
+    mlir_cw_s2 = compiled_cw_s2.mlir_text()
+    expect("pto.shuffle_bfly" in mlir_cw_s2,
+           "IR: cross_warp s=2 has shuffle_bfly (butterfly for per-warp + leader)")
+    expect("pto.syncthreads" in mlir_cw_s2,
+           "IR: cross_warp s=2 has syncthreads")
+    # scale > 1: per-warp uses butterfly, not hardware redux
+    compiled_cw_s2.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3c: cross_warp_reduce — scale > 1, scale*num_warps > 32 (manual, sum)
+    #           (threads=128, scale=16, num_warps=4, total=64 > 32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_cw_s16(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=16)
+
+    compiled_cw_s16 = kernel_cw_s16.compile()
+    mlir_cw_s16 = compiled_cw_s16.mlir_text()
+    expect("pto.syncthreads" in mlir_cw_s16,
+           "IR: cross_warp s=16 has syncthreads")
+    compiled_cw_s16.verify()
+
+    # ── cross_warp: sum, f32, t=128, s=1, o=4 (non-zero thread_offset) ─────
+    @pto.jit(target="a5")
+    def kernel_cw_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1,
+                                     thread_offset=4)
+
+    compiled_cw_o4 = kernel_cw_o4.compile()
+    mlir_cw_o4 = compiled_cw_o4.mlir_text()
+    expect("pto.get_tid_x" in mlir_cw_o4,
+           "IR: cross_warp o=4 uses get_tid_x")
+    expect("arith.subi" in mlir_cw_o4,
+           "IR: cross_warp o=4 uses subi for tx = tid_x - offset")
+    compiled_cw_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 4: ub_reduce fallback — threads > 32, non-power-of-2
+    #          (threads=48, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=48, scale=1)
+
+    compiled_ub48 = kernel_ub48.compile()
+    mlir_ub48 = compiled_ub48.mlir_text()
+    expect("pto.syncthreads" in mlir_ub48,
+           "IR: ub_reduce fallback has syncthreads")
+    expect("pto.store" in mlir_ub48,
+           "IR: ub_reduce fallback has store")
+    expect("pto.load" in mlir_ub48,
+           "IR: ub_reduce fallback has load")
+    compiled_ub48.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x1 = pto.const(1.0, dtype=pto.f32)
+            _r1 = pto.simt_allreduce_sum(x1, scratch=ub_scratch, threads=128, scale=1)
+            x2 = pto.const(2.0, dtype=pto.f32)
+            _r2 = pto.simt_allreduce_sum(x2, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled2 = kernel_reuse.compile()
+    mlir2 = compiled2.mlir_text()
+
+    compiled2.verify()
+
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # scratch required for ub_reduce and cross_warp paths
+    # ══════════════════════════════════════════════════════════════════════════
+
+    # cross_warp requires scratch — use a real JIT kernel so the error
+    # originates from _dispatch_allreduce_helper, not from a bare Python float.
+    @pto.jit(target="a5")
+    def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=128, scale=1)
+
+    try:
+        kernel_no_scratch_cw.compile()
+        raise AssertionError("expected ValueError for missing scratch (cross_warp)")
+    except ValueError as e:
+        expect("requires a UB scratch buffer" in str(e),
+               f"error message should mention scratch (cross_warp), got: {e}")
+
+    # ub_reduce (non-pow2) requires scratch
+    @pto.jit(target="a5")
+    def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=6, scale=1)
+
+    try:
+        kernel_no_scratch_ub.compile()
+        raise AssertionError("expected ValueError for missing scratch (ub_reduce)")
+    except ValueError as e:
+        expect("requires a UB scratch buffer" in str(e),
+               f"error message should mention scratch (ub_reduce), got: {e}")
+
+    # scratch must be a pto.ptr type
+    try:
+        simt_allreduce_sum(1.0, scratch="not_a_ptr", threads=6, scale=1)
+        raise AssertionError("expected TypeError for non-ptr scratch")
+    except (TypeError, AttributeError):
+        pass
+
+    # cross_warp: gm scratch (wrong memory space) should be rejected
+    @pto.jit(target="a5")
+    def kernel_gm_scratch(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=scratch_gm, threads=128, scale=1)
+
+    try:
+        kernel_gm_scratch.compile()
+        raise AssertionError("expected TypeError for gm scratch")
+    except TypeError as e:
+        expect("UB" in str(e).upper() or "memory space" in str(e).lower(),
+               f"gm scratch error should mention memory space, got: {e}")
+
+    # cross_warp: i32 scratch with f32 x (dtype mismatch) should be rejected
+    @pto.jit(target="a5")
+    def kernel_dtype_mismatch(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_i32 = pto.castptr(zero_u64, pto.ptr(pto.i32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_i32, threads=128, scale=1)
+
+    try:
+        kernel_dtype_mismatch.compile()
+        raise AssertionError("expected TypeError for dtype mismatch scratch")
+    except TypeError as e:
+        err = str(e)
+        expect("element type" in err.lower() or "mismatch" in err.lower(),
+               f"dtype mismatch should mention element type, got: {e}")
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Max reducer — Path 1a: warp_reduce, hw redux (threads=32, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+    @pto.jit(target="a5")
+    def kernel_max_warp_hw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, threads=32, scale=1)
+
+    compiled_max_warp = kernel_max_warp_hw.compile()
+    mlir_max_warp = compiled_max_warp.mlir_text()
+
+    expect(
+        "pto.redux_max" in mlir_max_warp,
+        "Path 1a (max): IR must contain pto.redux_max",
+    )
+    expect(
+        "pto.syncthreads" not in mlir_max_warp,
+        "Path 1a (max): single-warp hw reduce needs no syncthreads",
+    )
+
+    # ── Max reducer — Path 1c: warp_reduce, butterfly (threads=8, scale=1) ──
+    @pto.jit(target="a5")
+    def kernel_max_butterfly(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, threads=8, scale=1)
+
+    compiled_max_bfly = kernel_max_butterfly.compile()
+    mlir_max_bfly = str(compiled_max_bfly.mlir_text())
+
+    expect(
+        "arith.maximumf" in mlir_max_bfly,
+        "Path 1c (max): butterfly must emit arith.maximumf for element-wise max",
+    )
+    expect(
+        "pto.shuffle_bfly" in mlir_max_bfly,
+        "Path 1c (max): butterfly must use pto.shuffle_bfly",
+    )
+    expect(
+        "pto.redux_max" not in mlir_max_bfly,
+        "Path 1c (max): butterfly path should NOT use hw redux",
+    )
+
+    # ── Max reducer — Path 3: cross_warp_reduce (threads=128, scale=1) ──
+    @pto.jit(target="a5")
+    def kernel_max_cross_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled_max_cw = kernel_max_cross_warp.compile()
+    mlir_max_cw = str(compiled_max_cw.mlir_text())
+
+    expect(
+        "pto.redux_max" in mlir_max_cw,
+        "Path 3 (max): cross-warp IR must contain pto.redux_max",
+    )
+    expect(
+        "pto.syncthreads" in mlir_max_cw,
+        "Path 3 (max): cross-warp needs syncthreads barriers",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Min reducer — Path 1a: warp_reduce, hw redux (threads=32, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+    @pto.jit(target="a5")
+    def kernel_min_warp_hw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_min(x, threads=32, scale=1)
+
+    compiled_min_warp = kernel_min_warp_hw.compile()
+    mlir_min_warp = str(compiled_min_warp.mlir_text())
+
+    expect(
+        "pto.redux_min" in mlir_min_warp,
+        "Path 1a (min): IR must contain pto.redux_min",
+    )
+    expect(
+        "pto.syncthreads" not in mlir_min_warp,
+        "Path 1a (min): single-warp hw reduce needs no syncthreads",
+    )
+
+    # ── Min reducer — Path 4 (ub_reduce fallback): threads=48, non-pow2 ──
+    @pto.jit(target="a5")
+    def kernel_min_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_min(x, scratch=ub_scratch, threads=48, scale=1)
+
+    compiled_min_ub = kernel_min_ub.compile()
+    mlir_min_ub = str(compiled_min_ub.mlir_text())
+
+    expect(
+        "arith.minimumf" in mlir_min_ub,
+        "Path 4 (min): ub_reduce fallback must emit arith.minimumf",
+    )
+
+    # ── Identity smoke tests for max/min ───────────────────────────────────
+    expect(
+        simt_allreduce_max(1.0, threads=1, scale=1) == 1.0,
+        "Path 0 (max): threads <= scale returns identity (value unchanged)",
+    )
+    expect(
+        simt_allreduce_min(1.0, threads=2, scale=2) == 1.0,
+        "Path 0 (min): threads <= scale returns identity (value unchanged)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Lowering verification — ptoas → bisheng (full AOT compilation)
+    #
+    # Tests that the allreduce MLIR survives the complete ptoas pipeline:
+    #   MLIR (PTO dialect) → VPTO passes → LLVM IR → bisheng device codegen
+    #
+    # KNOWN TOOLCHAIN ISSUES (bisheng, not allreduce):
+    #   a) bisheng stack-smashing on SIMT code that stores to GM
+    #   b) bisheng stack-smashing on cross-warp scratch-buffer code (≥ 128 lanes)
+    #
+    # These are bisheng bugs — ptoas VPTO lowering succeeds; the crash is
+    # in the device LLVM→object step inside bisheng.  Verified by:
+    #   ptoas --emit-vpto-llvm-ir  → valid LLVM IR (no crash)
+    #   ptoas -o kernel.o          → bisheng crash during LLVM→object
+    # ══════════════════════════════════════════════════════════════════════════
+
+    import subprocess
+    import tempfile
+    from pathlib import Path
+
+    def _ptoas_binary() -> Path:
+        for p in [
+            Path(__file__).resolve().parents[2] / "build" / "tools" / "ptoas" / "ptoas",
+        ]:
+            if p.is_file():
+                return p
+        raise RuntimeError(
+            "ptoas binary not found; run `source scripts/ptoas_env.sh` or build ptoas"
+        )
+
+    def _lower_and_check(compiled, case_label: str, expect_pass: bool = True) -> bool:
+        """Run ``ptoas`` lowering on *compiled* MLIR.  Returns True on success."""
+        ptoas = _ptoas_binary()
+        mlir_text = compiled.mlir_text()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mlir_path = Path(tmpdir) / "kernel.mlir"
+            obj_path = Path(tmpdir) / "kernel.o"
+            mlir_path.write_text(mlir_text)
+            result = subprocess.run(
+                [str(ptoas), "--pto-arch=a5", "--pto-backend=vpto",
+                 "--enable-tile-op-expand",
+                 str(mlir_path), "-o", str(obj_path)],
+                capture_output=True, text=True,
+            )
+            ok = result.returncode == 0 and obj_path.is_file()
+            if ok:
+                return True
+            bisheng_crash = "stack smashing" in result.stderr or "exit code 134" in result.stderr
+            tag = "SKIP (bisheng bug)" if bisheng_crash else "FAIL"
+            if expect_pass and not bisheng_crash:
+                # Unexpected failure — report loudly
+                sys.stderr.write(
+                    f"\n  [{tag}] {case_label} (exit={result.returncode})\n"
+                    f"  STDERR: {result.stderr[:500]}\n"
+                )
+            else:
+                print(f"  [{tag}] {case_label}")
+            return False
+
+    # ── Warp-reduce (≤ 32 lanes, NO scratch, NO GM store) ──
+    # These are the simplest kernels — they only compute a value and return
+    # from the SIMT body without writing to GM.  They MUST lower cleanly
+    # because they avoid both known bisheng issues.
+    expect(
+        _lower_and_check(kernel_warp.compile(), "warp_sum_t32"),
+        "lowering: warp_sum (32 lanes, hw redux, no GM store) must pass",
+    )
+    expect(
+        _lower_and_check(kernel_max_warp_hw.compile(), "warp_max_t32"),
+        "lowering: warp_max (32 lanes, hw redux, no GM store) must pass",
+    )
+    expect(
+        _lower_and_check(kernel_min_warp_hw.compile(), "warp_min_t32"),
+        "lowering: warp_min (32 lanes, hw redux, no GM store) must pass",
+    )
+
+    # ── Cross-warp (128 lanes, UB scratch) — known bisheng crash ──
+    # ptoas VPTO lowering succeeds; bisheng crashes on the device LLVM IR.
+    @pto.jit(target="a5")
+    def _kernel_cross_lowering(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+    _lower_and_check(_kernel_cross_lowering.compile(), "cross_sum_t128", expect_pass=False)
+
+    print("ptodsl_allreduce: PASS")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
new file mode 100644
index 0000000000..2dca5fb330
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
new file mode 100644
index 0000000000..e9b2842bf5
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0xFF800000 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_max %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_max %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
new file mode 100644
index 0000000000..2dca5fb330
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
new file mode 100644
index 0000000000..f9353c1b71
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0x7F800000 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_min %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_min %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
new file mode 100644
index 0000000000..ce2b4c57da
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 128.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
new file mode 100644
index 0000000000..53f01c524f
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_add %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_add %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
new file mode 100644
index 0000000000..76a227fb75
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
new file mode 100644
index 0000000000..e22af0898e
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0xFF800000 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_max %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
new file mode 100644
index 0000000000..76a227fb75
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
new file mode 100644
index 0000000000..d921ef1fe3
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0x7F800000 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_min %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
new file mode 100644
index 0000000000..4a5fb7b63d
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 32.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
new file mode 100644
index 0000000000..18e1374384
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_add %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}