From 9fd60765436d1d8ee441f9d831be260f46913577 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 14:42:11 +0800
Subject: [PATCH 01/37] Add PTODSL alloc_buffer surface

---
 .../user_guide/04-type-system-and-buffer.md   |  21 ++-
 ptodsl/ptodsl/_bootstrap.py                   |   6 +
 ptodsl/ptodsl/_ops.py                         | 142 +++++++++++++++++-
 ptodsl/ptodsl/_surface_values.py              |  41 +++++
 ptodsl/ptodsl/_tracing/session.py             |  40 ++++-
 ptodsl/ptodsl/pto.py                          |   2 +-
 ptodsl/tests/test_jit_compile.py              |  99 ++++++++++++
 7 files changed, 345 insertions(+), 6 deletions(-)

diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index 14261c2ab5..95842eeed7 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -171,7 +171,22 @@ ptr_ub  = pto.ptr(pto.f16, pto.MemorySpace.UB)
 | `MemorySpace.ACC` | Cube L0C accumulator buffer |
 | `MemorySpace.BIAS` | Cube bias table buffer |
 
-## 4.5 TensorView
+## 4.5 Explicit scratch buffers
+
+Use `pto.alloc_buffer(...)` in explicit-mode kernels to allocate scratch storage that is addressed through pointer-style operations:
+
+```python
+ub_scratch = pto.alloc_buffer((4096,), pto.f32, scope="ub")
+fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
+```
+
+`scope="ub"` reserves space in the function-level Unified Buffer scratch area and returns a typed UB pointer. The allocation contributes to the kernel's dynamic shared-memory size and can be passed to explicit data-movement helpers such as `pto.mte_gm_ub(...)` and `pto.mte_ub_gm(...)`.
+
+`scope="local"` creates SIMT-local fragment storage for use by lower-level load/store surfaces. It is intended for per-workitem arrays such as `x_frag[]` and `w_frag[]`. The `persistent` flag is accepted as lifetime metadata for callers that need to distinguish reusable fragment storage from ordinary temporary scratch.
+
+Shapes must be static positive integers so the frontend can compute storage size and layout while tracing.
+
+## 4.6 TensorView
 
 `TensorView` is a descriptor for a tensor in Global Memory. Create one inside a `@pto.jit` body with `make_tensor_view`:
 
@@ -201,7 +216,7 @@ def kernel(
 
 Strides support non-contiguous tensors. Pass `strides=A.strides` from the source tensor for the default row-major layout, or supply explicit strides for sub-views. Use `tv.as_ptr()` to obtain a typed GM pointer for use with MTE Ops in explicit-mode orchestration.
 
-## 4.6 PartitionTensorView
+## 4.7 PartitionTensorView
 
 `partition_view` creates a sub-view of a TensorView at a given offset and size. It describes *which part* of the GM tensor a `tile.load` or `tile.store` should operate on:
 
@@ -212,7 +227,7 @@ part = pto.partition_view(tv, offsets=[row_offset, 0], sizes=[BLOCK, dim])
 
 The result is a `PartitionTensorView` — a lightweight descriptor, not a data buffer. It carries the partition's shape, strides, and element type (inherited from the source TensorView). Use `part.as_ptr()` to obtain a typed GM pointer for MTE Ops in explicit-mode orchestration.
 
-## 4.7 Tile
+## 4.8 Tile
 
 A `Tile` is an on-chip buffer allocated in UB or cube-local memory. Allocate tiles with `alloc_tile`:
 
diff --git a/ptodsl/ptodsl/_bootstrap.py b/ptodsl/ptodsl/_bootstrap.py
index 958494fd04..639f78326b 100644
--- a/ptodsl/ptodsl/_bootstrap.py
+++ b/ptodsl/ptodsl/_bootstrap.py
@@ -61,6 +61,10 @@ def _bootstrap_python_paths() -> None:
 _bootstrap_python_paths()
 
 from mlir.dialects import pto as _pto_dialect  # noqa: E402
+try:
+    from mlir.dialects import llvm as _llvm_dialect  # noqa: E402
+except Exception:  # pragma: no cover - depends on the installed MLIR package.
+    _llvm_dialect = None
 from mlir.ir import Context, Location           # noqa: E402
 
 
@@ -68,6 +72,8 @@ def make_context() -> Context:
     """Create a fresh MLIR Context with the PTO dialect loaded."""
     ctx = Context()
     _pto_dialect.register_dialect(ctx, load=True)
+    if _llvm_dialect is not None and hasattr(_llvm_dialect, "register_dialect"):
+        _llvm_dialect.register_dialect(ctx, load=True)
     return ctx
 
 
diff --git a/ptodsl/ptodsl/_ops.py b/ptodsl/ptodsl/_ops.py
index 0cb8dca177..b85ef414c5 100644
--- a/ptodsl/ptodsl/_ops.py
+++ b/ptodsl/ptodsl/_ops.py
@@ -34,6 +34,7 @@
 from ._scalar_coercion import coerce_scalar_to_type, materialize_scalar_literal
 from ._runtime_scalar_ops import classify_runtime_scalar_type, emit_runtime_binary_op
 from ._surface_values import (
+    AllocatedBufferValue,
     MaskResultValue,
     PartitionTensorViewValue,
     TensorViewValue,
@@ -59,6 +60,7 @@
     mask_type,
     part_tensor_view_type,
     part_tensor_view_type_from_dims,
+    ptr,
     tensor_view_type,
     tensor_view_type_from_dims,
     vreg_type,
@@ -76,7 +78,9 @@
     IndexType,
     IntegerType,
     MemRefType,
+    Operation,
     Type,
+    TypeAttr,
 )
 
 # Pipe name shorthands → canonical PIPE_* names
@@ -1729,6 +1733,142 @@ def _tile_transfer_partition(tv, tile, *, offsets=None, sizes=None, context: str
     return partition_view(tv, offsets=normalized_offsets, sizes=normalized_sizes)
 
 
+def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
+    """
+    Allocate explicit scratch storage and return an address-like surface value.
+
+    ``scope="ub"`` reserves a byte range in the function-level UB scratch area
+    and returns a typed PTO pointer. ``scope="local"`` emits an LLVM stack
+    allocation for SIMT lane-local fragment storage. Access lowering for local
+    buffers is intentionally left to the scalar/vector load-store surfaces.
+    """
+    _require_explicit_mode("pto.alloc_buffer(...)")
+    normalized_scope = _normalize_alloc_buffer_scope(scope)
+    element_type = _resolve(dtype)
+    element_count = _static_alloc_buffer_element_count(shape)
+    elem_bytes = _element_bytewidth(element_type)
+    byte_size = element_count * elem_bytes
+
+    if normalized_scope == "ub":
+        return _alloc_ub_buffer(
+            shape,
+            dtype,
+            element_type,
+            element_count,
+            byte_size,
+            persistent=persistent,
+        )
+    if normalized_scope == "local":
+        return _alloc_local_buffer(
+            shape,
+            dtype,
+            element_type,
+            element_count,
+            byte_size,
+            persistent=persistent,
+        )
+    raise AssertionError(f"unhandled alloc_buffer scope {normalized_scope!r}")
+
+
+def _normalize_alloc_buffer_scope(scope):
+    if not isinstance(scope, str):
+        try:
+            space = _normalize_address_space(scope)
+        except Exception:
+            space = None
+        if space == _pto.AddressSpace.VEC:
+            return "ub"
+        raise TypeError("pto.alloc_buffer(..., scope=...) expects 'ub' or 'local'")
+    normalized = scope.strip().lower()
+    if normalized in {"ub", "vec"}:
+        return "ub"
+    if normalized in {"local", "private"}:
+        return "local"
+    raise ValueError("pto.alloc_buffer(..., scope=...) expects one of 'ub' or 'local'")
+
+
+def _static_alloc_buffer_element_count(shape):
+    if isinstance(shape, int):
+        dims = (shape,)
+    elif isinstance(shape, (list, tuple)):
+        dims = tuple(shape)
+    else:
+        raise TypeError("pto.alloc_buffer(shape, ...) expects an int or a tuple/list of static dimensions")
+    if not dims:
+        raise ValueError("pto.alloc_buffer(shape, ...) expects at least one dimension")
+    count = 1
+    for dim in dims:
+        raw_dim = unwrap_surface_value(dim)
+        if isinstance(raw_dim, bool):
+            raise TypeError("pto.alloc_buffer(shape, ...) does not accept bool dimensions")
+        if not isinstance(raw_dim, int):
+            raise TypeError(
+                "pto.alloc_buffer(shape, ...) requires static integer dimensions; "
+                f"got {getattr(raw_dim, 'type', type(raw_dim).__name__)}"
+            )
+        if raw_dim <= 0:
+            raise ValueError(f"pto.alloc_buffer(shape, ...) dimensions must be positive, got {raw_dim}")
+        count *= raw_dim
+    return count
+
+
+def _alloc_ub_buffer(shape, dtype, element_type, element_count, byte_size, *, persistent):
+    from ._tracing.active import current_session
+
+    session = current_session()
+    if session is None:
+        raise RuntimeError("pto.alloc_buffer(scope='ub') may only be used while tracing a PTODSL kernel")
+
+    byte_offset = session.allocate_ub_scratch(byte_size, alignment=32)
+    ub_base_i8 = wrap_surface_value(session.get_or_create_ub_base_i8_ptr())
+    if byte_offset:
+        ptr_i8_value = addptr(ub_base_i8, arith.ConstantOp(IndexType.get(), byte_offset).result)
+    else:
+        ptr_i8_value = ub_base_i8
+    ptr_value = castptr(ptr_i8_value, ptr(element_type, "ub"))
+    return AllocatedBufferValue(
+        unwrap_surface_value(ptr_value),
+        scope="ub",
+        shape=_normalize_alloc_buffer_shape_metadata(shape),
+        dtype=dtype,
+        element_type=element_type,
+        element_count=element_count,
+        byte_size=byte_size,
+        byte_offset=byte_offset,
+        persistent=persistent,
+    )
+
+
+def _alloc_local_buffer(shape, dtype, element_type, element_count, byte_size, *, persistent):
+    i32 = IntegerType.get_signless(32)
+    count = _materialize_integer_literal(i32, element_count)
+    llvm_ptr_type = Type.parse("!llvm.ptr")
+    alloca = Operation.create(
+        "llvm.alloca",
+        results=[llvm_ptr_type],
+        operands=[count],
+        attributes={
+            "elem_type": TypeAttr.get(element_type),
+        },
+    ).results[0]
+    return AllocatedBufferValue(
+        alloca,
+        scope="local",
+        shape=_normalize_alloc_buffer_shape_metadata(shape),
+        dtype=dtype,
+        element_type=element_type,
+        element_count=element_count,
+        byte_size=byte_size,
+        persistent=persistent,
+    )
+
+
+def _normalize_alloc_buffer_shape_metadata(shape):
+    if isinstance(shape, int):
+        return (shape,)
+    return tuple(unwrap_surface_value(dim) for dim in shape)
+
+
 def alloc_tile(
     tile_type=None,
     *,
@@ -4063,7 +4203,7 @@ def import_reserved_buffer(name, *, peer_func):
     "vaxpy", "vaddrelu", "vsubrelu",
     "vsel",
     "make_tensor_view", "partition_view",
-    "alloc_tile",
+    "alloc_buffer", "alloc_tile",
     "tload", "tstore", "tmov",
     "tadd", "tsub", "tmul", "tdiv", "tmax", "tmin",
     "tadds", "tsubs", "tmuls", "tdivs", "tmaxs", "tmins",
diff --git a/ptodsl/ptodsl/_surface_values.py b/ptodsl/ptodsl/_surface_values.py
index e8a7541b9f..d511c2c8a4 100644
--- a/ptodsl/ptodsl/_surface_values.py
+++ b/ptodsl/ptodsl/_surface_values.py
@@ -280,6 +280,46 @@ def __radd__(self, offset):
         return AddressOffsetValue(self, offset)
 
 
+class AllocatedBufferValue(AddressValue):
+    """Address returned by ``pto.alloc_buffer`` with allocation metadata."""
+
+    def __init__(
+        self,
+        value,
+        *,
+        scope,
+        shape,
+        dtype,
+        element_type,
+        element_count,
+        byte_size,
+        byte_offset=None,
+        persistent=False,
+    ):
+        super().__init__(value)
+        self.scope = scope
+        self.shape = tuple(shape)
+        self.dtype = dtype
+        self.element_type = element_type
+        self.element_count = element_count
+        self.byte_size = byte_size
+        self.byte_offset = byte_offset
+        self.persistent = bool(persistent)
+
+    @property
+    def surface_metadata(self):
+        return {
+            "scope": self.scope,
+            "shape": self.shape,
+            "dtype": self.dtype,
+            "element_type": self.element_type,
+            "element_count": self.element_count,
+            "byte_size": self.byte_size,
+            "byte_offset": self.byte_offset,
+            "persistent": self.persistent,
+        }
+
+
 @dataclass(frozen=True)
 class AddressOffsetValue:
     """Address view plus an element offset, used by scalar.load/store sugar."""
@@ -961,6 +1001,7 @@ def _coerce_index_value(value):
 
 
 __all__ = [
+    "AllocatedBufferValue",
     "AddressOffsetValue",
     "AddressValue",
     "MaskResultValue",
diff --git a/ptodsl/ptodsl/_tracing/session.py b/ptodsl/ptodsl/_tracing/session.py
index 272dd10038..705d9eccf8 100644
--- a/ptodsl/ptodsl/_tracing/session.py
+++ b/ptodsl/ptodsl/_tracing/session.py
@@ -21,7 +21,7 @@
 
 from mlir.dialects import arith, func
 from mlir.dialects import pto as _pto
-from mlir.ir import InsertionPoint, IntegerType, UnitAttr
+from mlir.ir import InsertionPoint, IntegerAttr, IntegerType, UnitAttr
 
 
 @dataclass(frozen=True)
@@ -56,6 +56,8 @@ def __init__(self, module_spec, module, entry_function):
         self._helpers: dict[str, object] = {}
         self._subkernel_stack: list[SubkernelTraceFrame] = []
         self._carry_loop_stack = []
+        self._ub_base_i8_ptr = None
+        self._ub_scratch_next_byte = 0
 
     @property
     def current_function(self):
@@ -81,6 +83,32 @@ def bind_entry_block(self, entry_block) -> None:
         """Record the root entry block for the active trace."""
         self.entry_block = entry_block
 
+    @property
+    def ub_scratch_size(self) -> int:
+        return self._ub_scratch_next_byte
+
+    def get_or_create_ub_base_i8_ptr(self):
+        """Return the shared UB byte-base pointer for explicit scratch buffers."""
+        if self._ub_base_i8_ptr is not None:
+            return self._ub_base_i8_ptr
+        from .._ops import castptr
+        from .._types import int8, ptr
+
+        i64 = IntegerType.get_signless(64)
+        zero = arith.ConstantOp(i64, 0).result
+        self._ub_base_i8_ptr = castptr(zero, ptr(int8, "ub")).value
+        return self._ub_base_i8_ptr
+
+    def allocate_ub_scratch(self, byte_size: int, *, alignment: int = 32) -> int:
+        """Reserve one aligned byte range in the function-level UB scratch area."""
+        if not isinstance(byte_size, int) or byte_size <= 0:
+            raise ValueError(f"UB scratch allocation expects a positive byte size, got {byte_size!r}")
+        if not isinstance(alignment, int) or alignment <= 0:
+            raise ValueError(f"UB scratch allocation expects a positive alignment, got {alignment!r}")
+        offset = _align_up(self._ub_scratch_next_byte, alignment)
+        self._ub_scratch_next_byte = offset + byte_size
+        return offset
+
     @contextmanager
     def enter_function(self, ir_fn):
         """Push *ir_fn* as the current active function in this session."""
@@ -216,6 +244,16 @@ def validate_final_state(self) -> None:
             raise RuntimeError("PTODSL trace-session exited with an open subkernel lowering frame")
         if self._carry_loop_stack:
             raise RuntimeError("PTODSL trace-session exited with an open loop-carry lowering frame")
+        if self._ub_scratch_next_byte:
+            i64 = IntegerType.get_signless(64)
+            self.entry_function.attributes["dyn_shared_memory_buf"] = IntegerAttr.get(
+                i64,
+                _align_up(self._ub_scratch_next_byte, 32),
+            )
+
+
+def _align_up(value: int, alignment: int) -> int:
+    return ((value + alignment - 1) // alignment) * alignment
 
 
 __all__ = [
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index aba8383180..e32636f911 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -97,7 +97,7 @@
     vaxpy, vaddrelu, vsubrelu,
     vsel,
     make_tensor_view, partition_view,
-    alloc_tile,
+    alloc_buffer, alloc_tile,
     mte_load, mte_store, mte_gm_ub, mte_ub_gm, mte_ub_ub, mte_ub_l1,
     mte_gm_l1, mte_l1_ub, mte_gm_l1_frac, mte_l1_bt, mte_l1_fb, mem_bar,
     mte_l1_l0a, mte_l1_l0b, mte_l1_l0a_mx, mte_l1_l0b_mx,
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index abec06a1dc..3258d3f8a0 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -328,6 +328,59 @@ def simt_helper_lowering_probe(*, TRACE_TOKEN: pto.constexpr = 0):
     simt_tid_probe()
 
 
+@pto.jit(target="a5", mode="explicit")
+def alloc_buffer_ub_probe(
+    A_ptr: pto.ptr(pto.f32, "gm"),
+    O_ptr: pto.ptr(pto.f32, "gm"),
+):
+    scratch = pto.alloc_buffer((64,), pto.f32, scope="ub")
+    pto.mte_gm_ub(A_ptr, scratch, 0, 256, nburst=(1, 0, 0))
+    pto.mte_ub_gm(scratch, O_ptr, 256, nburst=(1, 0, 0))
+
+
+@pto.simt
+def alloc_buffer_local_helper():
+    _ = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
+
+
+@pto.jit(target="a5", mode="explicit")
+def alloc_buffer_local_probe():
+    alloc_buffer_local_helper()
+
+
+@pto.simt
+def rmsnorm_alloc_buffer_frag_helper(
+    w_ub: pto.ptr(pto.f32, pto.MemorySpace.UB),
+    x_ub: pto.ptr(pto.f32, pto.MemorySpace.UB),
+):
+    _ = pto.get_tid_x()
+    _ = w_ub
+    _ = x_ub
+    _ = pto.alloc_buffer((32,), pto.f32, scope="local")
+    _ = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
+
+
+@pto.jit(target="a5", mode="explicit")
+def rmsnorm_alloc_buffer_layout_probe(
+    X: pto.ptr(pto.f32, "gm"),
+    W: pto.ptr(pto.f32, "gm"),
+    Y: pto.ptr(pto.f32, "gm"),
+    RSTD: pto.ptr(pto.f32, "gm"),
+):
+    w_ub = pto.alloc_buffer((4096,), pto.f32, scope="ub")
+    x_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
+    rstd_ub = pto.alloc_buffer((16,), pto.f32, scope="ub")
+    y_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
+    reduce_scratch = pto.alloc_buffer((128,), pto.f32, scope="ub")
+
+    pto.mte_gm_ub(W, w_ub, 0, 4096 * 4, nburst=(1, 0, 0))
+    pto.mte_gm_ub(X, x_ub, 0, 4096 * 4, nburst=(1, 0, 0))
+    rmsnorm_alloc_buffer_frag_helper(w_ub, x_ub)
+    pto.mte_ub_gm(y_ub, Y, 4096 * 4, nburst=(1, 0, 0))
+    pto.mte_ub_gm(rstd_ub, RSTD, 4, nburst=(1, 0, 0))
+    _ = reduce_scratch
+
+
 @pto.jit(target="a5")
 def ast_subkernel_runtime_for_probe(rows: pto.i32):
     ast_subkernel_runtime_for_helper(rows)
@@ -2173,6 +2226,52 @@ def main() -> None:
     expect("pto.get_tid_y" in simt_text, "SIMT helper body should contain pto.get_tid_y")
     expect("pto.get_tid_z" in simt_text, "SIMT helper body should contain pto.get_tid_z")
 
+    alloc_buffer_ub_text = alloc_buffer_ub_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(alloc_buffer_ub_text, "alloc_buffer UB specialization")
+    expect(
+        "dyn_shared_memory_buf = 256 : i64" in alloc_buffer_ub_text,
+        "alloc_buffer(scope='ub') should size the function-level UB scratch area",
+    )
+    expect(
+        "pto.castptr %c0_i64" in alloc_buffer_ub_text and "!pto.ptr<i8, ub>" in alloc_buffer_ub_text,
+        "alloc_buffer(scope='ub') should materialize a shared UB byte-base pointer",
+    )
+    expect(
+        "pto.mte_gm_ub" in alloc_buffer_ub_text and "pto.mte_ub_gm" in alloc_buffer_ub_text,
+        "alloc_buffer(scope='ub') result should be accepted by explicit MTE helpers",
+    )
+
+    alloc_buffer_local_text = alloc_buffer_local_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(alloc_buffer_local_text, "alloc_buffer local specialization")
+    expect(
+        "llvm.alloca" in alloc_buffer_local_text and "x f32" in alloc_buffer_local_text,
+        "alloc_buffer(scope='local') should lower to an LLVM stack allocation in the SIMT helper",
+    )
+    expect(
+        "func.func @alloc_buffer_local_helper() attributes {pto.simt_entry}" in alloc_buffer_local_text,
+        "alloc_buffer(scope='local') probe should keep allocation inside the SIMT helper body",
+    )
+
+    rmsnorm_alloc_buffer_text = rmsnorm_alloc_buffer_layout_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm alloc_buffer layout specialization")
+    expect(
+        "dyn_shared_memory_buf = 82496 : i64" in rmsnorm_alloc_buffer_text,
+        "RMSNorm alloc_buffer layout should reserve the same UB scratch size as the expanded RMSNorm kernel",
+    )
+    for expected_offset in (16384, 49152, 49216, 81984):
+        expect(
+            f"arith.constant {expected_offset} : index" in rmsnorm_alloc_buffer_text,
+            f"RMSNorm alloc_buffer layout should materialize UB byte offset {expected_offset}",
+        )
+    expect(
+        rmsnorm_alloc_buffer_text.count("llvm.alloca") == 2,
+        "RMSNorm alloc_buffer fragment helper should allocate x_frag and persistent w_frag locally",
+    )
+    expect(
+        "call @rmsnorm_alloc_buffer_frag_helper" in rmsnorm_alloc_buffer_text,
+        "RMSNorm alloc_buffer layout should pass UB scratch pointers through the existing SIMT helper call path",
+    )
+
     ast_subkernel_runtime_for_text = ast_subkernel_runtime_for_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(
         ast_subkernel_runtime_for_text,

From b8456d67a7e7b0a0d6124882165e9e56246036a7 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 15:02:41 +0800
Subject: [PATCH 02/37] Add PTODSL contiguous scalar vector access

---
 .../user_guide/06-scalar-and-pointer-ops.md   |  43 ++++++-
 ptodsl/ptodsl/_builtin_vector.py              |  49 ++++++++
 ptodsl/ptodsl/_runtime_scalar_ops.py          |   6 +-
 ptodsl/ptodsl/_surface_values.py              |  43 ++++++-
 ptodsl/ptodsl/_types.py                       |  37 +++++-
 ptodsl/ptodsl/pto.py                          |   3 +-
 ptodsl/ptodsl/scalar.py                       | 117 +++++++++++++++++-
 ptodsl/tests/test_jit_compile.py              |  48 +++++++
 8 files changed, 329 insertions(+), 17 deletions(-)
 create mode 100644 ptodsl/ptodsl/_builtin_vector.py

diff --git a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
index 53b9da34b0..5a055bcd5f 100644
--- a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
+++ b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
@@ -35,11 +35,11 @@ When in doubt, ask: *can this value change between launches of the same compiled
 
 ## 6.2 Scalar access: load and store
 
-`scalar.load` reads a single scalar element from a typed pointer or tile location. `scalar.store` writes a scalar back. These are the canonical scalar memory ops for SIMT authoring. The offset is counted in elements, not bytes.
+`scalar.load` reads one scalar element from a typed pointer or tile location. With `contiguous=N`, it reads `N` adjacent elements as a builtin MLIR vector value. `scalar.store` writes either a scalar or one of those builtin vector values back. These are the canonical memory ops for SIMT authoring. Offsets are counted in elements, not bytes.
 
-#### `scalar.load(ptr: PtrType, offset: Index) -> ScalarType`
+#### `scalar.load(ptr: PtrType, offset: Index, *, contiguous: int | None = None) -> ScalarType | VecValue`
 
-**Description**: Loads one scalar element from a typed pointer at the given element offset.
+**Description**: Loads one scalar element from a typed pointer at the given element offset, or `contiguous` adjacent elements as `vector<NxT>`.
 
 **Parameters**:
 
@@ -47,12 +47,14 @@ When in doubt, ask: *can this value change between launches of the same compiled
 |-----------|------|-------------|
 | `ptr` | `PtrType` | Typed pointer (`pto.ptr<T, space>`) or the result of `tile.as_ptr()` |
 | `offset` | `Index` | Element displacement from `ptr` |
+| `contiguous` | `int` or `None` | `None` and `1` load one scalar; `N > 1` loads `N` adjacent elements |
 
 **Returns**:
 
 | Return Value | Type | Description |
 |--------------|------|-------------|
 | `value` | `ScalarType` | The loaded scalar, matching the pointer's element type |
+| `value` | `pto.vec(T, N)` | Returned when `contiguous=N > 1`; lowers as builtin `vector<NxT>` |
 
 **Tile-index form** — the preferred syntax when loading from a tile:
 
@@ -71,19 +73,28 @@ val = scalar.load(ptr, offset)       # explicit offset
 val = scalar.load(ptr + offset)      # pointer arithmetic shorthand
 ```
 
+**Contiguous vector form**:
+
+```python
+x4 = scalar.load(ptr, offset, contiguous=4)
+```
+
+For a `pto.ptr(pto.f32, "ub")`, this produces a value with DSL type `pto.vec(pto.f32, 4)` and MLIR type `vector<4xf32>`. The frontend lowers this directly to low-level pointer arithmetic plus an LLVM vector load; it does not introduce a new PTO semantic op.
+
 ---
 
-#### `scalar.store(value: ScalarType, ptr: PtrType, offset: Index) -> None`
+#### `scalar.store(value: ScalarType | VecValue, ptr: PtrType, offset: Index, *, contiguous: int | None = None) -> None`
 
-**Description**: Stores one scalar element to a typed pointer at the given element offset.
+**Description**: Stores one scalar element or a builtin vector value to a typed pointer at the given element offset.
 
 **Parameters**:
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `value` | `ScalarType` | Scalar value to write |
+| `value` | `ScalarType` or `pto.vec(T, N)` | Scalar value or contiguous vector value to write |
 | `ptr` | `PtrType` | Typed destination pointer |
 | `offset` | `Index` | Element displacement from `ptr` |
+| `contiguous` | `int` or `None` | Optional width check for vector stores; if provided, it must match the vector lane count |
 
 **Returns**: None (side-effect operation).
 
@@ -101,6 +112,26 @@ scalar.store(value, tile[row, col])
 scalar.store(value, ptr, offset)
 ```
 
+**Contiguous vector form**:
+
+```python
+scalar.store(x4, ptr, offset)
+scalar.store(x4, ptr, offset, contiguous=4)  # optional width check
+```
+
+Vector stores lower directly to an LLVM vector store. Scalar stores remain scalar stores; `scalar.store(scalar_value, ptr, offset, contiguous=N)` is rejected because scalar values are not implicitly broadcast for stores.
+
+#### `pto.vec(dtype, lanes, *, init=None)`
+
+`pto.vec(dtype, lanes)` names a builtin vector type such as `vector<4xf32>`. When `init` is provided, it constructs a vector value. A scalar initializer is broadcast to every lane:
+
+```python
+rstd4 = pto.vec(pto.f32, 4, init=rstd)
+y4 = x4 * rstd4
+```
+
+The initial vector arithmetic surface is intentionally narrow: multiplication of compatible `VecValue` operands lowers to elementwise `arith.mulf` on builtin vector types.
+
 ---
 
 ### Typical SIMT usage
diff --git a/ptodsl/ptodsl/_builtin_vector.py b/ptodsl/ptodsl/_builtin_vector.py
new file mode 100644
index 0000000000..decc8be750
--- /dev/null
+++ b/ptodsl/ptodsl/_builtin_vector.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+"""Builtin MLIR vector helpers for PTODSL scalar-contiguous access."""
+
+from ._bootstrap import make_context  # ensure MLIR is on sys.path  # noqa: F401
+from ._scalar_coercion import coerce_scalar_to_type
+from ._surface_values import VecValue, unwrap_surface_value
+from ._types import _resolve, _validate_vec_lanes, vec_type
+
+from mlir.dialects import arith
+from mlir.dialects import llvm
+from mlir.ir import IntegerType, VectorType
+
+
+def vec(dtype, lanes: int, *, init=None):
+    """Create a builtin vector type descriptor or broadcast vector value."""
+    lanes = _validate_vec_lanes(lanes, context="pto.vec(...)")
+    descriptor = vec_type(dtype, lanes)
+    if init is None:
+        return descriptor
+    return _broadcast_vec_value(descriptor, init)
+
+
+def _broadcast_vec_value(descriptor, init):
+    vector_type = _resolve(descriptor)
+    element_type = VectorType(vector_type).element_type
+    raw_init = unwrap_surface_value(init)
+
+    if hasattr(raw_init, "type") and VectorType.isinstance(raw_init.type):
+        vec_value = VecValue(raw_init)
+        if vec_value.type != vector_type:
+            raise TypeError(f"pto.vec(..., init=vector) expected {vector_type}, got {vec_value.type}")
+        return vec_value
+
+    scalar_value = coerce_scalar_to_type(init, element_type, context="pto.vec(..., init=...)")
+    current = llvm.UndefOp(vector_type).res
+    i32 = IntegerType.get_signless(32)
+    for lane in range(descriptor.lanes):
+        lane_index = arith.ConstantOp(i32, lane).result
+        current = llvm.InsertElementOp(current, scalar_value, lane_index).res
+    return VecValue(current)
+
+
+__all__ = ["vec"]
diff --git a/ptodsl/ptodsl/_runtime_scalar_ops.py b/ptodsl/ptodsl/_runtime_scalar_ops.py
index 7f2e3bbf3c..8a6b4a1ab4 100644
--- a/ptodsl/ptodsl/_runtime_scalar_ops.py
+++ b/ptodsl/ptodsl/_runtime_scalar_ops.py
@@ -18,7 +18,7 @@
 )
 
 from mlir.dialects import arith, math
-from mlir.ir import BF16Type, F16Type, F32Type, FloatAttr, IndexType, IntegerType
+from mlir.ir import BF16Type, F16Type, F32Type, FloatAttr, IndexType, IntegerType, VectorType
 
 
 _FLOAT_BINARY_OPS = {
@@ -188,6 +188,10 @@ def classify_runtime_scalar_type(type_obj):
         return "integer"
     if any(cls.isinstance(type_obj) for cls in (BF16Type, F16Type, F32Type)):
         return "float"
+    if VectorType.isinstance(type_obj):
+        elem_type = VectorType(type_obj).element_type
+        if any(cls.isinstance(elem_type) for cls in (BF16Type, F16Type, F32Type)):
+            return "float"
     raise TypeError(f"runtime scalar operators only support index/int/float values, got {type_obj}")
 
 
diff --git a/ptodsl/ptodsl/_surface_values.py b/ptodsl/ptodsl/_surface_values.py
index e8a7541b9f..21e32f7252 100644
--- a/ptodsl/ptodsl/_surface_values.py
+++ b/ptodsl/ptodsl/_surface_values.py
@@ -13,14 +13,19 @@
 from dataclasses import dataclass
 
 from ._diagnostics import native_python_control_flow_error
-from ._runtime_scalar_ops import emit_runtime_binary_op, emit_runtime_bitwise_op, emit_runtime_compare
+from ._runtime_scalar_ops import (
+    emit_runtime_binary_op,
+    emit_runtime_bitwise_op,
+    emit_runtime_compare,
+    normalize_runtime_binary_operands,
+)
 from ._surface_types import PartitionTensorView, TensorView, Tile
 from ._types import _normalize_address_space, _resolve, ptr
 
 from mlir.dialects import arith
 from mlir.dialects import memref
 from mlir.dialects import pto as _pto
-from mlir.ir import IndexType, IntegerAttr, IntegerType, MemRefType, ShapedType, StridedLayoutAttr, Type
+from mlir.ir import IndexType, IntegerAttr, IntegerType, MemRefType, ShapedType, StridedLayoutAttr, Type, VectorType
 
 
 def unwrap_surface_value(value):
@@ -145,6 +150,8 @@ def wrap_surface_value(
         return AddressValue(value)
     except Exception:
         pass
+    if VectorType.isinstance(type_obj):
+        return VecValue(value)
     return RuntimeValue(value)
 
 
@@ -258,6 +265,37 @@ def __rxor__(self, other):
         return wrap_surface_value(emit_runtime_bitwise_op("xor", unwrap_surface_value(other), self.value))
 
 
+class VecValue(_SurfaceValue):
+    """Author-facing builtin vector value backed by an MLIR vector SSA value."""
+
+    def __init__(self, value):
+        if not VectorType.isinstance(value.type):
+            raise TypeError(f"VecValue expects an MLIR vector value, got {value.type}")
+        super().__init__(value)
+        vec_type = VectorType(value.type)
+        if vec_type.rank != 1:
+            raise TypeError(f"PTODSL builtin vectors must be rank-1, got {value.type}")
+        self.lanes = int(vec_type.shape[0])
+        self.element_type = vec_type.element_type
+
+    def __mul__(self, other):
+        return _emit_vec_binary_op("mul", self, other)
+
+    def __rmul__(self, other):
+        return _emit_vec_binary_op("mul", other, self)
+
+
+def _emit_vec_binary_op(op_name: str, lhs, rhs):
+    lhs_raw = unwrap_surface_value(lhs)
+    rhs_raw = unwrap_surface_value(rhs)
+    if not (VectorType.isinstance(lhs_raw.type) and VectorType.isinstance(rhs_raw.type)):
+        raise TypeError("PTODSL VecValue arithmetic expects compatible vector operands")
+    lhs_raw, rhs_raw, kind = normalize_runtime_binary_operands(lhs_raw, rhs_raw)
+    if kind != "float":
+        raise TypeError(f"PTODSL VecValue operator '{op_name}' currently supports only floating-point vectors")
+    return VecValue(emit_runtime_binary_op(op_name, lhs_raw, rhs_raw))
+
+
 class MaskResultValue(_SurfaceValue):
     """Mask value that also supports `(mask, remained)` unpacking."""
 
@@ -967,6 +1005,7 @@ def _coerce_index_value(value):
     "PartitionSpec",
     "PartitionTensorViewValue",
     "RuntimeValue",
+    "VecValue",
     "TileElementRef",
     "TileSliceValue",
     "TensorViewValue",
diff --git a/ptodsl/ptodsl/_types.py b/ptodsl/ptodsl/_types.py
index 7423613f18..c4295bf456 100644
--- a/ptodsl/ptodsl/_types.py
+++ b/ptodsl/ptodsl/_types.py
@@ -35,6 +35,7 @@ def softmax(arg0: pto.ptr(pto.float32, "GM"), ...):
     IntegerType,
     ShapedType,
     Type,
+    VectorType,
 )
 
 # ── Address-space name → AddressSpace enum ───────────────────────────────────
@@ -152,6 +153,35 @@ def __repr__(self):
         return f"<pto.mask {self._bits}>"
 
 
+class _VecDescriptor(_DType):
+    def __init__(self, elem, lanes: int):
+        self._elem = elem
+        self._lanes = _validate_vec_lanes(lanes, context="pto.vec(...)")
+
+    def resolve(self) -> Type:
+        elem = _ensure_non_storage_only_dtype(self._elem, context="pto.vec(...)")
+        return VectorType.get([self._lanes], elem)
+
+    @property
+    def lanes(self) -> int:
+        return self._lanes
+
+    @property
+    def elem(self):
+        return self._elem
+
+    def __repr__(self):
+        return f"<pto.vec {self._lanes}x{self._elem}>"
+
+
+def _validate_vec_lanes(lanes: int, *, context: str) -> int:
+    if isinstance(lanes, bool) or not isinstance(lanes, int):
+        raise TypeError(f"{context} expects lanes to be a positive Python integer")
+    if lanes <= 0:
+        raise ValueError(f"{context} expects lanes to be positive")
+    return lanes
+
+
 def _resolve(dtype) -> Type:
     """Coerce a ``_DType`` descriptor or a concrete ``mlir.ir.Type`` to a Type."""
     if isinstance(dtype, _DType):
@@ -396,6 +426,11 @@ def vreg_type(lanes: int, elem) -> _VRegDescriptor:
     return _VRegDescriptor(lanes, elem)
 
 
+def vec_type(elem, lanes: int) -> _VecDescriptor:
+    """Return a lazy descriptor for builtin ``vector<lanes x elem>`` values."""
+    return _VecDescriptor(elem, lanes)
+
+
 def mask_type(bits: str = "b32") -> _MaskDescriptor:
     """Return a lazy descriptor for ``!pto.mask<bits>``."""
     return _MaskDescriptor(bits)
@@ -469,7 +504,7 @@ def part_tensor_view_type_from_dims(dims, elem) -> Type:
     "si8", "si16", "si32", "si64",
     "ui8", "ui16", "ui32", "ui64",
     "index",
-    "ptr", "vreg_type", "mask_type",
+    "ptr", "vreg_type", "vec_type", "mask_type",
     "tile_buf_type", "tensor_view_type", "tensor_view_type_from_dims",
     "part_tensor_view_type", "part_tensor_view_type_from_dims",
 ]
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index aba8383180..4193993148 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -30,9 +30,10 @@
     si8, si16, si32, si64,
     ui8, ui16, ui32, ui64,
     index,
-    ptr, vreg_type, mask_type,
+    ptr, vreg_type, vec_type, mask_type,
     _resolve,
 )
+from ._builtin_vector import vec  # noqa: F401
 from ._surface_types import (   # noqa: F401
     constexpr,
     tensor_spec,
diff --git a/ptodsl/ptodsl/scalar.py b/ptodsl/ptodsl/scalar.py
index fef02ff966..5497c6dcd7 100644
--- a/ptodsl/ptodsl/scalar.py
+++ b/ptodsl/ptodsl/scalar.py
@@ -24,12 +24,13 @@
     emit_runtime_max,
     emit_runtime_min,
 )
-from ._surface_values import resolve_address_access, unwrap_surface_value, wrap_surface_value
+from ._surface_values import VecValue, resolve_address_access, unwrap_surface_value, wrap_surface_value
 from ._types import _resolve
 
 from mlir.dialects import arith
+from mlir.dialects import llvm
 from mlir.dialects import math
-from mlir.ir import IndexType, MemRefType, Operation
+from mlir.ir import IndexType, IntegerType, MemRefType, Operation, VectorType
 from mlir.dialects import pto as _pto
 
 
@@ -120,10 +121,13 @@ def abs(value):
     return wrap_surface_value(emit_runtime_abs(unwrap_surface_value(value)))
 
 
-def load(ptr_or_ref, offset=None):
-    """Load one scalar element from a PTODSL address view or tile element."""
+def load(ptr_or_ref, offset=None, *, contiguous=None):
+    """Load one scalar element or a contiguous builtin vector from a PTODSL address view."""
+    width = _normalize_contiguous(contiguous, context="scalar.load(...)")
     buffer_value, index_value = resolve_address_access(ptr_or_ref, offset)
     result_type = _infer_buffer_element_type(buffer_value.type)
+    if width > 1:
+        return VecValue(_emit_contiguous_load(buffer_value, index_value, result_type, width))
     return wrap_surface_value(Operation.create(
         "pto.load",
         results=[result_type],
@@ -131,16 +135,45 @@ def load(ptr_or_ref, offset=None):
     ).results[0])
 
 
-def store(value, ptr_or_ref, offset=None):
-    """Store one scalar element to a PTODSL address view or tile element."""
+def store(value, ptr_or_ref, offset=None, *, contiguous=None):
+    """Store one scalar element or a builtin vector to a PTODSL address view."""
     buffer_value, index_value = resolve_address_access(ptr_or_ref, offset)
     elem_type = _infer_buffer_element_type(buffer_value.type)
+    raw_value = unwrap_surface_value(value)
+    if hasattr(raw_value, "type") and VectorType.isinstance(raw_value.type):
+        vec_value = value if isinstance(value, VecValue) else VecValue(raw_value)
+        width = _normalize_contiguous(contiguous, context="scalar.store(...)", default=vec_value.lanes)
+        if width != vec_value.lanes:
+            raise ValueError(
+                f"scalar.store(..., contiguous={width}) does not match vector lane count {vec_value.lanes}"
+            )
+        if vec_value.element_type != elem_type:
+            raise TypeError(
+                "scalar.store(vector, ...) element type must match the destination pointer element type: "
+                f"got {vec_value.element_type}, expected {elem_type}"
+            )
+        _emit_contiguous_store(raw_value, buffer_value, index_value)
+        return
+
+    width = _normalize_contiguous(contiguous, context="scalar.store(...)")
+    if width > 1:
+        raise TypeError("scalar.store(scalar, ..., contiguous=N) is not supported; pass a vector value")
     Operation.create(
         "pto.store",
         operands=[buffer_value, index_value, coerce_scalar_to_type(value, elem_type, context="scalar.store(...)")],
     )
 
 
+def _normalize_contiguous(contiguous, *, context: str, default: int = 1) -> int:
+    if contiguous is None:
+        return default
+    if isinstance(contiguous, bool) or not isinstance(contiguous, int):
+        raise TypeError(f"{context} expects contiguous to be a positive Python integer")
+    if contiguous <= 0:
+        raise ValueError(f"{context} expects contiguous to be positive")
+    return contiguous
+
+
 def _infer_buffer_element_type(buffer_type):
     try:
         return _pto.PtrType(buffer_type).element_type
@@ -148,6 +181,78 @@ def _infer_buffer_element_type(buffer_type):
         return MemRefType(buffer_type).element_type
 
 
+def _emit_contiguous_load(buffer_value, index_value, elem_type, width: int):
+    vector_type = VectorType.get([width], elem_type)
+    ptr_value = _emit_llvm_byte_pointer(buffer_value, index_value, elem_type)
+    return llvm.LoadOp(vector_type, ptr_value).res
+
+
+def _emit_contiguous_store(vector_value, buffer_value, index_value):
+    elem_type = VectorType(vector_value.type).element_type
+    ptr_value = _emit_llvm_byte_pointer(buffer_value, index_value, elem_type)
+    llvm.StoreOp(vector_value, ptr_value)
+
+
+def _emit_llvm_byte_pointer(buffer_value, index_value, elem_type):
+    pto_ptr_type = _as_pto_ptr_type(buffer_value.type)
+    byte_offset = _emit_byte_offset(index_value, elem_type)
+    i64 = IntegerType.get_signless(64)
+    addr_as_i64 = _pto.CastPtrOp(i64, buffer_value).result
+    llvm_ptr_type = llvm.PointerType.get(_pto_ptr_llvm_address_space(pto_ptr_type))
+    llvm_base = llvm.IntToPtrOp(llvm_ptr_type, addr_as_i64).res
+    return llvm.GEPOp(
+        llvm_ptr_type,
+        llvm_base,
+        [byte_offset],
+        [-2147483648],
+        IntegerType.get_signless(8),
+    ).res
+
+
+def _emit_byte_offset(index_value, elem_type):
+    bytewidth = _element_bytewidth(elem_type)
+    bytewidth_const = arith.ConstantOp(IndexType.get(), bytewidth).result
+    byte_index = arith.MulIOp(index_value, bytewidth_const).result
+    return arith.IndexCastOp(IntegerType.get_signless(64), byte_index).result
+
+
+def _as_pto_ptr_type(type_obj):
+    try:
+        return _pto.PtrType(type_obj)
+    except Exception as exc:
+        raise TypeError(
+            "contiguous scalar.load/store currently expects a PTO pointer-backed address"
+        ) from exc
+
+
+def _pto_ptr_llvm_address_space(ptr_type) -> int:
+    memory_space = getattr(ptr_type, "memory_space", None)
+    value = getattr(memory_space, "value", None)
+    if value is not None:
+        return int(value)
+    text = str(ptr_type)
+    if ", ub>" in text or ", vec>" in text:
+        return 6
+    if ", gm>" in text or text.endswith(">"):
+        return 1
+    raise TypeError(f"unable to infer LLVM address space for pointer type {ptr_type}")
+
+
+def _element_bytewidth(elem_type):
+    if str(elem_type) == "f32":
+        return 4
+    if str(elem_type) in {"f16", "bf16"}:
+        return 2
+    if IntegerType.isinstance(elem_type):
+        width = IntegerType(elem_type).width
+        if width % 8 != 0:
+            raise TypeError(f"unsupported sub-byte integer element type {elem_type}")
+        return width // 8
+    if str(elem_type).startswith("f8") or str(elem_type).startswith("!pto."):
+        return 1
+    raise TypeError(f"unsupported element type {elem_type}")
+
+
 __all__ = [
     "muli", "addi", "subi",
     "index_cast",
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index abec06a1dc..5792f8e37d 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -925,6 +925,31 @@ def scalar_pointer_offset_probe():
     _ = valid_cols
 
 
+@pto.jit(target="a5")
+def scalar_contiguous_vector_probe():
+    data_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[1, 16])
+    data_ptr = data_tile.as_ptr()
+    x4 = scalar.load(data_ptr, 0, contiguous=4)
+    scale4 = pto.vec(pto.f32, 4, init=1.0)
+    y4 = x4 * scale4
+    scalar.store(y4, data_ptr, 4)
+
+
+@pto.jit(target="a5")
+def scalar_contiguous_width_mismatch_probe():
+    data_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[1, 16])
+    data_ptr = data_tile.as_ptr()
+    x4 = scalar.load(data_ptr, 0, contiguous=4)
+    scalar.store(x4, data_ptr, 4, contiguous=2)
+
+
+@pto.jit(target="a5")
+def scalar_contiguous_scalar_store_probe():
+    data_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[1, 16])
+    data_ptr = data_tile.as_ptr()
+    scalar.store(1.0, data_ptr, 0, contiguous=4)
+
+
 @pto.jit(target="a5")
 def addptr_surface_probe():
     meta_tile = pto.alloc_tile(shape=[1, 8], dtype=pto.i32, valid_shape=[1, 4])
@@ -1724,6 +1749,7 @@ def main() -> None:
     make_mask_index_roundtrip_probe.verify()
     integer_loop_bound_probe.verify()
     scalar_pointer_offset_probe.verify()
+    scalar_contiguous_vector_probe.verify()
     addptr_surface_probe.verify()
     simt_pointer_offset_probe.verify()
     scalar_store_element_coercion_probe.verify()
@@ -2659,6 +2685,28 @@ def main() -> None:
         "scalar.load(ptr + 2) should lower as element offset 2",
     )
 
+    scalar_contiguous_text = scalar_contiguous_vector_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(scalar_contiguous_text, "scalar contiguous vector specialization")
+    expect("llvm.load" in scalar_contiguous_text, "scalar.load(..., contiguous=N) should lower to llvm.load")
+    expect("llvm.store" in scalar_contiguous_text, "scalar.store(vector, ...) should lower to llvm.store")
+    expect("vector<4xf32>" in scalar_contiguous_text, "contiguous=4 over f32 should produce vector<4xf32>")
+    expect("llvm.insertelement" in scalar_contiguous_text, "pto.vec(..., init=scalar) should broadcast with insertelement")
+    expect("arith.mulf" in scalar_contiguous_text, "VecValue multiplication should lower to arith.mulf")
+    expect(
+        "pto.load" not in scalar_contiguous_text and "pto.store" not in scalar_contiguous_text,
+        "contiguous vector memory access should not lower through scalar pto.load/store",
+    )
+    expect_raises(
+        ValueError,
+        lambda: scalar_contiguous_width_mismatch_probe.compile(),
+        "does not match vector lane count",
+    )
+    expect_raises(
+        TypeError,
+        lambda: scalar_contiguous_scalar_store_probe.compile(),
+        "scalar.store(scalar, ..., contiguous=N) is not supported",
+    )
+
     addptr_surface_text = addptr_surface_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(addptr_surface_text, "addptr surface specialization")
     expect(

From b458deb6088626302ea3cc08c78d803229c7d5ff Mon Sep 17 00:00:00 2001
From: wenxuekun <wenxuekun@huawei.com>
Date: Tue, 23 Jun 2026 17:05:05 +0800
Subject: [PATCH 03/37] feat(ptodsl): implement simt_allreduce_sum for SIMT
 cross-workitem all-reduce
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the pto.simt_allreduce_sum frontend interface as designed in
mission/483/483_docs.md.  Pure Python MLIR IR emission with three
dispatch strategies: warp_reduce (<=32 threads, pow2), cross_warp_reduce
(>32, pow2), ub_reduce (fallback).  Supports f32 and f16.

- ptodsl/ptodsl/_allreduce.py: new — 674 lines
- ptodsl/ptodsl/pto.py: export simt_allreduce_sum (+3 lines)
- ptodsl/tests/test_allreduce.py: new — 533 lines, all passing

Co-Authored-By: Claude <noreply@anthropic.com>
---
 ptodsl/ptodsl/_allreduce.py    | 674 +++++++++++++++++++++++++++++++++
 ptodsl/ptodsl/pto.py           |   3 +
 ptodsl/tests/test_allreduce.py | 533 ++++++++++++++++++++++++++
 3 files changed, 1210 insertions(+)
 create mode 100644 ptodsl/ptodsl/_allreduce.py
 create mode 100644 ptodsl/tests/test_allreduce.py

diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
new file mode 100644
index 0000000000..cb0ce122ed
--- /dev/null
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -0,0 +1,674 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+"""
+SIMT cross-workitem all-reduce helpers.
+
+Implements ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``
+as PTO IR helper functions that are lazily emitted into the trace module.
+
+Public entry point: ``all_reduce(x, scratch, *, op, threads, scale, thread_offset)``,
+callable from within a ``@pto.simt`` context.
+
+Dispatch tree (mirrors the C++ compile-time dispatch in ``reduce.h``)::
+
+    threads <= scale                                    → identity
+    threads ≤ 32,  pow2(threads), pow2(scale)           → warp_reduce
+    threads ≤ 32                                        → ub_reduce
+    threads > 32, pow2(threads), scale ≤ 32, pow2(scale) → cross_warp_reduce
+    otherwise                                           → ub_reduce
+"""
+
+from __future__ import annotations
+
+from ._surface_values import unwrap_surface_value, wrap_surface_value
+from ._tracing.active import require_active_session
+from ._tracing.session import HelperFunctionSpec
+
+from mlir.dialects import arith, func, scf
+from mlir.dialects import pto as _pto
+from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType, Operation, UnitAttr
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# helpers
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _is_pow2(n: int) -> bool:
+    return n > 0 and (n & (n - 1)) == 0
+
+
+def _helper_name(dtype: str, threads: int, scale: int, thread_offset: int) -> str:
+    """Canonical helper symbol name for a specific all-reduce instance.
+
+    Example: ``__tl_allreduce_sum_f32_t128_s1_o0``.
+    """
+    return f"__tl_allreduce_sum_{dtype}_t{threads}_s{scale}_o{thread_offset}"
+
+
+def _dtype_to_str(mlir_type) -> str:
+    """Map an MLIR scalar type to a canonical dtype string."""
+    if mlir_type == F32Type.get():
+        return "f32"
+    if mlir_type == F16Type.get():
+        return "f16"
+    raise NotImplementedError(
+        f"all_reduce: unsupported dtype {mlir_type}"
+    )
+
+
+def _mlir_scalar_type(dtype: str):
+    """Map a canonical dtype string back to an MLIR scalar type."""
+    if dtype == "f32":
+        return F32Type.get()
+    if dtype == "f16":
+        return F16Type.get()
+    raise NotImplementedError(
+        f"all_reduce: unsupported dtype {dtype!r}"
+    )
+
+
+# ── compile-time parameter tables ──────────────────────────────────────────
+
+_IDENTITY = {
+    "f32": 0.0,
+    "f16": 0.0,
+}
+"""Identity element for sum reduction (0.0 for both f32 and f16)."""
+
+_REDUX_OP = _pto.ReduxAddOp
+"""Reduction operator (hardware redux_add)."""
+
+
+# ── scratch validation ────────────────────────────────────────────────────
+
+def _validate_scratch(scratch, expected_mlir_type, *, context: str):
+    """Verify *scratch* is a ``!pto.ptr<expected_mlir_type, ub>`` buffer."""
+    raw_scratch = unwrap_surface_value(scratch)
+    try:
+        ptr_type = _pto.PtrType(raw_scratch.type)
+    except Exception:
+        raise TypeError(
+            f"all_reduce {context}: scratch must be a !pto.ptr buffer, "
+            f"got {raw_scratch.type}"
+        ) from None
+    vec_attr = _pto.AddressSpaceAttr.get(_pto.AddressSpace.VEC)
+    if ptr_type.memory_space != vec_attr:
+        raise TypeError(
+            f"all_reduce {context}: scratch must be in UB memory space, "
+            f"got {ptr_type.memory_space}"
+        )
+    if ptr_type.element_type != expected_mlir_type:
+        raise TypeError(
+            f"all_reduce {context}: scratch element type mismatch: "
+            f"expected {expected_mlir_type}, got {ptr_type.element_type}"
+        )
+
+
+# ── shared helper-emission utility ─────────────────────────────────────────
+
+def _invoke_helper(helper_name, emit_fn, *surface_args):
+    """Look up or lazily create *helper_name*, then ``func.call`` it.
+
+    *emit_fn(helper_fn)* is called exactly once per trace session — on the
+    first invocation for this *helper_name*.
+    """
+    session = require_active_session("simt_allreduce_sum")
+    raw_args = [unwrap_surface_value(a) for a in surface_args]
+    arg_types = tuple(a.type for a in raw_args)
+
+    helper_spec = HelperFunctionSpec(
+        symbol_name=helper_name,
+        arg_types=arg_types,
+        result_types=(arg_types[0],),
+        attributes=(("pto.simt_entry", UnitAttr.get()),),
+    )
+    helper_fn, created = session.get_or_create_helper_function(helper_spec)
+    if created:
+        emit_fn(helper_fn)
+    call = func.CallOp(helper_fn, raw_args)
+    return wrap_surface_value(call.result)
+
+
+# ── reduction operator application ─────────────────────────────────────────
+
+def _emit_store(buffer, offset, value):
+    """Emit ``pto.store`` — accepts Ptr and any MemRef (including UB/VEC).
+
+    Unlike ``pto.store_scalar`` (which rejects VEC memrefs), ``pto.store``
+    uses ``PTO_BufferLikeType`` and survives the Ptr→MemRef type conversion
+    pass during lowering.
+    """
+    Operation.create(
+        "pto.store",
+        operands=[buffer, offset, value],
+    )
+
+
+def _emit_load(result_type, buffer, offset):
+    """Emit ``pto.load`` — accepts Ptr and any MemRef (including UB/VEC).
+
+    Counterpart to ``_emit_store``.  Returns the loaded SSA value.
+    """
+    return Operation.create(
+        "pto.load",
+        results=[result_type],
+        operands=[buffer, offset],
+    ).results[0]
+
+
+def _apply_sum(a, b):
+    """Emit ``a = a + b`` (float addition)."""
+    return arith.AddFOp(a, b).result
+
+
+def _emit_butterfly(v, *, threads: int, scale: int):
+    """Emit unrolled butterfly shuffle reduce.
+
+    Implements::
+
+        cur = threads
+        while cur > scale:
+            x = op(x, shfl_xor(x, cur/2))
+            cur /= 2
+
+    All loops are unrolled at emission time.  Caller must have set the
+    insertion point.
+    """
+    i32 = IntegerType.get_signless(32)
+    cur = threads
+    while cur > scale:
+        offset = cur // 2
+        c_offset = arith.ConstantOp(i32, offset).result
+        shfl = _pto.ShuffleBflyOp(v, c_offset).result
+        v = _apply_sum(v, shfl)
+        cur //= 2
+    return v
+
+
+def _emit_warp_hw_reduce(x, *, threads: int,
+                         lane_in_warp, c_identity, i32):
+    """Emit warp-level hardware reduce.
+
+    When *threads* == 32 ("groups" == 1): a single ``pto.redux_*``.
+    When *threads* < 32 ("groups" > 1): one ``pto.redux_*`` per group,
+    with identity masking for lanes outside the group.
+
+    Caller must have set the insertion point.
+    """
+    groups = 32 // threads
+
+    if groups == 1:
+        return _REDUX_OP(x).result
+
+    c_threads = arith.ConstantOp(i32, threads).result
+    my_group = arith.DivUIOp(lane_in_warp, c_threads).result
+
+    for g in range(groups):
+        c_g = arith.ConstantOp(i32, g).result
+        in_group = arith.CmpIOp(arith.CmpIPredicate.eq, my_group, c_g).result
+        masked = arith.SelectOp(in_group, x, c_identity).result
+        reduced = _REDUX_OP(masked).result
+        x = arith.SelectOp(in_group, reduced, x).result
+    return x
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# public API
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def simt_allreduce_sum(value, *,
+               threads: int,
+               scale: int = 1,
+               thread_offset: int = 0,
+               scratch=None,
+               scratch_offset: int = 0):
+    """Cross-workitem all-reduce for SIMT VF context.
+
+    Dispatch logic mirrors the compile-time tree in
+    ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``.
+
+    Args:
+        value: Lane-local scalar (f32 or f16).
+        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
+        scale: Scale factor (must divide *threads*).  Defaults to 1.
+        thread_offset: Thread offset.  Defaults to 0.
+        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
+            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
+        scratch_offset: Element offset into *scratch*.  Defaults to 0.
+
+    Returns:
+        Lane-uniform scalar (same type as *value*) — the reduced sum.
+    """
+    return _dispatch_allreduce_helper(
+        value, scratch=scratch, scratch_offset=scratch_offset,
+        threads=threads, scale=scale, thread_offset=thread_offset,
+    )
+
+
+def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
+                                threads, scale, thread_offset):
+    # ── parameter validation (before identity shortcut) ───────────────────
+    for name, val in (("threads", threads), ("scale", scale),
+                       ("thread_offset", thread_offset)):
+        if not isinstance(val, int):
+            raise ValueError(
+                f"all_reduce: '{name}' must be a Python int, "
+                f"got {type(val).__name__}"
+            )
+    if threads < 1:
+        raise ValueError(f"all_reduce: threads must be >= 1, got {threads}")
+    if scale < 1:
+        raise ValueError(f"all_reduce: scale must be >= 1, got {scale}")
+    if thread_offset < 0:
+        raise ValueError(
+            f"all_reduce: thread_offset must be >= 0, got {thread_offset}"
+        )
+    if threads % scale != 0:
+        raise ValueError(
+            f"all_reduce requires threads % scale == 0; "
+            f"got threads={threads}, scale={scale}"
+        )
+
+    # ── Path 0: identity ──────────────────────────────────────────────────
+    if threads <= scale:
+        return value
+
+    # ── dtype validation ─────────────────────────────────────────────────
+    raw_value = unwrap_surface_value(value)
+    dtype = _dtype_to_str(raw_value.type)
+    if dtype not in ("f32", "f16"):
+        raise NotImplementedError(
+            f"all_reduce only supports f32/f16, got {dtype}"
+        )
+
+    name = _helper_name(dtype, threads, scale, thread_offset)
+    args = dict(dtype=dtype, threads=threads, scale=scale,
+                thread_offset=thread_offset, scratch_offset=scratch_offset)
+
+    # ── Path 1: warp_reduce ───────────────────────────────────────────────
+    if threads <= 32 and _is_pow2(threads) and _is_pow2(scale):
+        return _invoke_helper(
+            name,
+            lambda hf: _emit_warp_reduce(hf, **args),
+            value,
+        )
+
+    # ── All paths below require a scratch buffer ──────────────────────────
+    if scratch is None:
+        raise ValueError(
+            f"all_reduce sum/{dtype}/t{threads}/s{scale}/o{thread_offset} "
+            "requires a UB scratch buffer"
+        )
+    _validate_scratch(
+        scratch, raw_value.type,
+        context=f"sum/{dtype}/t{threads}/s{scale}/o{thread_offset}",
+    )
+
+    # ── Path 2: ub_reduce (threads ≤ 32, non-pow2) ──────────────────────
+    if threads <= 32:
+        return _invoke_helper(
+            name,
+            lambda hf: _emit_ub_reduce(hf, **args),
+            value, scratch,
+        )
+
+    # ── Path 3: cross_warp_reduce ────────────────────────────────────────
+    if scale <= 32 and _is_pow2(threads) and _is_pow2(scale):
+        return _invoke_helper(
+            name,
+            lambda hf: _emit_cross_warp_reduce(hf, **args),
+            value, scratch,
+        )
+
+    # ── Path 4: ub_reduce fallback (threads > 32, anything else) ─────────
+    return _invoke_helper(
+        name,
+        lambda hf: _emit_ub_reduce(hf, **args),
+        value, scratch,
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: warp_reduce  (Path 1: threads ≤ 32, pow2, pow2 scale)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_warp_reduce(helper_fn, *,
+                      dtype, threads, scale, thread_offset,
+                      scratch_offset):
+    """Build the body of a single-warp all-reduce helper.
+
+    Dispatches to:
+
+    * ``warp_hw_reduce`` when ``extent >= 16`` and ``scale == 1``
+      (fast hardware redux, with group masking for threads < 32).
+    * ``butterfly`` otherwise (software shuffle via ``pto.shuffle_bfly``).
+    """
+    extent = threads // scale
+    scalar_t = _mlir_scalar_type(dtype)
+    identity_val = _IDENTITY[dtype]
+    i32 = IntegerType.get_signless(32)
+
+    entry = helper_fn.add_entry_block()
+    with InsertionPoint(entry):
+        x = entry.arguments[0]
+
+        c_offset = arith.ConstantOp(i32, thread_offset).result
+        c_identity = arith.ConstantOp(scalar_t, identity_val).result
+
+        if thread_offset:
+            # lane_in_warp = (tid_x - offset) & 31
+            tid_x = _pto.GetTidXOp().result
+            tx = arith.SubIOp(tid_x, c_offset).result
+            lane_in_warp = arith.AndIOp(tx, arith.ConstantOp(i32, 31).result).result
+        else:
+            lane_in_warp = _pto.GetLaneIdOp().result
+
+        if extent >= 16 and scale == 1:
+            result = _emit_warp_hw_reduce(
+                x, threads=threads,
+                lane_in_warp=lane_in_warp, c_identity=c_identity, i32=i32,
+            )
+        else:
+            result = _emit_butterfly(
+                x, threads=threads, scale=scale,
+            )
+
+        func.ReturnOp([result])
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: cross_warp_reduce  (Path 3: threads > 32)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_cross_warp_reduce(helper_fn, *,
+                            dtype, threads, scale, thread_offset,
+                            scratch_offset):
+    """Build the body of a cross-warp all-reduce helper.
+
+    Algorithm overview:
+
+    1. *num_warps* subgroups of 32 lanes each do a per-warp reduce.
+    2. Warp leaders (lid < scale) write → scratch[wid * scale + lid].
+    3. ``pto.syncthreads``.
+    4. Leader warp (lanes with ``tx < 32``) reduces the partial sums:
+       - scale == 1:  ``hw_reduce`` across leader warp.
+       - scale * num_warps ≤ 32:  ``butterfly<total, scale, 0>``.
+       - otherwise:  manual loop over warps.
+    5. Global leader (tx < scale) writes result → scratch[tx].
+    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
+    7. Extra ``pto.syncthreads`` to fence scratch reuse.
+    """
+    num_warps = threads // 32
+    scalar_t = _mlir_scalar_type(dtype)
+    identity_val = _IDENTITY[dtype]
+
+    i32 = IntegerType.get_signless(32)
+    idx_t = IndexType.get()
+
+    entry = helper_fn.add_entry_block()
+    with InsertionPoint(entry):
+        x = entry.arguments[0]
+        scratch = entry.arguments[1]
+
+        # ── constants ────────────────────────────────────────────────────
+        c0_i32 = arith.ConstantOp(i32, 0).result
+        c5_i32 = arith.ConstantOp(i32, 5).result
+        c31_i32 = arith.ConstantOp(i32, 31).result
+        c32_i32 = arith.ConstantOp(i32, 32).result
+        c_scale = arith.ConstantOp(i32, scale).result
+        c_num_warps = arith.ConstantOp(i32, num_warps).result
+        c_offset = arith.ConstantOp(i32, thread_offset).result
+        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
+        c_identity = arith.ConstantOp(scalar_t, identity_val).result
+
+        # ── thread indexing ──────────────────────────────────────────────
+        tid_x = _pto.GetTidXOp().result
+        if thread_offset:
+            tx = arith.SubIOp(tid_x, c_offset).result
+            wid = arith.ShRUIOp(tx, c5_i32).result
+            lid = arith.AndIOp(tx, c31_i32).result
+        else:
+            tx = tid_x
+            wid = arith.ShRUIOp(tx, c5_i32).result
+            lid = _pto.GetLaneIdOp().result
+
+        # ── Stage 1: per-warp reduce ─────────────────────────────────────
+        if scale == 1:
+            warp_val = _REDUX_OP(x).result
+        else:
+            warp_val = _emit_butterfly(
+                x, threads=32, scale=scale,
+            )
+
+        # ── Stage 2: warp leaders write partial results ──────────────────
+        is_writer = arith.CmpIOp(arith.CmpIPredicate.ult, lid, c_scale).result
+        write_if = scf.IfOp(is_writer, hasElse=False)
+        with InsertionPoint(write_if.then_block):
+            slot = arith.AddIOp(
+                arith.MulIOp(wid, c_scale).result, lid).result
+            slot_idx = arith.IndexCastOp(idx_t, slot).result
+            if scratch_offset:
+                slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
+            _emit_store(scratch, slot_idx, warp_val)
+            scf.YieldOp([])
+
+        # ── Stage 3: sync before reading partial results ─────────────────
+        _pto.SyncthreadsOp()
+
+        # ── Stage 4: leader warp reduces partial sums ────────────────────
+        is_leader_warp = arith.CmpIOp(
+            arith.CmpIPredicate.ult, tx, c32_i32).result
+        outer_if = scf.IfOp(is_leader_warp, [scalar_t], hasElse=True)
+
+        with InsertionPoint(outer_if.then_block):
+            if scale == 1:
+                # ── scale == 1: hw_reduce across leader warp ────────────
+                need_load = arith.CmpIOp(
+                    arith.CmpIPredicate.ult, lid, c_num_warps).result
+                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
+                with InsertionPoint(inner_if.then_block):
+                    lid_idx = arith.IndexCastOp(idx_t, lid).result
+                    tmp = _emit_load(scalar_t, scratch, lid_idx)
+                    scf.YieldOp([tmp])
+                with InsertionPoint(inner_if.else_block):
+                    scf.YieldOp([c_identity])
+                loaded = inner_if.results[0]
+                stage4_result = _REDUX_OP(loaded).result
+            elif scale * num_warps <= 32:
+                # ── scale > 1, fits in one warp: butterfly ──────────────
+                total = scale * num_warps
+                c_total = arith.ConstantOp(i32, total).result
+                need_load = arith.CmpIOp(
+                    arith.CmpIPredicate.ult, lid, c_total).result
+                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
+                with InsertionPoint(inner_if.then_block):
+                    lid_idx = arith.IndexCastOp(idx_t, lid).result
+                    if scratch_offset:
+                        lid_idx = arith.AddIOp(lid_idx, c_scratch_off).result
+                    tmp = _emit_load(scalar_t, scratch, lid_idx)
+                    scf.YieldOp([tmp])
+                with InsertionPoint(inner_if.else_block):
+                    scf.YieldOp([c_identity])
+                loaded = inner_if.results[0]
+                stage4_result = _emit_butterfly(
+                    loaded,
+                    threads=total, scale=scale,
+                )
+            else:
+                # ── manual loop: lid < scale lanes each reduce num_warps
+                is_reducer = arith.CmpIOp(
+                    arith.CmpIPredicate.ult, lid, c_scale).result
+                result = c_identity
+                my_slot = arith.RemUIOp(lid, c_scale).result
+                for w in range(num_warps):
+                    c_w = arith.ConstantOp(i32, w).result
+                    idx_val = arith.AddIOp(
+                        arith.MulIOp(c_w, c_scale).result, my_slot).result
+                    slot_idx = arith.IndexCastOp(idx_t, idx_val).result
+                    if scratch_offset:
+                        slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
+                    loaded_v = _emit_load(
+                        scalar_t, scratch, slot_idx)
+                    result = _apply_sum(result, loaded_v)
+                stage4_result = arith.SelectOp(
+                    is_reducer, result, c_identity).result
+
+            scf.YieldOp([stage4_result])
+
+        with InsertionPoint(outer_if.else_block):
+            scf.YieldOp([c_identity])
+
+        partial_reduced = outer_if.results[0]
+
+        # ── Stage 5: global leader writes result to scratch ──────────────
+        is_global_leader = arith.CmpIOp(
+            arith.CmpIPredicate.ult, tx, c_scale).result
+        write_result_if = scf.IfOp(is_global_leader, hasElse=False)
+        with InsertionPoint(write_result_if.then_block):
+            tx_idx = arith.IndexCastOp(idx_t, tx).result
+            if scratch_offset:
+                tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
+            _emit_store(scratch, tx_idx, partial_reduced)
+            scf.YieldOp([])
+
+        # ── Stage 6: sync + broadcast load scratch[tx % scale] ───────────
+        _pto.SyncthreadsOp()
+        my_slot = arith.RemUIOp(tx, c_scale).result
+        load_idx = arith.IndexCastOp(idx_t, my_slot).result
+        if scratch_offset:
+            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
+        result = _emit_load(scalar_t, scratch, load_idx)
+
+        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+        _pto.SyncthreadsOp()
+
+        func.ReturnOp([result])
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# emitter: ub_reduce  (Paths 2 & 4: fallback via UB scratch)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def _emit_ub_reduce(helper_fn, *,
+                    dtype, threads, scale, thread_offset,
+                    scratch_offset):
+    """Build the body of a UB-scratch all-reduce helper.
+
+    Algorithm:
+
+    1. Each lane writes x → scratch[tx].
+    2. ``pto.syncthreads``.
+    3. Lanes with ``lane % scale == 0`` sequentially reduce scratch slots.
+    4. ``pto.syncthreads``.
+    5. Global leader (lane % scale == 0, lane / scale == 0) writes back.
+    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
+    7. ``pto.syncthreads`` to fence scratch reuse.
+    """
+    scalar_t = _mlir_scalar_type(dtype)
+    i32 = IntegerType.get_signless(32)
+    idx_t = IndexType.get()
+
+    entry = helper_fn.add_entry_block()
+    with InsertionPoint(entry):
+        x = entry.arguments[0]
+        scratch = entry.arguments[1]
+
+        # ── constants ────────────────────────────────────────────────────
+        c0_i32 = arith.ConstantOp(i32, 0).result
+        c_threads = arith.ConstantOp(i32, threads).result
+        c_scale = arith.ConstantOp(i32, scale).result
+        c_offset = arith.ConstantOp(i32, thread_offset).result
+        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
+
+        # ── thread indexing ──────────────────────────────────────────────
+        tid_x = _pto.GetTidXOp().result
+        tx = arith.SubIOp(tid_x, c_offset).result if thread_offset else tid_x
+        group = arith.DivUIOp(tx, c_threads).result
+        lane = arith.RemUIOp(tx, c_threads).result
+        lane_mod = arith.RemUIOp(lane, c_scale).result
+
+        # ── Stage 1: each lane writes x → scratch[scratch_offset + tx] ──
+        tx_idx = arith.IndexCastOp(idx_t, tx).result
+        if scratch_offset:
+            tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
+        _emit_store(scratch, tx_idx, x)
+
+        # ── Stage 2: sync ────────────────────────────────────────────────
+        _pto.SyncthreadsOp()
+
+        # ── Stage 3: reducers sequentially combine ───────────────────────
+        # lane < scale gives exactly one reducer per residue class
+        is_reducer = arith.CmpIOp(
+            arith.CmpIPredicate.ult, lane, c_scale).result
+        reduce_if = scf.IfOp(is_reducer, [scalar_t], hasElse=True)
+
+        with InsertionPoint(reduce_if.then_block):
+            # initial: load scratch[scratch_offset + group * threads + lane]
+            group_offset = arith.MulIOp(group, c_threads).result
+            first_elem = arith.AddIOp(group_offset, lane).result
+            first_idx = arith.IndexCastOp(idx_t, first_elem).result
+            if scratch_offset:
+                first_idx = arith.AddIOp(first_idx, c_scratch_off).result
+            acc = _emit_load(scalar_t, scratch, first_idx)
+
+            # scf.for i = scale to threads step scale
+            lb = arith.ConstantOp(idx_t, scale).result
+            ub = arith.ConstantOp(idx_t, threads).result
+            step = arith.ConstantOp(idx_t, scale).result
+            for_op = scf.ForOp(lb, ub, step, [acc])
+            with InsertionPoint(for_op.body):
+                i = for_op.induction_variable
+                prev = for_op.inner_iter_args[0]
+                elem = arith.AddIOp(first_idx, i).result
+                loaded = _emit_load(
+                    scalar_t, scratch, elem)
+                new_acc = _apply_sum(prev, loaded)
+                scf.YieldOp([new_acc])
+            scf.YieldOp([for_op.results[0]])
+
+        with InsertionPoint(reduce_if.else_block):
+            scf.YieldOp([x])
+
+        flag = reduce_if.results[0]
+
+        # ── Stage 4: sync ────────────────────────────────────────────────
+        _pto.SyncthreadsOp()
+
+        # ── Stage 5: per-class leader writes reduced value ───────────────
+        # leader lanes 0..scale-1 each write their residue class result
+        is_leader = arith.CmpIOp(
+            arith.CmpIPredicate.ult, lane, c_scale).result
+        write_if = scf.IfOp(is_leader, hasElse=False)
+        with InsertionPoint(write_if.then_block):
+            dst_offset = arith.AddIOp(
+                arith.MulIOp(group, c_threads).result, lane).result
+            dst_idx = arith.IndexCastOp(idx_t, dst_offset).result
+            if scratch_offset:
+                dst_idx = arith.AddIOp(dst_idx, c_scratch_off).result
+            _emit_store(scratch, dst_idx, flag)
+            scf.YieldOp([])
+
+        # ── Stage 6: sync + broadcast scratch[scratch_offset + group*threads + tx%scale] ──
+        _pto.SyncthreadsOp()
+        my_slot = arith.AddIOp(
+            arith.MulIOp(group, c_threads).result,
+            arith.RemUIOp(tx, c_scale).result).result
+        load_idx = arith.IndexCastOp(idx_t, my_slot).result
+        if scratch_offset:
+            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
+        result = _emit_load(scalar_t, scratch, load_idx)
+
+        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
+        _pto.SyncthreadsOp()
+
+        func.ReturnOp([result])
+
+
+__all__ = [
+    "simt_allreduce_sum",
+]
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index 5ef25bdda8..2561b853ef 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -120,6 +120,9 @@
     LoopHandle, BranchHandle,
 )
 
+# ── All-reduce ─────────────────────────────────────────────────────────────────
+from ._allreduce import simt_allreduce_sum  # noqa: F401
+
 # ── Decorator ─────────────────────────────────────────────────────────────────
 from ._jit import jit, KernelHandle, merge_jit_modules      # noqa: F401
 from ._subkernels import cube, simd, simt     # noqa: F401
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
new file mode 100644
index 0000000000..1f6b964894
--- /dev/null
+++ b/ptodsl/tests/test_allreduce.py
@@ -0,0 +1,533 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from pathlib import Path
+import sys
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "ptodsl"))
+
+from ptodsl import pto
+
+
+def expect(condition: bool, message: str) -> None:
+    if not condition:
+        raise AssertionError(message)
+
+
+def main():
+    from ptodsl._allreduce import _helper_name, simt_allreduce_sum
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # helper name format
+    # ══════════════════════════════════════════════════════════════════════════
+    expect(
+        _helper_name("f32", 128, 1, 0) == "__tl_allreduce_sum_f32_t128_s1_o0",
+        "helper name format (sum/f32/t128/s1/o0)",
+    )
+    expect(
+        _helper_name("f16", 32, 2, 4) == "__tl_allreduce_sum_f16_t32_s2_o4",
+        "helper name format (f16/t32/s2/o4)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 0: identity (threads <= scale)
+    # ══════════════════════════════════════════════════════════════════════════
+    expect(
+        simt_allreduce_sum(1.0, threads=1, scale=1) == 1.0,
+        "identity: threads == scale",
+    )
+    expect(
+        simt_allreduce_sum(1.0, threads=2, scale=2) == 1.0,
+        "identity: threads == scale (alt)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # validation errors
+    # ══════════════════════════════════════════════════════════════════════════
+
+    # threads % scale != 0  (validation now runs before identity shortcut)
+    try:
+        simt_allreduce_sum(1.0, threads=3, scale=2)
+        raise AssertionError("expected ValueError for threads % scale != 0")
+    except ValueError:
+        pass
+
+
+    # threads < 1
+    try:
+        simt_allreduce_sum(1.0, threads=0, scale=1)
+        raise AssertionError("expected ValueError for threads < 1")
+    except ValueError:
+        pass
+
+    # validation runs before identity: bad params not bypassed by threads<=scale
+    try:
+        simt_allreduce_sum(1.0, threads=1, scale=2)
+        raise AssertionError("expected ValueError for threads%scale!=0 (before identity)")
+    except ValueError:
+        pass
+
+    # i32 dtype rejected — need a real JIT kernel so we get an MLIR i32 value
+    @pto.jit(target="a5")
+    def kernel_i32(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1, dtype=pto.i32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=1)
+
+    try:
+        kernel_i32.compile()
+        raise AssertionError("expected NotImplementedError for i32")
+    except NotImplementedError:
+        pass
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1a: warp_reduce — hardware redux, groups == 1 (threads=32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=1)
+
+    compiled_warp = kernel_warp.compile()
+    mlir_warp = compiled_warp.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t32_s1_o0" in mlir_warp,
+           "IR: warp_reduce helper name")
+    expect("pto.redux_add" in mlir_warp,
+           "IR: redux_add in warp_reduce helper")
+    expect("pto.syncthreads" not in mlir_warp,
+           "IR: warp_reduce has no syncthreads")
+    expect("pto.shuffle_bfly" not in mlir_warp,
+           "IR: warp_reduce (groups=1) has no shuffle_bfly")
+    compiled_warp.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1b: warp_reduce — hardware redux, groups > 1 (threads=16, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_t16(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=16, scale=1)
+
+    compiled_warp_t16 = kernel_warp_t16.compile()
+    mlir_warp_t16 = compiled_warp_t16.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o0" in mlir_warp_t16,
+           "IR: warp_reduce t=16 helper name")
+    expect("pto.redux_add" in mlir_warp_t16,
+           "IR: redux_add for groups>1")
+    expect("arith.select" in mlir_warp_t16,
+           "IR: arith.select for group masking")
+    expect("pto.syncthreads" not in mlir_warp_t16,
+           "IR: warp_reduce (groups=2) has no syncthreads")
+    compiled_warp_t16.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1c: warp_reduce — butterfly shuffle (threads=8, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_t8(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=8, scale=1)
+
+    compiled_warp_t8 = kernel_warp_t8.compile()
+    mlir_warp_t8 = compiled_warp_t8.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t8_s1_o0" in mlir_warp_t8,
+           "IR: warp_reduce t=8 butterfly helper name (sum)")
+    expect("pto.shuffle_bfly" in mlir_warp_t8,
+           "IR: shuffle_bfly for butterfly path")
+    expect("pto.redux_add" not in mlir_warp_t8,
+           "IR: butterfly has no hardware redux")
+    expect("pto.syncthreads" not in mlir_warp_t8,
+           "IR: butterfly has no syncthreads")
+    compiled_warp_t8.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 1d: warp_reduce — butterfly with scale > 1 (threads=32, scale=2)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_warp_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=32, scale=2)
+
+    compiled_warp_s2 = kernel_warp_s2.compile()
+    mlir_warp_s2 = compiled_warp_s2.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t32_s2_o0" in mlir_warp_s2,
+           "IR: warp_reduce s=2 butterfly helper name (sum)")
+    expect("pto.shuffle_bfly" in mlir_warp_s2,
+           "IR: shuffle_bfly for butterfly (scale>1)")
+    expect("pto.redux_add" not in mlir_warp_s2,
+           "IR: butterfly (scale>1) has no hardware redux")
+    compiled_warp_s2.verify()
+
+    # ── warp_reduce: sum, f32, t=16, s=1, o=4 (non-zero thread_offset) ────────
+    @pto.jit(target="a5")
+    def kernel_warp_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        _ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, threads=16, scale=1, thread_offset=4)
+
+    compiled_warp_o4 = kernel_warp_o4.compile()
+    mlir_warp_o4 = compiled_warp_o4.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o4" in mlir_warp_o4,
+           "IR: warp_reduce o=4 helper name")
+    expect("pto.get_tid_x" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses get_tid_x (not raw get_laneid)")
+    expect("arith.subi" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses subi for tx = tid_x - offset")
+    expect("arith.andi" in mlir_warp_o4,
+           "IR: warp_reduce o=4 uses andi to extract lane_in_warp")
+    compiled_warp_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 2: ub_reduce — threads ≤ 32, non-power-of-2 (threads=6, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_ub6(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1)
+
+    compiled_ub6 = kernel_ub6.compile()
+    mlir_ub6 = compiled_ub6.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o0" in mlir_ub6,
+           "IR: ub_reduce t=6 helper name")
+    expect("pto.syncthreads" in mlir_ub6,
+           "IR: ub_reduce has syncthreads")
+    expect("pto.store" in mlir_ub6,
+           "IR: ub_reduce has store (write to scratch)")
+    expect("pto.load" in mlir_ub6,
+           "IR: ub_reduce has load (read from scratch)")
+    syncthreads_count = mlir_ub6.count("pto.syncthreads")
+    expect(syncthreads_count == 4,
+           f"IR: ub_reduce has 4 syncthreads, got {syncthreads_count}")
+    compiled_ub6.verify()
+
+    # ── ub_reduce: sum, f32, t=6, s=2 (scale > 1, non-pow2 threads) ─────────
+    @pto.jit(target="a5")
+    def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=2)
+
+    compiled_ub6s2 = kernel_ub6s2.compile()
+    mlir_ub6s2 = compiled_ub6s2.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t6_s2_o0" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 helper name")
+    expect("pto.syncthreads" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has syncthreads")
+    expect("pto.store" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has store")
+    expect("pto.load" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has load")
+    expect("scf.for" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has scf.for (sequential reduce loop)")
+    expect("pto.redux_add" not in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has no hardware redux")
+    expect("pto.shuffle_bfly" not in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 has no butterfly shuffle")
+    # scale>1 fixes: reducer uses lane < scale (ult), not lane_mod == 0
+    expect("arith.cmpi ult" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 reducer uses ult (lane < scale)")
+    compiled_ub6s2.verify()
+
+    # ── ub_reduce: sum, f32, t=6, s=1, o=4 (non-zero thread_offset) ─────────
+    @pto.jit(target="a5")
+    def kernel_ub_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1,
+                                     thread_offset=4)
+
+    compiled_ub_o4 = kernel_ub_o4.compile()
+    mlir_ub_o4 = compiled_ub_o4.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o4" in mlir_ub_o4,
+           "IR: ub_reduce o=4 helper name")
+    expect("arith.subi" in mlir_ub_o4,
+           "IR: ub_reduce o=4 uses subi for tx = tid_x - offset")
+    compiled_ub_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3a: cross_warp_reduce — sum, f32, t=128, s=1, o=0 (baseline)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled = kernel_128.compile()
+    mlir = compiled.mlir_text()
+
+    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
+           "IR: helper function definition")
+    expect("pto.simt_entry" in mlir,
+           "IR: helper carries pto.simt_entry")
+    expect("call @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
+           "IR: func.call to helper")
+
+    for op_name in (
+        "pto.redux_add", "pto.syncthreads", "pto.store", "pto.load",
+        "pto.get_tid_x", "pto.get_laneid", "arith.shrui", "scf.if",
+    ):
+        expect(op_name in mlir, f"IR: expected '{op_name}' in helper body")
+
+    syncthreads_count = mlir.count("pto.syncthreads")
+    expect(syncthreads_count == 3,
+           f"IR: expected 3 syncthreads, got {syncthreads_count}")
+
+    compiled.verify()
+
+    # ── cross_warp: sum, f32, t=64 (2 warps) ────────────────────────────────
+    @pto.jit(target="a5")
+    def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=64, scale=1)
+
+    compiled_64 = kernel_64.compile()
+    mlir_64 = compiled_64.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t64_s1_o0" in mlir_64,
+           "IR: helper for t=64")
+    compiled_64.verify()
+
+    # ── cross_warp: sum, f32, t=256 (8 warps) ───────────────────────────────
+    @pto.jit(target="a5")
+    def kernel_256(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=256, scale=1)
+
+    compiled_256 = kernel_256.compile()
+    mlir_256 = compiled_256.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t256_s1_o0" in mlir_256,
+           "IR: helper for t=256")
+    compiled_256.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3b: cross_warp_reduce — scale > 1, scale*num_warps ≤ 32
+    #           (threads=128, scale=2, num_warps=4, total=8 ≤ 32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_cw_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=2)
+
+    compiled_cw_s2 = kernel_cw_s2.compile()
+    mlir_cw_s2 = compiled_cw_s2.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t128_s2_o0" in mlir_cw_s2,
+           "IR: cross_warp s=2 helper name")
+    expect("pto.shuffle_bfly" in mlir_cw_s2,
+           "IR: cross_warp s=2 has shuffle_bfly (butterfly for per-warp + leader)")
+    expect("pto.syncthreads" in mlir_cw_s2,
+           "IR: cross_warp s=2 has syncthreads")
+    # scale > 1: per-warp uses butterfly, not hardware redux
+    compiled_cw_s2.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 3c: cross_warp_reduce — scale > 1, scale*num_warps > 32 (manual, sum)
+    #           (threads=128, scale=16, num_warps=4, total=64 > 32)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_cw_s16(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=16)
+
+    compiled_cw_s16 = kernel_cw_s16.compile()
+    mlir_cw_s16 = compiled_cw_s16.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t128_s16_o0" in mlir_cw_s16,
+           "IR: cross_warp s=16 manual helper name")
+    expect("pto.syncthreads" in mlir_cw_s16,
+           "IR: cross_warp s=16 has syncthreads")
+    compiled_cw_s16.verify()
+
+    # ── cross_warp: sum, f32, t=128, s=1, o=4 (non-zero thread_offset) ─────
+    @pto.jit(target="a5")
+    def kernel_cw_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1,
+                                     thread_offset=4)
+
+    compiled_cw_o4 = kernel_cw_o4.compile()
+    mlir_cw_o4 = compiled_cw_o4.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o4" in mlir_cw_o4,
+           "IR: cross_warp o=4 helper name")
+    expect("pto.get_tid_x" in mlir_cw_o4,
+           "IR: cross_warp o=4 uses get_tid_x")
+    expect("arith.subi" in mlir_cw_o4,
+           "IR: cross_warp o=4 uses subi for tx = tid_x - offset")
+    compiled_cw_o4.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Path 4: ub_reduce fallback — threads > 32, non-power-of-2
+    #          (threads=48, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=48, scale=1)
+
+    compiled_ub48 = kernel_ub48.compile()
+    mlir_ub48 = compiled_ub48.mlir_text()
+    expect("func.func @__tl_allreduce_sum_f32_t48_s1_o0" in mlir_ub48,
+           "IR: ub_reduce fallback t=48 helper name")
+    expect("pto.syncthreads" in mlir_ub48,
+           "IR: ub_reduce fallback has syncthreads")
+    expect("pto.store" in mlir_ub48,
+           "IR: ub_reduce fallback has store")
+    expect("pto.load" in mlir_ub48,
+           "IR: ub_reduce fallback has load")
+    compiled_ub48.verify()
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # helper deduplication across multiple calls
+    # ══════════════════════════════════════════════════════════════════════════
+
+    @pto.jit(target="a5")
+    def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x1 = pto.const(1.0, dtype=pto.f32)
+            _r1 = pto.simt_allreduce_sum(x1, scratch=ub_scratch, threads=128, scale=1)
+            x2 = pto.const(2.0, dtype=pto.f32)
+            _r2 = pto.simt_allreduce_sum(x2, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled2 = kernel_reuse.compile()
+    mlir2 = compiled2.mlir_text()
+
+    definitions = mlir2.count("func.func @__tl_allreduce_sum_f32_t128_s1_o0")
+    expect(definitions == 1,
+           f"IR: helper defined {definitions} times, expected 1")
+    calls = mlir2.count("call @__tl_allreduce_sum_f32_t128_s1_o0")
+    expect(calls == 2, f"IR: expected 2 call sites, got {calls}")
+    compiled2.verify()
+
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # scratch required for ub_reduce and cross_warp paths
+    # ══════════════════════════════════════════════════════════════════════════
+
+    # cross_warp requires scratch — use a real JIT kernel so the error
+    # originates from _dispatch_allreduce_helper, not from a bare Python float.
+    @pto.jit(target="a5")
+    def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=128, scale=1)
+
+    try:
+        kernel_no_scratch_cw.compile()
+        raise AssertionError("expected ValueError for missing scratch (cross_warp)")
+    except ValueError as e:
+        expect("requires a UB scratch buffer" in str(e),
+               f"error message should mention scratch (cross_warp), got: {e}")
+
+    # ub_reduce (non-pow2) requires scratch
+    @pto.jit(target="a5")
+    def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=6, scale=1)
+
+    try:
+        kernel_no_scratch_ub.compile()
+        raise AssertionError("expected ValueError for missing scratch (ub_reduce)")
+    except ValueError as e:
+        expect("requires a UB scratch buffer" in str(e),
+               f"error message should mention scratch (ub_reduce), got: {e}")
+
+    # scratch must be a pto.ptr type
+    try:
+        simt_allreduce_sum(1.0, scratch="not_a_ptr", threads=6, scale=1)
+        raise AssertionError("expected TypeError for non-ptr scratch")
+    except (TypeError, AttributeError):
+        pass
+
+    # cross_warp: gm scratch (wrong memory space) should be rejected
+    @pto.jit(target="a5")
+    def kernel_gm_scratch(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=scratch_gm, threads=128, scale=1)
+
+    try:
+        kernel_gm_scratch.compile()
+        raise AssertionError("expected TypeError for gm scratch")
+    except TypeError as e:
+        expect("UB" in str(e).upper() or "memory space" in str(e).lower(),
+               f"gm scratch error should mention memory space, got: {e}")
+
+    # cross_warp: i32 scratch with f32 x (dtype mismatch) should be rejected
+    @pto.jit(target="a5")
+    def kernel_dtype_mismatch(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_i32 = pto.castptr(zero_u64, pto.ptr(pto.i32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_i32, threads=128, scale=1)
+
+    try:
+        kernel_dtype_mismatch.compile()
+        raise AssertionError("expected TypeError for dtype mismatch scratch")
+    except TypeError as e:
+        err = str(e)
+        expect("element type" in err.lower() or "mismatch" in err.lower(),
+               f"dtype mismatch should mention element type, got: {e}")
+
+    print("ptodsl_allreduce: PASS")
+
+
+if __name__ == "__main__":
+    main()

From af96964cceb23932629fa1341146a48e94e38152 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 03:03:18 +0800
Subject: [PATCH 04/37] fix(ptodsl): align allreduce scratch interface

---
 ptodsl/ptodsl/_allreduce.py    | 50 ++++++++--------------------------
 ptodsl/tests/test_allreduce.py | 34 +++++++++++------------
 2 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
index cb0ce122ed..d841374827 100644
--- a/ptodsl/ptodsl/_allreduce.py
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -11,7 +11,7 @@
 Implements ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``
 as PTO IR helper functions that are lazily emitted into the trace module.
 
-Public entry point: ``all_reduce(x, scratch, *, op, threads, scale, thread_offset)``,
+Public entry point: ``simt_allreduce_sum(value, scratch=None, *, threads, scale, thread_offset)``,
 callable from within a ``@pto.simt`` context.
 
 Dispatch tree (mirrors the C++ compile-time dispatch in ``reduce.h``)::
@@ -221,12 +221,10 @@ def _emit_warp_hw_reduce(x, *, threads: int,
 # public API
 # ═══════════════════════════════════════════════════════════════════════════════
 
-def simt_allreduce_sum(value, *,
+def simt_allreduce_sum(value, scratch=None, *,
                threads: int,
                scale: int = 1,
-               thread_offset: int = 0,
-               scratch=None,
-               scratch_offset: int = 0):
+               thread_offset: int = 0):
     """Cross-workitem all-reduce for SIMT VF context.
 
     Dispatch logic mirrors the compile-time tree in
@@ -239,18 +237,17 @@ def simt_allreduce_sum(value, *,
         thread_offset: Thread offset.  Defaults to 0.
         scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
             ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
-        scratch_offset: Element offset into *scratch*.  Defaults to 0.
 
     Returns:
         Lane-uniform scalar (same type as *value*) — the reduced sum.
     """
     return _dispatch_allreduce_helper(
-        value, scratch=scratch, scratch_offset=scratch_offset,
+        value, scratch=scratch,
         threads=threads, scale=scale, thread_offset=thread_offset,
     )
 
 
-def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
+def _dispatch_allreduce_helper(value, *, scratch,
                                 threads, scale, thread_offset):
     # ── parameter validation (before identity shortcut) ───────────────────
     for name, val in (("threads", threads), ("scale", scale),
@@ -288,7 +285,7 @@ def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
 
     name = _helper_name(dtype, threads, scale, thread_offset)
     args = dict(dtype=dtype, threads=threads, scale=scale,
-                thread_offset=thread_offset, scratch_offset=scratch_offset)
+                thread_offset=thread_offset)
 
     # ── Path 1: warp_reduce ───────────────────────────────────────────────
     if threads <= 32 and _is_pow2(threads) and _is_pow2(scale):
@@ -338,8 +335,7 @@ def _dispatch_allreduce_helper(value, *, scratch, scratch_offset,
 # ═══════════════════════════════════════════════════════════════════════════════
 
 def _emit_warp_reduce(helper_fn, *,
-                      dtype, threads, scale, thread_offset,
-                      scratch_offset):
+                      dtype, threads, scale, thread_offset):
     """Build the body of a single-warp all-reduce helper.
 
     Dispatches to:
@@ -386,8 +382,7 @@ def _emit_warp_reduce(helper_fn, *,
 # ═══════════════════════════════════════════════════════════════════════════════
 
 def _emit_cross_warp_reduce(helper_fn, *,
-                            dtype, threads, scale, thread_offset,
-                            scratch_offset):
+                            dtype, threads, scale, thread_offset):
     """Build the body of a cross-warp all-reduce helper.
 
     Algorithm overview:
@@ -423,7 +418,6 @@ def _emit_cross_warp_reduce(helper_fn, *,
         c_scale = arith.ConstantOp(i32, scale).result
         c_num_warps = arith.ConstantOp(i32, num_warps).result
         c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
         c_identity = arith.ConstantOp(scalar_t, identity_val).result
 
         # ── thread indexing ──────────────────────────────────────────────
@@ -452,8 +446,6 @@ def _emit_cross_warp_reduce(helper_fn, *,
             slot = arith.AddIOp(
                 arith.MulIOp(wid, c_scale).result, lid).result
             slot_idx = arith.IndexCastOp(idx_t, slot).result
-            if scratch_offset:
-                slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
             _emit_store(scratch, slot_idx, warp_val)
             scf.YieldOp([])
 
@@ -488,8 +480,6 @@ def _emit_cross_warp_reduce(helper_fn, *,
                 inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
                 with InsertionPoint(inner_if.then_block):
                     lid_idx = arith.IndexCastOp(idx_t, lid).result
-                    if scratch_offset:
-                        lid_idx = arith.AddIOp(lid_idx, c_scratch_off).result
                     tmp = _emit_load(scalar_t, scratch, lid_idx)
                     scf.YieldOp([tmp])
                 with InsertionPoint(inner_if.else_block):
@@ -510,8 +500,6 @@ def _emit_cross_warp_reduce(helper_fn, *,
                     idx_val = arith.AddIOp(
                         arith.MulIOp(c_w, c_scale).result, my_slot).result
                     slot_idx = arith.IndexCastOp(idx_t, idx_val).result
-                    if scratch_offset:
-                        slot_idx = arith.AddIOp(slot_idx, c_scratch_off).result
                     loaded_v = _emit_load(
                         scalar_t, scratch, slot_idx)
                     result = _apply_sum(result, loaded_v)
@@ -531,8 +519,6 @@ def _emit_cross_warp_reduce(helper_fn, *,
         write_result_if = scf.IfOp(is_global_leader, hasElse=False)
         with InsertionPoint(write_result_if.then_block):
             tx_idx = arith.IndexCastOp(idx_t, tx).result
-            if scratch_offset:
-                tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
             _emit_store(scratch, tx_idx, partial_reduced)
             scf.YieldOp([])
 
@@ -540,8 +526,6 @@ def _emit_cross_warp_reduce(helper_fn, *,
         _pto.SyncthreadsOp()
         my_slot = arith.RemUIOp(tx, c_scale).result
         load_idx = arith.IndexCastOp(idx_t, my_slot).result
-        if scratch_offset:
-            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
         result = _emit_load(scalar_t, scratch, load_idx)
 
         # ── Stage 7: extra sync to fence scratch reuse ───────────────────
@@ -555,8 +539,7 @@ def _emit_cross_warp_reduce(helper_fn, *,
 # ═══════════════════════════════════════════════════════════════════════════════
 
 def _emit_ub_reduce(helper_fn, *,
-                    dtype, threads, scale, thread_offset,
-                    scratch_offset):
+                    dtype, threads, scale, thread_offset):
     """Build the body of a UB-scratch all-reduce helper.
 
     Algorithm:
@@ -583,7 +566,6 @@ def _emit_ub_reduce(helper_fn, *,
         c_threads = arith.ConstantOp(i32, threads).result
         c_scale = arith.ConstantOp(i32, scale).result
         c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_scratch_off = arith.ConstantOp(idx_t, scratch_offset).result
 
         # ── thread indexing ──────────────────────────────────────────────
         tid_x = _pto.GetTidXOp().result
@@ -592,10 +574,8 @@ def _emit_ub_reduce(helper_fn, *,
         lane = arith.RemUIOp(tx, c_threads).result
         lane_mod = arith.RemUIOp(lane, c_scale).result
 
-        # ── Stage 1: each lane writes x → scratch[scratch_offset + tx] ──
+        # ── Stage 1: each lane writes x → scratch[tx] ───────────────────
         tx_idx = arith.IndexCastOp(idx_t, tx).result
-        if scratch_offset:
-            tx_idx = arith.AddIOp(tx_idx, c_scratch_off).result
         _emit_store(scratch, tx_idx, x)
 
         # ── Stage 2: sync ────────────────────────────────────────────────
@@ -608,12 +588,10 @@ def _emit_ub_reduce(helper_fn, *,
         reduce_if = scf.IfOp(is_reducer, [scalar_t], hasElse=True)
 
         with InsertionPoint(reduce_if.then_block):
-            # initial: load scratch[scratch_offset + group * threads + lane]
+            # initial: load scratch[group * threads + lane]
             group_offset = arith.MulIOp(group, c_threads).result
             first_elem = arith.AddIOp(group_offset, lane).result
             first_idx = arith.IndexCastOp(idx_t, first_elem).result
-            if scratch_offset:
-                first_idx = arith.AddIOp(first_idx, c_scratch_off).result
             acc = _emit_load(scalar_t, scratch, first_idx)
 
             # scf.for i = scale to threads step scale
@@ -648,19 +626,15 @@ def _emit_ub_reduce(helper_fn, *,
             dst_offset = arith.AddIOp(
                 arith.MulIOp(group, c_threads).result, lane).result
             dst_idx = arith.IndexCastOp(idx_t, dst_offset).result
-            if scratch_offset:
-                dst_idx = arith.AddIOp(dst_idx, c_scratch_off).result
             _emit_store(scratch, dst_idx, flag)
             scf.YieldOp([])
 
-        # ── Stage 6: sync + broadcast scratch[scratch_offset + group*threads + tx%scale] ──
+        # ── Stage 6: sync + broadcast scratch[group*threads + tx%scale] ──
         _pto.SyncthreadsOp()
         my_slot = arith.AddIOp(
             arith.MulIOp(group, c_threads).result,
             arith.RemUIOp(tx, c_scale).result).result
         load_idx = arith.IndexCastOp(idx_t, my_slot).result
-        if scratch_offset:
-            load_idx = arith.AddIOp(load_idx, c_scratch_off).result
         result = _emit_load(scalar_t, scratch, load_idx)
 
         # ── Stage 7: extra sync to fence scratch reuse ───────────────────
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
index 1f6b964894..f9262bda0c 100644
--- a/ptodsl/tests/test_allreduce.py
+++ b/ptodsl/tests/test_allreduce.py
@@ -211,7 +211,7 @@ def kernel_ub6(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=6, scale=1)
 
     compiled_ub6 = kernel_ub6.compile()
     mlir_ub6 = compiled_ub6.mlir_text()
@@ -235,7 +235,7 @@ def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=2)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=6, scale=2)
 
     compiled_ub6s2 = kernel_ub6s2.compile()
     mlir_ub6s2 = compiled_ub6s2.mlir_text()
@@ -265,7 +265,7 @@ def kernel_ub_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1,
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=6, scale=1,
                                      thread_offset=4)
 
     compiled_ub_o4 = kernel_ub_o4.compile()
@@ -286,7 +286,7 @@ def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=1)
 
     compiled = kernel_128.compile()
     mlir = compiled.mlir_text()
@@ -317,7 +317,7 @@ def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=64, scale=1)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=64, scale=1)
 
     compiled_64 = kernel_64.compile()
     mlir_64 = compiled_64.mlir_text()
@@ -332,7 +332,7 @@ def kernel_256(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=256, scale=1)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=256, scale=1)
 
     compiled_256 = kernel_256.compile()
     mlir_256 = compiled_256.mlir_text()
@@ -351,7 +351,7 @@ def kernel_cw_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=2)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=2)
 
     compiled_cw_s2 = kernel_cw_s2.compile()
     mlir_cw_s2 = compiled_cw_s2.mlir_text()
@@ -375,7 +375,7 @@ def kernel_cw_s16(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=16)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=16)
 
     compiled_cw_s16 = kernel_cw_s16.compile()
     mlir_cw_s16 = compiled_cw_s16.mlir_text()
@@ -392,7 +392,7 @@ def kernel_cw_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1,
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=1,
                                      thread_offset=4)
 
     compiled_cw_o4 = kernel_cw_o4.compile()
@@ -416,7 +416,7 @@ def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=48, scale=1)
+            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=48, scale=1)
 
     compiled_ub48 = kernel_ub48.compile()
     mlir_ub48 = compiled_ub48.mlir_text()
@@ -440,9 +440,9 @@ def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x1 = pto.const(1.0, dtype=pto.f32)
-            _r1 = pto.simt_allreduce_sum(x1, scratch=ub_scratch, threads=128, scale=1)
+            _r1 = pto.simt_allreduce_sum(x1, ub_scratch, threads=128, scale=1)
             x2 = pto.const(2.0, dtype=pto.f32)
-            _r2 = pto.simt_allreduce_sum(x2, scratch=ub_scratch, threads=128, scale=1)
+            _r2 = pto.simt_allreduce_sum(x2, ub_scratch, threads=128, scale=1)
 
     compiled2 = kernel_reuse.compile()
     mlir2 = compiled2.mlir_text()
@@ -465,7 +465,7 @@ def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
     def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=None, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, None, threads=128, scale=1)
 
     try:
         kernel_no_scratch_cw.compile()
@@ -479,7 +479,7 @@ def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
     def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=None, threads=6, scale=1)
+            _result = pto.simt_allreduce_sum(x, None, threads=6, scale=1)
 
     try:
         kernel_no_scratch_ub.compile()
@@ -490,7 +490,7 @@ def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     # scratch must be a pto.ptr type
     try:
-        simt_allreduce_sum(1.0, scratch="not_a_ptr", threads=6, scale=1)
+        simt_allreduce_sum(1.0, "not_a_ptr", threads=6, scale=1)
         raise AssertionError("expected TypeError for non-ptr scratch")
     except (TypeError, AttributeError):
         pass
@@ -500,7 +500,7 @@ def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
     def kernel_gm_scratch(scratch_gm: pto.ptr(pto.f32, "gm")):
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=scratch_gm, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch_gm, threads=128, scale=1)
 
     try:
         kernel_gm_scratch.compile()
@@ -516,7 +516,7 @@ def kernel_dtype_mismatch(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_i32 = pto.castptr(zero_u64, pto.ptr(pto.i32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch=ub_i32, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, ub_i32, threads=128, scale=1)
 
     try:
         kernel_dtype_mismatch.compile()

From 875db5b2198f7d342a766d2a615e5cc5b89c203c Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 15:56:17 +0800
Subject: [PATCH 05/37] test(ptodsl): cover alloc_buffer allreduce scratch

---
 ptodsl/tests/test_allreduce.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
index f9262bda0c..c427a96314 100644
--- a/ptodsl/tests/test_allreduce.py
+++ b/ptodsl/tests/test_allreduce.py
@@ -310,6 +310,24 @@ def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled.verify()
 
+    # ── issue 483 integration: alloc_buffer(scope="ub") scratch ─────────────
+    @pto.jit(target="a5", mode="explicit")
+    def kernel_alloc_buffer_scratch():
+        reduce_scratch = pto.alloc_buffer((128,), pto.f32, scope="ub")
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, reduce_scratch, threads=128, scale=1)
+
+    compiled_alloc = kernel_alloc_buffer_scratch.compile()
+    mlir_alloc = compiled_alloc.mlir_text()
+    expect("dyn_shared_memory_buf = 512 : i64" in mlir_alloc,
+           "IR: alloc_buffer scratch reserves 128 f32 elements in UB")
+    expect("call @__tl_allreduce_sum_f32_t128_s1_o0" in mlir_alloc,
+           "IR: alloc_buffer scratch can be passed to simt_allreduce_sum")
+    expect("!pto.ptr<f32, ub>" in mlir_alloc,
+           "IR: allreduce scratch keeps typed UB pointer")
+    compiled_alloc.verify()
+
     # ── cross_warp: sum, f32, t=64 (2 warps) ────────────────────────────────
     @pto.jit(target="a5")
     def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):

From 1f08361d8560fd5886f95848f96d1e5b8df54d93 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 17:03:15 +0800
Subject: [PATCH 06/37] example(ptodsl): add RMSNorm alloc_buffer SIMT kernel

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 233 +++++++++++++++++++
 1 file changed, 233 insertions(+)
 create mode 100644 ptodsl/examples/rmsnorm_alloc_buffer_simt.py

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
new file mode 100644
index 0000000000..627fbd51f8
--- /dev/null
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+"""
+RMSNorm compile-only PTODSL example for issue 483.
+
+The example exercises the PTODSL surfaces needed by the RMSNorm SimtVF kernel:
+
+- ``pto.alloc_buffer(...)`` for UB scratch and lane-local storage
+- contiguous scalar ``load`` / ``store`` vector accesses
+- ``pto.simt_allreduce_sum(...)`` for cross-workitem sum reduction
+- runtime ``range(...)`` for the token loop so the AST rewrite emits ``scf.for``
+
+Run this file directly to print the emitted MLIR for one specialization.
+"""
+
+import argparse
+from pathlib import Path
+import sys
+
+
+if __package__ in {None, ""}:
+    here = Path(__file__).resolve()
+    for candidate in here.parents:
+        if (candidate / "ptodsl" / "__init__.py").exists():
+            sys.path.insert(0, str(candidate))
+            break
+    else:
+        raise RuntimeError(
+            "Unable to locate the PTODSL Python package root from rmsnorm_alloc_buffer_simt.py"
+        )
+
+
+from ptodsl import pto, scalar
+
+
+def init_weight_fragment_body(
+    w_ub,
+    w_frag,
+    *,
+    threads: pto.constexpr = 128,
+    rounds: pto.constexpr = 16,
+    lanes: pto.constexpr = 2,
+):
+    tx = pto.get_tid_x()
+
+    for r in pto.static_range(0, rounds):
+        ub_offset = r * threads * lanes + tx * lanes
+        frag_offset = r * lanes
+
+        w_vec = scalar.load(w_ub, ub_offset, contiguous=lanes)
+        scalar.store(w_vec, w_frag, frag_offset)
+
+
+def rmsnorm_4096_token_body(
+    x_ub,
+    y_ub,
+    rstd_ub,
+    reduce_scratch,
+    x_frag,
+    w_frag,
+    eps: pto.f32,
+    ping: pto.i32,
+    *,
+    threads: pto.constexpr = 128,
+    rounds: pto.constexpr = 16,
+    lanes: pto.constexpr = 2,
+    hidden_size: pto.constexpr = 4096,
+):
+    tx = pto.get_tid_x()
+    local_sum = 0.0
+
+    for r in pto.static_range(0, rounds):
+        lane_offset = r * threads * lanes + tx * lanes
+        x_offset = ping * hidden_size + lane_offset
+        frag_offset = r * lanes
+
+        x_vec = scalar.load(x_ub, x_offset, contiguous=lanes)
+        scalar.store(x_vec, x_frag, frag_offset)
+
+        for lane in pto.static_range(0, lanes):
+            x = scalar.load(x_frag, frag_offset + lane)
+            local_sum = local_sum + x * x
+
+    sum_sq = pto.simt_allreduce_sum(
+        local_sum,
+        reduce_scratch,
+        threads=threads,
+        scale=1,
+        thread_offset=0,
+    )
+
+    rstd = 1.0 / scalar.sqrt(sum_sq / hidden_size + eps)
+
+    with pto.if_(tx == 0) as br:
+        with br.then_:
+            scalar.store(rstd, rstd_ub, ping)
+
+    for r in pto.static_range(0, rounds):
+        lane_offset = r * threads * lanes + tx * lanes
+        y_offset = ping * hidden_size + lane_offset
+        frag_offset = r * lanes
+
+        for lane in pto.static_range(0, lanes):
+            x = scalar.load(x_frag, frag_offset + lane)
+            w = scalar.load(w_frag, frag_offset + lane)
+            y = x * rstd * w
+            scalar.store(y, y_ub, y_offset + lane)
+
+
+@pto.jit(target="a5", mode="explicit")
+def rmsnorm_4096_alloc_buffer_simt_context_kernel(
+    X: pto.ptr(pto.f32, "gm"),
+    W: pto.ptr(pto.f32, "gm"),
+    Y: pto.ptr(pto.f32, "gm"),
+    RSTD: pto.ptr(pto.f32, "gm"),
+    eps: pto.f32,
+    batch: pto.i32,
+    *,
+    threads: pto.constexpr = 128,
+    rounds: pto.constexpr = 16,
+    lanes: pto.constexpr = 2,
+    hidden_size: pto.constexpr = 4096,
+    n_cores: pto.constexpr = 64,
+    tokens_per_core: pto.constexpr = 64,
+):
+    core_id = pto.get_block_idx()
+    frag_elems: pto.constexpr = rounds * lanes
+
+    w_ub = pto.alloc_buffer((hidden_size,), pto.f32, scope="ub")
+    x_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
+    y_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
+    rstd_ub = pto.alloc_buffer((2,), pto.f32, scope="ub")
+    reduce_scratch = pto.alloc_buffer((threads,), pto.f32, scope="ub")
+
+    x_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local")
+    w_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local", persistent=True)
+
+    pto.mte_gm_ub(
+        W,
+        w_ub,
+        0,
+        hidden_size * 4,
+        nburst=(1, hidden_size * 4, hidden_size * 4),
+    )
+
+    with pto.simt():
+        init_weight_fragment_body(
+            w_ub,
+            w_frag,
+            threads=threads,
+            rounds=rounds,
+            lanes=lanes,
+        )
+
+    for local_token in range(0, tokens_per_core):
+        token_id = local_token * n_cores + core_id
+        ping = local_token % 2
+
+        pto.mte_gm_ub(
+            pto.addptr(X, token_id * hidden_size),
+            x_ub,
+            ping * hidden_size * 4,
+            hidden_size * 4,
+            nburst=(1, hidden_size * 4, hidden_size * 4),
+        )
+
+        with pto.simt():
+            rmsnorm_4096_token_body(
+                x_ub,
+                y_ub,
+                rstd_ub,
+                reduce_scratch,
+                x_frag,
+                w_frag,
+                eps,
+                ping,
+                threads=threads,
+                rounds=rounds,
+                lanes=lanes,
+                hidden_size=hidden_size,
+            )
+
+        pto.mte_ub_gm(
+            y_ub,
+            pto.addptr(Y, token_id * hidden_size),
+            hidden_size * 4,
+            nburst=(1, hidden_size * 4, hidden_size * 4),
+        )
+
+        pto.mte_ub_gm(
+            rstd_ub,
+            pto.addptr(RSTD, token_id),
+            4,
+            nburst=(1, 4, 4),
+        )
+
+
+def build_x128():
+    return rmsnorm_4096_alloc_buffer_simt_context_kernel.compile(
+        threads=128,
+        rounds=16,
+        lanes=2,
+        tokens_per_core=64,
+    )
+
+
+def build_x64():
+    return rmsnorm_4096_alloc_buffer_simt_context_kernel.compile(
+        threads=64,
+        rounds=16,
+        lanes=4,
+        tokens_per_core=64,
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Emit RMSNorm PTODSL MLIR")
+    parser.add_argument("--variant", choices=("x128", "x64"), default="x128")
+    args = parser.parse_args()
+
+    compiled = build_x128() if args.variant == "x128" else build_x64()
+    compiled.verify()
+    print(compiled.mlir_text())
+
+
+if __name__ == "__main__":
+    main()

From edae0afb9dc7e0b80e953041eeaa4a92180d0a9b Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 18:46:37 +0800
Subject: [PATCH 07/37] test(ptodsl): cover RMSNorm example compile

---
 ptodsl/README.md                              | 18 ++++
 .../user_guide/04-type-system-and-buffer.md   |  7 +-
 ptodsl/tests/test_jit_compile.py              |  6 +-
 ptodsl/tests/test_rmsnorm_example_compile.py  | 98 +++++++++++++++++++
 4 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 ptodsl/tests/test_rmsnorm_example_compile.py

diff --git a/ptodsl/README.md b/ptodsl/README.md
index c2e034ac5a..43fcbdb5a5 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -152,6 +152,22 @@ Direct run on a real NPU:
 python3 ptodsl/examples/flash_attention_softmax_launch.py
 ```
 
+### `rmsnorm_alloc_buffer_simt.py`
+
+Compile-only RMSNorm example for explicit-mode SIMT kernels. It exercises
+`pto.alloc_buffer(...)`, contiguous `scalar.load` / `scalar.store`, `pto.vec`,
+`pto.simt_allreduce_sum(...)`, and a runtime token loop that lowers to
+`scf.for`.
+
+```bash
+python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x128 > /tmp/rmsnorm_x128.mlir
+python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x64 > /tmp/rmsnorm_x64.mlir
+```
+
+Expected: MLIR containing `@rmsnorm_4096_alloc_buffer_simt_context_kernel`,
+`scf.for`, `vector<2xf32>` for `x128`, `vector<4xf32>` for `x64`, and the
+`__tl_allreduce_sum` helper.
+
 ### Launch artifacts
 
 - `~/.cache/ptodsl/` — JIT-compiled kernel `.so` cache
@@ -167,6 +183,7 @@ python3 ptodsl/tests/test_jit_compile.py
 python3 ptodsl/tests/test_jit_diagnostics.py
 python3 ptodsl/tests/test_subkernel_diagnostics.py
 python3 ptodsl/tests/test_flash_attention_demo_compile.py
+python3 ptodsl/tests/test_rmsnorm_example_compile.py
 python3 ptodsl/tests/test_ptoas_frontend_verify.py
 python3 ptodsl/tests/test_docs_as_test.py
 ```
@@ -178,6 +195,7 @@ ptodsl_jit_compile: PASS
 ptodsl_jit_diagnostics: PASS
 ptodsl_subkernel_diagnostics: PASS
 ptodsl_flash_attention_demo_compile: PASS
+ptodsl_rmsnorm_example_compile: PASS
 ptodsl_ptoas_frontend_verify: PASS
 ptodsl_docs_as_test: PASS
 ```
diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index c557b42e91..5a44790df4 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -179,6 +179,7 @@ ptr_ub  = pto.ptr(pto.f16, pto.MemorySpace.UB)
 
 Use `pto.alloc_buffer(...)` in explicit-mode kernels to allocate scratch storage that is addressed through pointer-style operations:
 
+<!-- ptodsl-doc-pending: {"reason":"illustrative fragment; covered by test_jit_compile alloc_buffer probes"} -->
 ```python
 ub_scratch = pto.alloc_buffer((4096,), pto.f32, scope="ub")
 fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
@@ -186,9 +187,11 @@ fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
 
 `scope="ub"` reserves space in the function-level Unified Buffer scratch area and returns a typed UB pointer. The allocation contributes to the kernel's dynamic shared-memory size and can be passed to explicit data-movement helpers such as `pto.mte_gm_ub(...)` and `pto.mte_ub_gm(...)`.
 
-`scope="local"` creates SIMT-local fragment storage for use by lower-level load/store surfaces. It is intended for per-workitem arrays such as `x_frag[]` and `w_frag[]`. The `persistent` flag is accepted as lifetime metadata for callers that need to distinguish reusable fragment storage from ordinary temporary scratch.
+UB allocations are laid out in bytes during tracing. Each allocation starts at a 32-byte-aligned offset, and the final reserved size is rounded up to 32 bytes before it is written to the kernel's `dyn_shared_memory_buf` attribute. The returned value is a typed pointer to the requested element type, not a high-level buffer object.
 
-Shapes must be static positive integers so the frontend can compute storage size and layout while tracing.
+`scope="local"` creates SIMT-local fragment storage for use by lower-level load/store surfaces. It lowers inside the active SIMT helper as an `llvm.alloca`. It is intended for per-workitem arrays such as `x_frag[]` and `w_frag[]`. The `persistent` flag is accepted as lifetime metadata for callers that need to distinguish reusable fragment storage from ordinary temporary scratch; it does not change the returned pointer type.
+
+Shapes must be static positive integers so the frontend can compute storage size and layout while tracing. `alloc_buffer` lowers directly to the pointer arithmetic and local allocation operations needed by the kernel; it does not introduce a new high-level PTO IR operation.
 
 ## 4.6 TensorView
 
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index bb7d6bf01a..00e21ff30f 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -905,8 +905,8 @@ def rmsnorm_alloc_buffer_layout_probe(
 ):
     w_ub = pto.alloc_buffer((4096,), pto.f32, scope="ub")
     x_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
-    rstd_ub = pto.alloc_buffer((16,), pto.f32, scope="ub")
     y_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
+    rstd_ub = pto.alloc_buffer((2,), pto.f32, scope="ub")
     reduce_scratch = pto.alloc_buffer((128,), pto.f32, scope="ub")
 
     pto.mte_gm_ub(W, w_ub, 0, 4096 * 4, nburst=(1, 0, 0))
@@ -4045,10 +4045,10 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     rmsnorm_alloc_buffer_text = rmsnorm_alloc_buffer_layout_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm alloc_buffer layout specialization")
     expect(
-        "dyn_shared_memory_buf = 82496 : i64" in rmsnorm_alloc_buffer_text,
+        "dyn_shared_memory_buf = 82464 : i64" in rmsnorm_alloc_buffer_text,
         "RMSNorm alloc_buffer layout should reserve the same UB scratch size as the expanded RMSNorm kernel",
     )
-    for expected_offset in (16384, 49152, 49216, 81984):
+    for expected_offset in (16384, 49152, 81920, 81952):
         expect(
             f"arith.constant {expected_offset} : index" in rmsnorm_alloc_buffer_text,
             f"RMSNorm alloc_buffer layout should materialize UB byte offset {expected_offset}",
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
new file mode 100644
index 0000000000..3130123f7a
--- /dev/null
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+from importlib.util import module_from_spec, spec_from_file_location
+from pathlib import Path
+import sys
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT / "ptodsl"))
+
+from mlir.ir import Module
+from ptodsl._bootstrap import make_context
+
+
+def expect(condition: bool, message: str) -> None:
+    if not condition:
+        raise AssertionError(message)
+
+
+def expect_parse_roundtrip_and_verify(text: str, label: str) -> None:
+    with make_context() as ctx:
+        parsed = Module.parse(text, ctx)
+        parsed.operation.verify()
+        roundtrip_text = str(parsed)
+    expect(
+        roundtrip_text == text,
+        f"{label} should survive Module.parse(...) round-trip without textual drift",
+    )
+
+
+def load_rmsnorm_example():
+    example_path = REPO_ROOT / "ptodsl" / "examples" / "rmsnorm_alloc_buffer_simt.py"
+    expect(example_path.is_file(), f"RMSNorm example is missing: {example_path}")
+
+    spec = spec_from_file_location("ptodsl_rmsnorm_alloc_buffer_simt", example_path)
+    expect(spec is not None and spec.loader is not None, f"unable to create import spec for {example_path}")
+    module = module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragment: str, ub_size: int) -> None:
+    compiled.verify()
+    text = compiled.mlir_text()
+    expect_parse_roundtrip_and_verify(text, f"RMSNorm {label} MLIR")
+
+    expect("func.func @rmsnorm_4096_alloc_buffer_simt_context_kernel" in text, f"{label}: missing entry")
+    expect(f"dyn_shared_memory_buf = {ub_size} : i64" in text, f"{label}: unexpected UB scratch size")
+    expect("scf.for" in text, f"{label}: tokens_per_core loop should lower to scf.for")
+    expect("pto.mte_gm_ub" in text, f"{label}: missing GM->UB transfer")
+    expect("pto.mte_ub_gm" in text, f"{label}: missing UB->GM transfer")
+    expect(vector_type in text, f"{label}: missing contiguous vector access type {vector_type}")
+    expect(helper_name_fragment in text, f"{label}: missing allreduce helper")
+    expect("func.call @__tl_allreduce_sum" in text or "call @__tl_allreduce_sum" in text,
+           f"{label}: allreduce should remain helper-call based")
+
+    expect(
+        text.count("pto.mte_gm_ub") == 2,
+        f"{label}: expected compact transfer structure with 2 GM->UB ops",
+    )
+    expect(
+        text.count("pto.mte_ub_gm") == 2,
+        f"{label}: expected compact transfer structure with 2 UB->GM ops",
+    )
+
+
+def main() -> None:
+    example = load_rmsnorm_example()
+
+    expect(hasattr(example, "build_x128"), "RMSNorm example should export build_x128()")
+    expect(hasattr(example, "build_x64"), "RMSNorm example should export build_x64()")
+
+    check_variant(
+        example.build_x128(),
+        label="x128",
+        vector_type="vector<2xf32>",
+        helper_name_fragment="__tl_allreduce_sum_f32_t128_s1_o0",
+        ub_size=82464,
+    )
+    check_variant(
+        example.build_x64(),
+        label="x64",
+        vector_type="vector<4xf32>",
+        helper_name_fragment="__tl_allreduce_sum_f32_t64_s1_o0",
+        ub_size=82208,
+    )
+
+    print("ptodsl_rmsnorm_example_compile: PASS")
+
+
+if __name__ == "__main__":
+    main()

From 1b397882bea27863429f98cc60bab94109b8a01e Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 18:54:45 +0800
Subject: [PATCH 08/37] docs(ptodsl): avoid fixed alloc_buffer alignment
 contract

---
 ptodsl/docs/user_guide/04-type-system-and-buffer.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index 5a44790df4..3c597f5d75 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -187,7 +187,7 @@ fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
 
 `scope="ub"` reserves space in the function-level Unified Buffer scratch area and returns a typed UB pointer. The allocation contributes to the kernel's dynamic shared-memory size and can be passed to explicit data-movement helpers such as `pto.mte_gm_ub(...)` and `pto.mte_ub_gm(...)`.
 
-UB allocations are laid out in bytes during tracing. Each allocation starts at a 32-byte-aligned offset, and the final reserved size is rounded up to 32 bytes before it is written to the kernel's `dyn_shared_memory_buf` attribute. The returned value is a typed pointer to the requested element type, not a high-level buffer object.
+UB allocations are laid out in bytes during tracing. The frontend may insert alignment padding between allocations, and the final reserved size is written to the kernel's `dyn_shared_memory_buf` attribute. The returned value is a typed pointer to the requested element type, not a high-level buffer object.
 
 `scope="local"` creates SIMT-local fragment storage for use by lower-level load/store surfaces. It lowers inside the active SIMT helper as an `llvm.alloca`. It is intended for per-workitem arrays such as `x_frag[]` and `w_frag[]`. The `persistent` flag is accepted as lifetime metadata for callers that need to distinguish reusable fragment storage from ordinary temporary scratch; it does not change the returned pointer type.
 

From 1e7cc2382a8756b137c2f74b47784eed44a1419d Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 19:03:20 +0800
Subject: [PATCH 09/37] docs(ptodsl): summarize alloc_buffer scopes in table

---
 ptodsl/docs/user_guide/04-type-system-and-buffer.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index 3c597f5d75..276fca8e96 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -185,11 +185,10 @@ ub_scratch = pto.alloc_buffer((4096,), pto.f32, scope="ub")
 fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
 ```
 
-`scope="ub"` reserves space in the function-level Unified Buffer scratch area and returns a typed UB pointer. The allocation contributes to the kernel's dynamic shared-memory size and can be passed to explicit data-movement helpers such as `pto.mte_gm_ub(...)` and `pto.mte_ub_gm(...)`.
-
-UB allocations are laid out in bytes during tracing. The frontend may insert alignment padding between allocations, and the final reserved size is written to the kernel's `dyn_shared_memory_buf` attribute. The returned value is a typed pointer to the requested element type, not a high-level buffer object.
-
-`scope="local"` creates SIMT-local fragment storage for use by lower-level load/store surfaces. It lowers inside the active SIMT helper as an `llvm.alloca`. It is intended for per-workitem arrays such as `x_frag[]` and `w_frag[]`. The `persistent` flag is accepted as lifetime metadata for callers that need to distinguish reusable fragment storage from ordinary temporary scratch; it does not change the returned pointer type.
+| Scope | Storage | Returned value | Typical use | Layout notes |
+|-------|---------|----------------|-------------|--------------|
+| `"ub"` | Function-level Unified Buffer scratch | Typed `!pto.ptr<T, ub>` | MTE source/destination buffers, cross-SIMT scratch such as reductions | Contributes to `dyn_shared_memory_buf`; the frontend may insert alignment padding between allocations |
+| `"local"` | SIMT-helper local storage | Typed local pointer backed by `llvm.alloca` | Per-workitem fragments such as `x_frag[]` and `w_frag[]` | Lives inside the active SIMT helper; `persistent=True` is lifetime metadata and does not change the pointer type |
 
 Shapes must be static positive integers so the frontend can compute storage size and layout while tracing. `alloc_buffer` lowers directly to the pointer arithmetic and local allocation operations needed by the kernel; it does not introduce a new high-level PTO IR operation.
 

From b5a1d786aebe455869d21a2daa2d151553c3d4a8 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 19:53:01 +0800
Subject: [PATCH 10/37] test(ptodsl): compact rmsnorm simt loops

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 15 +++++++++++----
 ptodsl/tests/test_rmsnorm_example_compile.py |  9 +++++++++
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index fd078ec35a..894c48d9c1 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -15,6 +15,7 @@
 - contiguous scalar ``load`` / ``store`` vector accesses
 - ``pto.simt_allreduce_sum(...)`` for cross-workitem sum reduction
 - runtime ``range(...)`` for the token loop so the AST rewrite emits ``scf.for``
+- explicit ``pto.for_(...)`` loops inside SIMT helpers to avoid trace-time expansion
 
 Run this file directly to print the emitted MLIR for one specialization.
 """
@@ -49,7 +50,7 @@ def init_weight_fragment_body(
 ):
     tx = pto.get_tid_x()
 
-    for r in pto.static_range(0, rounds):
+    with pto.for_(0, rounds, step=1) as r:
         ub_offset = r * threads * lanes + tx * lanes
         frag_offset = r * lanes
 
@@ -73,9 +74,12 @@ def rmsnorm_4096_token_body(
     hidden_size: pto.const_expr = 4096,
 ):
     tx = pto.get_tid_x()
-    local_sum = 0.0
+    local_sum = pto.const(0.0, dtype=pto.f32)
 
-    for r in pto.static_range(0, rounds):
+    sum_loop = pto.for_(0, rounds, step=1).carry(local_sum=local_sum)
+    with sum_loop:
+        r = sum_loop.iv
+        local_sum = sum_loop.local_sum
         lane_offset = r * threads * lanes + tx * lanes
         x_offset = ping * hidden_size + lane_offset
         frag_offset = r * lanes
@@ -86,6 +90,9 @@ def rmsnorm_4096_token_body(
         for lane in pto.static_range(0, lanes):
             x = scalar.load(x_frag, frag_offset + lane)
             local_sum = local_sum + x * x
+        sum_loop.update(local_sum=local_sum)
+
+    local_sum = sum_loop.final("local_sum")
 
     sum_sq = pto.simt_allreduce_sum(
         local_sum,
@@ -101,7 +108,7 @@ def rmsnorm_4096_token_body(
         with br.then_:
             scalar.store(rstd, rstd_ub, ping)
 
-    for r in pto.static_range(0, rounds):
+    with pto.for_(0, rounds, step=1) as r:
         lane_offset = r * threads * lanes + tx * lanes
         y_offset = ping * hidden_size + lane_offset
         frag_offset = r * lanes
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 3130123f7a..94d981ddc6 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -53,6 +53,7 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
     expect("func.func @rmsnorm_4096_alloc_buffer_simt_context_kernel" in text, f"{label}: missing entry")
     expect(f"dyn_shared_memory_buf = {ub_size} : i64" in text, f"{label}: unexpected UB scratch size")
     expect("scf.for" in text, f"{label}: tokens_per_core loop should lower to scf.for")
+    expect(text.count("scf.for") >= 4, f"{label}: SIMT inner loops should lower to compact scf.for ops")
     expect("pto.mte_gm_ub" in text, f"{label}: missing GM->UB transfer")
     expect("pto.mte_ub_gm" in text, f"{label}: missing UB->GM transfer")
     expect(vector_type in text, f"{label}: missing contiguous vector access type {vector_type}")
@@ -68,6 +69,14 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
         text.count("pto.mte_ub_gm") == 2,
         f"{label}: expected compact transfer structure with 2 UB->GM ops",
     )
+    expect(
+        text.count("pto.castptr") <= 12,
+        f"{label}: SIMT inner loops should not be trace-time expanded into many castptr ops",
+    )
+    expect(
+        text.count("pto.store ") <= 8,
+        f"{label}: SIMT inner loops should not be trace-time expanded into many scalar stores",
+    )
 
 
 def main() -> None:

From 2a254f517e62093a7f1ea8ba75abc946979ddbb4 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 21:06:13 +0800
Subject: [PATCH 11/37] test(ptodsl): align rmsnorm simt body

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 37 ++++++++++----------
 ptodsl/tests/test_jit_compile.py             |  6 ++--
 ptodsl/tests/test_rmsnorm_example_compile.py | 28 +++++++++++++--
 3 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 894c48d9c1..f0b440b3e1 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -63,7 +63,6 @@ def rmsnorm_4096_token_body(
     y_ub,
     rstd_ub,
     reduce_scratch,
-    x_frag,
     w_frag,
     eps: pto.f32,
     ping: pto.i32,
@@ -74,12 +73,13 @@ def rmsnorm_4096_token_body(
     hidden_size: pto.const_expr = 4096,
 ):
     tx = pto.get_tid_x()
-    local_sum = pto.const(0.0, dtype=pto.f32)
+    frag_elems: pto.const_expr = rounds * lanes
+    x_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local")
+    sum_sq = pto.alloc_buffer((1,), pto.f32, scope="local")
 
-    sum_loop = pto.for_(0, rounds, step=1).carry(local_sum=local_sum)
-    with sum_loop:
-        r = sum_loop.iv
-        local_sum = sum_loop.local_sum
+    scalar.store(pto.const(0.0, dtype=pto.f32), sum_sq, 0)
+
+    with pto.for_(0, rounds, step=1) as r:
         lane_offset = r * threads * lanes + tx * lanes
         x_offset = ping * hidden_size + lane_offset
         frag_offset = r * lanes
@@ -88,11 +88,12 @@ def rmsnorm_4096_token_body(
         scalar.store(x_vec, x_frag, frag_offset)
 
         for lane in pto.static_range(0, lanes):
+            local_sum = scalar.load(sum_sq, 0)
             x = scalar.load(x_frag, frag_offset + lane)
             local_sum = local_sum + x * x
-        sum_loop.update(local_sum=local_sum)
+            scalar.store(local_sum, sum_sq, 0)
 
-    local_sum = sum_loop.final("local_sum")
+    local_sum = scalar.load(sum_sq, 0)
 
     sum_sq = pto.simt_allreduce_sum(
         local_sum,
@@ -106,18 +107,18 @@ def rmsnorm_4096_token_body(
 
     with pto.if_(tx == 0) as br:
         with br.then_:
-            scalar.store(rstd, rstd_ub, ping)
+            scalar.store(rstd, rstd_ub, ping * 8)
 
     with pto.for_(0, rounds, step=1) as r:
         lane_offset = r * threads * lanes + tx * lanes
         y_offset = ping * hidden_size + lane_offset
         frag_offset = r * lanes
 
-        for lane in pto.static_range(0, lanes):
-            x = scalar.load(x_frag, frag_offset + lane)
-            w = scalar.load(w_frag, frag_offset + lane)
-            y = x * rstd * w
-            scalar.store(y, y_ub, y_offset + lane)
+        x_vec = scalar.load(x_frag, frag_offset, contiguous=lanes)
+        w_vec = scalar.load(w_frag, frag_offset, contiguous=lanes)
+        rstd_vec = pto.vec(pto.f32, lanes, init=rstd)
+        y_vec = x_vec * rstd_vec * w_vec
+        scalar.store(y_vec, y_ub, y_offset)
 
 
 @pto.jit(target="a5", mode="explicit")
@@ -142,10 +143,9 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     w_ub = pto.alloc_buffer((hidden_size,), pto.f32, scope="ub")
     x_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
     y_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
-    rstd_ub = pto.alloc_buffer((2,), pto.f32, scope="ub")
+    rstd_ub = pto.alloc_buffer((2, 8), pto.f32, scope="ub")
     reduce_scratch = pto.alloc_buffer((threads,), pto.f32, scope="ub")
 
-    x_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local")
     w_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local", persistent=True)
 
     pto.mte_gm_ub(
@@ -183,7 +183,6 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
                 y_ub,
                 rstd_ub,
                 reduce_scratch,
-                x_frag,
                 w_frag,
                 eps,
                 ping,
@@ -194,14 +193,14 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
             )
 
         pto.mte_ub_gm(
-            y_ub,
+            pto.addptr(y_ub, ping * hidden_size),
             pto.addptr(Y, token_id * hidden_size),
             hidden_size * 4,
             nburst=(1, hidden_size * 4, hidden_size * 4),
         )
 
         pto.mte_ub_gm(
-            rstd_ub,
+            pto.addptr(rstd_ub, ping * 8),
             pto.addptr(RSTD, token_id),
             4,
             nburst=(1, 4, 4),
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index 00e21ff30f..09d1b64e52 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -906,7 +906,7 @@ def rmsnorm_alloc_buffer_layout_probe(
     w_ub = pto.alloc_buffer((4096,), pto.f32, scope="ub")
     x_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
     y_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
-    rstd_ub = pto.alloc_buffer((2,), pto.f32, scope="ub")
+    rstd_ub = pto.alloc_buffer((2, 8), pto.f32, scope="ub")
     reduce_scratch = pto.alloc_buffer((128,), pto.f32, scope="ub")
 
     pto.mte_gm_ub(W, w_ub, 0, 4096 * 4, nburst=(1, 0, 0))
@@ -4045,10 +4045,10 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     rmsnorm_alloc_buffer_text = rmsnorm_alloc_buffer_layout_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm alloc_buffer layout specialization")
     expect(
-        "dyn_shared_memory_buf = 82464 : i64" in rmsnorm_alloc_buffer_text,
+        "dyn_shared_memory_buf = 82496 : i64" in rmsnorm_alloc_buffer_text,
         "RMSNorm alloc_buffer layout should reserve the same UB scratch size as the expanded RMSNorm kernel",
     )
-    for expected_offset in (16384, 49152, 81920, 81952):
+    for expected_offset in (16384, 49152, 81920, 81984):
         expect(
             f"arith.constant {expected_offset} : index" in rmsnorm_alloc_buffer_text,
             f"RMSNorm alloc_buffer layout should materialize UB byte offset {expected_offset}",
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 94d981ddc6..5ba54f4c26 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -9,6 +9,7 @@
 
 from importlib.util import module_from_spec, spec_from_file_location
 from pathlib import Path
+import re
 import sys
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
@@ -77,6 +78,29 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
         text.count("pto.store ") <= 8,
         f"{label}: SIMT inner loops should not be trace-time expanded into many scalar stores",
     )
+    expect(text.count("llvm.alloca") == 3, f"{label}: expected w_frag plus x_frag/sum_sq local buffers")
+    expect(
+        re.search(
+            r"func\.func @inline_simt_1__ptodsl_[^{]+\{(?:(?!func\.func @).)*"
+            r"llvm\.alloca(?:(?!func\.func @).)*llvm\.alloca",
+            text,
+            re.S,
+        )
+        is not None,
+        f"{label}: x_frag and sum_sq should be allocated inside the token SIMT helper",
+    )
+    expect(
+        re.search(
+            rf"llvm\.insertelement .* : {re.escape(vector_type)}(?:(?!func\.func @).)*"
+            rf"arith\.mulf .* : {re.escape(vector_type)}(?:(?!func\.func @).)*"
+            rf"arith\.mulf .* : {re.escape(vector_type)}(?:(?!func\.func @).)*"
+            rf"llvm\.store .* : {re.escape(vector_type)}",
+            text,
+            re.S,
+        )
+        is not None,
+        f"{label}: y = x * rstd * w should lower as vector broadcast/mul/store",
+    )
 
 
 def main() -> None:
@@ -90,14 +114,14 @@ def main() -> None:
         label="x128",
         vector_type="vector<2xf32>",
         helper_name_fragment="__tl_allreduce_sum_f32_t128_s1_o0",
-        ub_size=82464,
+        ub_size=82496,
     )
     check_variant(
         example.build_x64(),
         label="x64",
         vector_type="vector<4xf32>",
         helper_name_fragment="__tl_allreduce_sum_f32_t64_s1_o0",
-        ub_size=82208,
+        ub_size=82240,
     )
 
     print("ptodsl_rmsnorm_example_compile: PASS")

From 4e7e957b0d463d2fa80298ce705e7750e3f51a38 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 21:29:13 +0800
Subject: [PATCH 12/37] test(ptodsl): match rmsnorm rstd store

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index f0b440b3e1..0d975468d2 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -105,9 +105,7 @@ def rmsnorm_4096_token_body(
 
     rstd = 1.0 / scalar.sqrt(sum_sq / hidden_size + eps)
 
-    with pto.if_(tx == 0) as br:
-        with br.then_:
-            scalar.store(rstd, rstd_ub, ping * 8)
+    scalar.store(rstd, rstd_ub, ping * 8)
 
     with pto.for_(0, rounds, step=1) as r:
         lane_offset = r * threads * lanes + tx * lanes

From 3df78feef9d5b8c0763fe5ce383ebed2654e8d4e Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 21:46:13 +0800
Subject: [PATCH 13/37] test(ptodsl): rename rmsnorm simt helper

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 0d975468d2..b5a57d3280 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -58,7 +58,7 @@ def init_weight_fragment_body(
         scalar.store(w_vec, w_frag, frag_offset)
 
 
-def rmsnorm_4096_token_body(
+def rmsnorm_simt_token_body(
     x_ub,
     y_ub,
     rstd_ub,
@@ -176,7 +176,7 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
         )
 
         with pto.simt():
-            rmsnorm_4096_token_body(
+            rmsnorm_simt_token_body(
                 x_ub,
                 y_ub,
                 rstd_ub,

From ce139daf36c229ba2ee618660739c764a4c41fb1 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Thu, 25 Jun 2026 22:12:37 +0800
Subject: [PATCH 14/37] test(ptodsl): validate rmsnorm simt partition

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 23 ++++++++++-------
 ptodsl/tests/test_rmsnorm_example_compile.py | 26 ++++++++++++++++++++
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index b5a57d3280..26577907dd 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -134,7 +134,12 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     hidden_size: pto.const_expr = 4096,
     n_cores: pto.const_expr = 64,
     tokens_per_core: pto.const_expr = 64,
+    f32_bytes: pto.const_expr = 4,
 ):
+    assert threads * rounds * lanes == hidden_size, (
+        "threads * rounds * lanes must equal hidden_size for RMSNorm SIMT partitioning"
+    )
+
     core_id = pto.get_block_idx()
     frag_elems: pto.const_expr = rounds * lanes
 
@@ -150,8 +155,8 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
         W,
         w_ub,
         0,
-        hidden_size * 4,
-        nburst=(1, hidden_size * 4, hidden_size * 4),
+        hidden_size * f32_bytes,
+        nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
     )
 
     with pto.simt():
@@ -170,9 +175,9 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
         pto.mte_gm_ub(
             pto.addptr(X, token_id * hidden_size),
             x_ub,
-            ping * hidden_size * 4,
-            hidden_size * 4,
-            nburst=(1, hidden_size * 4, hidden_size * 4),
+            ping * hidden_size * f32_bytes,
+            hidden_size * f32_bytes,
+            nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
         )
 
         with pto.simt():
@@ -193,15 +198,15 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
         pto.mte_ub_gm(
             pto.addptr(y_ub, ping * hidden_size),
             pto.addptr(Y, token_id * hidden_size),
-            hidden_size * 4,
-            nburst=(1, hidden_size * 4, hidden_size * 4),
+            hidden_size * f32_bytes,
+            nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
         )
 
         pto.mte_ub_gm(
             pto.addptr(rstd_ub, ping * 8),
             pto.addptr(RSTD, token_id),
-            4,
-            nburst=(1, 4, 4),
+            f32_bytes,
+            nburst=(1, f32_bytes, f32_bytes),
         )
 
 
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 5ba54f4c26..67dc979b87 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -24,6 +24,22 @@ def expect(condition: bool, message: str) -> None:
         raise AssertionError(message)
 
 
+def expect_raises(exc_type, func, message_substring: str | None = None) -> Exception:
+    try:
+        func()
+    except exc_type as exc:
+        if message_substring is not None and message_substring not in str(exc):
+            raise AssertionError(
+                f"expected {exc_type.__name__} containing {message_substring!r}, got {exc!r}"
+            ) from exc
+        return exc
+    except Exception as exc:
+        raise AssertionError(
+            f"expected {exc_type.__name__}, got {exc.__class__.__name__}: {exc}"
+        ) from exc
+    raise AssertionError(f"expected {exc_type.__name__} to be raised")
+
+
 def expect_parse_roundtrip_and_verify(text: str, label: str) -> None:
     with make_context() as ctx:
         parsed = Module.parse(text, ctx)
@@ -108,6 +124,16 @@ def main() -> None:
 
     expect(hasattr(example, "build_x128"), "RMSNorm example should export build_x128()")
     expect(hasattr(example, "build_x64"), "RMSNorm example should export build_x64()")
+    expect_raises(
+        AssertionError,
+        lambda: example.rmsnorm_4096_alloc_buffer_simt_context_kernel.compile(
+            threads=128,
+            rounds=16,
+            lanes=2,
+            hidden_size=4097,
+        ),
+        "threads * rounds * lanes must equal hidden_size",
+    )
 
     check_variant(
         example.build_x128(),

From e3f97467d4328780c12d3597af35ce540e073719 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 08:44:55 +0800
Subject: [PATCH 15/37] docs(ptodsl): clarify alloc buffer parameters

---
 .../user_guide/04-type-system-and-buffer.md   | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index 276fca8e96..990532610f 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -177,7 +177,11 @@ ptr_ub  = pto.ptr(pto.f16, pto.MemorySpace.UB)
 
 ## 4.5 Explicit scratch buffers
 
-Use `pto.alloc_buffer(...)` in explicit-mode kernels to allocate scratch storage that is addressed through pointer-style operations:
+Use `pto.alloc_buffer(...)` in explicit-mode kernels to allocate scratch storage that is addressed through pointer-style operations.
+
+```text
+pto.alloc_buffer(shape, dtype, *, scope="ub", persistent=False)
+```
 
 <!-- ptodsl-doc-pending: {"reason":"illustrative fragment; covered by test_jit_compile alloc_buffer probes"} -->
 ```python
@@ -185,12 +189,22 @@ ub_scratch = pto.alloc_buffer((4096,), pto.f32, scope="ub")
 fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
 ```
 
-| Scope | Storage | Returned value | Typical use | Layout notes |
-|-------|---------|----------------|-------------|--------------|
-| `"ub"` | Function-level Unified Buffer scratch | Typed `!pto.ptr<T, ub>` | MTE source/destination buffers, cross-SIMT scratch such as reductions | Contributes to `dyn_shared_memory_buf`; the frontend may insert alignment padding between allocations |
-| `"local"` | SIMT-helper local storage | Typed local pointer backed by `llvm.alloca` | Per-workitem fragments such as `x_frag[]` and `w_frag[]` | Lives inside the active SIMT helper; `persistent=True` is lifetime metadata and does not change the pointer type |
-
-Shapes must be static positive integers so the frontend can compute storage size and layout while tracing. `alloc_buffer` lowers directly to the pointer arithmetic and local allocation operations needed by the kernel; it does not introduce a new high-level PTO IR operation.
+| Parameter | Description |
+|-----------|-------------|
+| `shape` | Static positive integer shape. Pass an `int`, `tuple[int, ...]`, or `list[int]`. |
+| `dtype` | Element type of the returned buffer, such as `pto.f32` or `pto.i32`. |
+| `scope` | Scratch storage kind. Recommended values are `"ub"` and `"local"`; `"vec"` aliases `"ub"`, and `"private"` aliases `"local"`. |
+| `persistent` | Lifetime metadata for the frontend. It does not change the returned pointer type. |
+
+| Scope | Meaning | Returned value |
+|-------|---------|----------------|
+| `"ub"` | Function-level Unified Buffer scratch, typically used by MTE transfers or shared SIMT scratch. | Typed `!pto.ptr<T, ub>` |
+| `"local"` | SIMT-helper local scratch for per-workitem temporary fragments. | Typed local pointer backed by `llvm.alloca` |
+
+For `"ub"` buffers, the generated kernel records the total required UB scratch
+size in `dyn_shared_memory_buf`, measured in bytes and including any frontend
+alignment padding. `alloc_buffer` lowers directly to pointer arithmetic or local
+allocation operations; it does not introduce a new high-level PTO IR operation.
 
 ## 4.6 TensorView
 

From ab7239db1a6cc6d7637223649e2914a432ab08a1 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 09:25:58 +0800
Subject: [PATCH 16/37] docs(ptodsl): restrict alloc buffer scope

---
 .../user_guide/04-type-system-and-buffer.md   | 17 +++++------
 ptodsl/ptodsl/_ops.py                         | 17 ++++++-----
 ptodsl/tests/test_jit_compile.py              | 30 +++++++++++++++++++
 3 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index 990532610f..f4b389312e 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -177,7 +177,7 @@ ptr_ub  = pto.ptr(pto.f16, pto.MemorySpace.UB)
 
 ## 4.5 Explicit scratch buffers
 
-Use `pto.alloc_buffer(...)` in explicit-mode kernels to allocate scratch storage that is addressed through pointer-style operations.
+Allocate explicit scratch storage for pointer-style load, store, and data movement operations.
 
 ```text
 pto.alloc_buffer(shape, dtype, *, scope="ub", persistent=False)
@@ -193,18 +193,17 @@ fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
 |-----------|-------------|
 | `shape` | Static positive integer shape. Pass an `int`, `tuple[int, ...]`, or `list[int]`. |
 | `dtype` | Element type of the returned buffer, such as `pto.f32` or `pto.i32`. |
-| `scope` | Scratch storage kind. Recommended values are `"ub"` and `"local"`; `"vec"` aliases `"ub"`, and `"private"` aliases `"local"`. |
-| `persistent` | Lifetime metadata for the frontend. It does not change the returned pointer type. |
+| `scope` | Scratch storage kind. Use `"ub"` or `"local"`. |
+| `persistent` | Optional Boolean, either `True` or `False`; the default is `False`. It is frontend metadata and does not change the returned pointer type. |
 
 | Scope | Meaning | Returned value |
 |-------|---------|----------------|
-| `"ub"` | Function-level Unified Buffer scratch, typically used by MTE transfers or shared SIMT scratch. | Typed `!pto.ptr<T, ub>` |
-| `"local"` | SIMT-helper local scratch for per-workitem temporary fragments. | Typed local pointer backed by `llvm.alloca` |
+| `"ub"` | Function-level Unified Buffer scratch, typically used by data movement operations or shared SIMT scratch. | Typed UB pointer |
+| `"local"` | SIMT-helper local scratch for per-workitem temporary fragments. | Typed local pointer |
 
-For `"ub"` buffers, the generated kernel records the total required UB scratch
-size in `dyn_shared_memory_buf`, measured in bytes and including any frontend
-alignment padding. `alloc_buffer` lowers directly to pointer arithmetic or local
-allocation operations; it does not introduce a new high-level PTO IR operation.
+A `"ub"` buffer is available throughout the generated kernel body, regardless
+of the `persistent` value. A `"local"` buffer is available only inside the SIMT
+helper invocation that allocates it.
 
 ## 4.6 TensorView
 
diff --git a/ptodsl/ptodsl/_ops.py b/ptodsl/ptodsl/_ops.py
index 723e5878a7..7c9942b9eb 100644
--- a/ptodsl/ptodsl/_ops.py
+++ b/ptodsl/ptodsl/_ops.py
@@ -2330,6 +2330,7 @@ def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
     """
     _require_explicit_mode("pto.alloc_buffer(...)")
     normalized_scope = _normalize_alloc_buffer_scope(scope)
+    persistent = _normalize_alloc_buffer_persistent(persistent)
     element_type = _resolve(dtype)
     element_count = _static_alloc_buffer_element_count(shape)
     elem_bytes = _element_bytewidth(element_type)
@@ -2358,21 +2359,21 @@ def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
 
 def _normalize_alloc_buffer_scope(scope):
     if not isinstance(scope, str):
-        try:
-            space = _normalize_address_space(scope)
-        except Exception:
-            space = None
-        if space == _pto.AddressSpace.VEC:
-            return "ub"
         raise TypeError("pto.alloc_buffer(..., scope=...) expects 'ub' or 'local'")
     normalized = scope.strip().lower()
-    if normalized in {"ub", "vec"}:
+    if normalized == "ub":
         return "ub"
-    if normalized in {"local", "private"}:
+    if normalized == "local":
         return "local"
     raise ValueError("pto.alloc_buffer(..., scope=...) expects one of 'ub' or 'local'")
 
 
+def _normalize_alloc_buffer_persistent(persistent):
+    if not isinstance(persistent, bool):
+        raise TypeError("pto.alloc_buffer(..., persistent=...) expects True or False")
+    return persistent
+
+
 def _static_alloc_buffer_element_count(shape):
     if isinstance(shape, int):
         dims = (shape,)
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index 09d1b64e52..ed575acb0c 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -884,6 +884,21 @@ def alloc_buffer_local_probe():
     alloc_buffer_local_helper()
 
 
+@pto.jit(target="a5", mode="explicit")
+def alloc_buffer_vec_scope_probe():
+    _ = pto.alloc_buffer((1,), pto.f32, scope="vec")
+
+
+@pto.jit(target="a5", mode="explicit")
+def alloc_buffer_private_scope_probe():
+    _ = pto.alloc_buffer((1,), pto.f32, scope="private")
+
+
+@pto.jit(target="a5", mode="explicit")
+def alloc_buffer_non_bool_persistent_probe():
+    _ = pto.alloc_buffer((1,), pto.f32, scope="local", persistent=1)
+
+
 @pto.simt
 def rmsnorm_alloc_buffer_frag_helper(
     w_ub: pto.ptr(pto.f32, pto.MemorySpace.UB),
@@ -4041,6 +4056,21 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
         is not None,
         "alloc_buffer(scope='local') probe should keep allocation inside the SIMT helper body",
     )
+    expect_raises(
+        ValueError,
+        lambda: alloc_buffer_vec_scope_probe.compile(),
+        "expects one of 'ub' or 'local'",
+    )
+    expect_raises(
+        ValueError,
+        lambda: alloc_buffer_private_scope_probe.compile(),
+        "expects one of 'ub' or 'local'",
+    )
+    expect_raises(
+        TypeError,
+        lambda: alloc_buffer_non_bool_persistent_probe.compile(),
+        "expects True or False",
+    )
 
     rmsnorm_alloc_buffer_text = rmsnorm_alloc_buffer_layout_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm alloc_buffer layout specialization")

From 0c4e6340a95f91aaedcca117f415c80c8ab6787b Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 09:37:59 +0800
Subject: [PATCH 17/37] docs(ptodsl): split scalar contiguous access docs

---
 .../user_guide/06-scalar-and-pointer-ops.md   | 79 +++++++++++++++----
 1 file changed, 62 insertions(+), 17 deletions(-)

diff --git a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
index 2e509ca8d3..c2efaceeea 100644
--- a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
+++ b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
@@ -35,26 +35,28 @@ When in doubt, ask: *can this value change between launches of the same compiled
 
 ## 6.2 Scalar access: load and store
 
-`scalar.load` reads one scalar element from a typed pointer or tile location. With `contiguous=N`, it reads `N` adjacent elements as a builtin MLIR vector value. `scalar.store` writes either a scalar or one of those builtin vector values back. These are the canonical memory ops for SIMT authoring. Offsets are counted in elements, not bytes.
+`scalar.load` and `scalar.store` access typed pointers and tile locations.
+Offsets are counted in elements, not bytes.
 
-#### `scalar.load(ptr: PtrType, offset: Index, *, contiguous: int | None = None) -> ScalarType | VecValue`
+### Scalar load
 
-**Description**: Loads one scalar element from a typed pointer at the given element offset, or `contiguous` adjacent elements as `vector<NxT>`.
+#### `scalar.load(ptr_or_ref, offset=None) -> ScalarType`
+
+**Description**: Loads one scalar element from a typed pointer, pointer view, or
+tile element reference.
 
 **Parameters**:
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `ptr` | `PtrType` | Typed pointer (`pto.ptr<T, space>`) or the result of `tile.as_ptr()` |
-| `offset` | `Index` | Element displacement from `ptr` |
-| `contiguous` | `int` or `None` | `None` and `1` load one scalar; `N > 1` loads `N` adjacent elements |
+| `ptr_or_ref` | Typed pointer, pointer view, or tile element reference | Source location |
+| `offset` | `Index` or `None` | Element displacement from `ptr_or_ref`; omit when the offset is already encoded in `ptr_or_ref` |
 
 **Returns**:
 
 | Return Value | Type | Description |
 |--------------|------|-------------|
 | `value` | `ScalarType` | The loaded scalar, matching the pointer's element type |
-| `value` | `pto.vec(T, N)` | Returned when `contiguous=N > 1`; lowers as builtin `vector<NxT>` |
 
 **Tile-index form** — the preferred syntax when loading from a tile:
 
@@ -73,28 +75,52 @@ val = scalar.load(ptr, offset)       # explicit offset
 val = scalar.load(ptr + offset)      # pointer arithmetic shorthand
 ```
 
-**Contiguous vector form**:
+### Contiguous vector load
+
+#### `scalar.load(ptr_or_ref, offset=None, *, contiguous: int) -> VecValue`
+
+**Description**: Loads `contiguous` adjacent elements from a typed pointer as
+one vector value.
+
+**Parameters**:
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `ptr_or_ref` | Typed pointer or pointer view | Source location |
+| `offset` | `Index` or `None` | First element to load; omit when the offset is already encoded in `ptr_or_ref` |
+| `contiguous` | Positive Python `int` greater than `1` | Number of adjacent elements to load |
+
+**Returns**:
+
+| Return Value | Type | Description |
+|--------------|------|-------------|
+| `value` | `pto.vec(T, N)` | Vector value with `N == contiguous` and element type `T` |
+
+**Example**:
 
 ```python
 x4 = scalar.load(ptr, offset, contiguous=4)
 ```
 
-For a `pto.ptr(pto.f32, "ub")`, this produces a value with DSL type `pto.vec(pto.f32, 4)` and MLIR type `vector<4xf32>`. The frontend lowers this directly to low-level pointer arithmetic plus an LLVM vector load; it does not introduce a new PTO semantic op.
+For a `pto.ptr(pto.f32, "ub")`, this produces a DSL vector value with type
+`pto.vec(pto.f32, 4)`.
 
 ---
 
-#### `scalar.store(value: ScalarType | VecValue, ptr: PtrType, offset: Index, *, contiguous: int | None = None) -> None`
+### Scalar store
+
+#### `scalar.store(value: ScalarType, ptr_or_ref, offset=None) -> None`
 
-**Description**: Stores one scalar element or a builtin vector value to a typed pointer at the given element offset.
+**Description**: Stores one scalar element to a typed pointer, pointer view, or
+tile element reference.
 
 **Parameters**:
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `value` | `ScalarType` or `pto.vec(T, N)` | Scalar value or contiguous vector value to write |
-| `ptr` | `PtrType` | Typed destination pointer |
-| `offset` | `Index` | Element displacement from `ptr` |
-| `contiguous` | `int` or `None` | Optional width check for vector stores; if provided, it must match the vector lane count |
+| `value` | `ScalarType` | Scalar value to write |
+| `ptr_or_ref` | Typed pointer, pointer view, or tile element reference | Destination location |
+| `offset` | `Index` or `None` | Element displacement from `ptr_or_ref`; omit when the offset is already encoded in `ptr_or_ref` |
 
 **Returns**: None (side-effect operation).
 
@@ -112,14 +138,33 @@ scalar.store(value, tile[row, col])
 scalar.store(value, ptr, offset)
 ```
 
-**Contiguous vector form**:
+### Contiguous vector store
+
+#### `scalar.store(value: VecValue, ptr_or_ref, offset=None, *, contiguous: int | None = None) -> None`
+
+**Description**: Stores a vector value to adjacent elements of a typed pointer.
+The store width is taken from the vector lane count. If `contiguous` is
+provided, it must match that lane count.
+
+**Parameters**:
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `value` | `pto.vec(T, N)` | Vector value to write |
+| `ptr_or_ref` | Typed pointer or pointer view | Destination location |
+| `offset` | `Index` or `None` | First element to store; omit when the offset is already encoded in `ptr_or_ref` |
+| `contiguous` | `int` or `None` | Optional width check; when provided, it must equal `N` |
+
+**Example**:
 
 ```python
 scalar.store(x4, ptr, offset)
 scalar.store(x4, ptr, offset, contiguous=4)  # optional width check
 ```
 
-Vector stores lower directly to an LLVM vector store. Scalar stores remain scalar stores; `scalar.store(scalar_value, ptr, offset, contiguous=N)` is rejected because scalar values are not implicitly broadcast for stores.
+`scalar.store(scalar_value, ptr, offset, contiguous=N)` is rejected because
+scalar values are not implicitly broadcast for vector stores. Use `pto.vec(...)`
+to build an explicit vector value first.
 
 #### `pto.vec(dtype, lanes, *, init=None)`
 

From e7fa2e9a2c2393e941ce9051a4b73108e303d43c Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 09:44:08 +0800
Subject: [PATCH 18/37] docs(ptodsl): preserve scalar access wording

---
 .../user_guide/06-scalar-and-pointer-ops.md   | 92 +++++++++----------
 1 file changed, 45 insertions(+), 47 deletions(-)

diff --git a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
index c2efaceeea..b08ed57e6f 100644
--- a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
+++ b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
@@ -35,22 +35,20 @@ When in doubt, ask: *can this value change between launches of the same compiled
 
 ## 6.2 Scalar access: load and store
 
-`scalar.load` and `scalar.store` access typed pointers and tile locations.
-Offsets are counted in elements, not bytes.
+`scalar.load` reads one scalar element from a typed pointer or tile location.
+`scalar.store` writes one scalar element back. These are the canonical scalar
+memory ops for SIMT authoring. Offsets are counted in elements, not bytes.
 
-### Scalar load
+#### `scalar.load(ptr: PtrType, offset: Index) -> ScalarType`
 
-#### `scalar.load(ptr_or_ref, offset=None) -> ScalarType`
-
-**Description**: Loads one scalar element from a typed pointer, pointer view, or
-tile element reference.
+**Description**: Loads one scalar element from a typed pointer at the given element offset.
 
 **Parameters**:
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `ptr_or_ref` | Typed pointer, pointer view, or tile element reference | Source location |
-| `offset` | `Index` or `None` | Element displacement from `ptr_or_ref`; omit when the offset is already encoded in `ptr_or_ref` |
+| `ptr` | `PtrType` | Typed pointer (`pto.ptr<T, space>`) or the result of `tile.as_ptr()` |
+| `offset` | `Index` | Element displacement from `ptr` |
 
 **Returns**:
 
@@ -75,72 +73,72 @@ val = scalar.load(ptr, offset)       # explicit offset
 val = scalar.load(ptr + offset)      # pointer arithmetic shorthand
 ```
 
-### Contiguous vector load
+---
 
-#### `scalar.load(ptr_or_ref, offset=None, *, contiguous: int) -> VecValue`
+#### `scalar.store(value: ScalarType, ptr: PtrType, offset: Index) -> None`
 
-**Description**: Loads `contiguous` adjacent elements from a typed pointer as
-one vector value.
+**Description**: Stores one scalar element to a typed pointer at the given element offset.
 
 **Parameters**:
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `ptr_or_ref` | Typed pointer or pointer view | Source location |
-| `offset` | `Index` or `None` | First element to load; omit when the offset is already encoded in `ptr_or_ref` |
-| `contiguous` | Positive Python `int` greater than `1` | Number of adjacent elements to load |
-
-**Returns**:
+| `value` | `ScalarType` | Scalar value to write |
+| `ptr` | `PtrType` | Typed destination pointer |
+| `offset` | `Index` | Element displacement from `ptr` |
 
-| Return Value | Type | Description |
-|--------------|------|-------------|
-| `value` | `pto.vec(T, N)` | Vector value with `N == contiguous` and element type `T` |
+**Returns**: None (side-effect operation).
 
-**Example**:
+**Tile-index form**:
 
+<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"scalar_ops.tile_access","symbol":"scalar_ops_tile_access_probe","compile":{}} -->
 ```python
-x4 = scalar.load(ptr, offset, contiguous=4)
+scalar.store(value, tile[row, col])
 ```
 
-For a `pto.ptr(pto.f32, "ub")`, this produces a DSL vector value with type
-`pto.vec(pto.f32, 4)`.
+**Pointer forms**:
 
----
+<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"scalar_ops.tile_access","symbol":"scalar_ops_tile_access_probe","compile":{}} -->
+```python
+scalar.store(value, ptr, offset)
+```
 
-### Scalar store
+### Contiguous vector access
 
-#### `scalar.store(value: ScalarType, ptr_or_ref, offset=None) -> None`
+Use `contiguous=N` when a single work-item should read or write `N` adjacent
+elements as one vector value. `N` must be a positive Python integer greater than
+`1`.
 
-**Description**: Stores one scalar element to a typed pointer, pointer view, or
-tile element reference.
+#### `scalar.load(ptr: PtrType, offset: Index, *, contiguous: int) -> VecValue`
+
+**Description**: Loads `contiguous` adjacent elements from a typed pointer.
 
 **Parameters**:
 
 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `value` | `ScalarType` | Scalar value to write |
-| `ptr_or_ref` | Typed pointer, pointer view, or tile element reference | Destination location |
-| `offset` | `Index` or `None` | Element displacement from `ptr_or_ref`; omit when the offset is already encoded in `ptr_or_ref` |
-
-**Returns**: None (side-effect operation).
+| `ptr` | `PtrType` | Typed source pointer |
+| `offset` | `Index` | First element to load |
+| `contiguous` | Positive Python `int` greater than `1` | Number of adjacent elements to load |
 
-**Tile-index form**:
+**Returns**:
 
-<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"scalar_ops.tile_access","symbol":"scalar_ops_tile_access_probe","compile":{}} -->
-```python
-scalar.store(value, tile[row, col])
-```
+| Return Value | Type | Description |
+|--------------|------|-------------|
+| `value` | `pto.vec(T, N)` | Vector value with `N == contiguous` and element type `T` |
 
-**Pointer forms**:
+**Example**:
 
-<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"scalar_ops.tile_access","symbol":"scalar_ops_tile_access_probe","compile":{}} -->
 ```python
-scalar.store(value, ptr, offset)
+x4 = scalar.load(ptr, offset, contiguous=4)
 ```
 
-### Contiguous vector store
+For a `pto.ptr(pto.f32, "ub")`, this produces a DSL vector value with type
+`pto.vec(pto.f32, 4)`.
+
+---
 
-#### `scalar.store(value: VecValue, ptr_or_ref, offset=None, *, contiguous: int | None = None) -> None`
+#### `scalar.store(value: VecValue, ptr: PtrType, offset: Index, *, contiguous: int | None = None) -> None`
 
 **Description**: Stores a vector value to adjacent elements of a typed pointer.
 The store width is taken from the vector lane count. If `contiguous` is
@@ -151,8 +149,8 @@ provided, it must match that lane count.
 | Parameter | Type | Description |
 |-----------|------|-------------|
 | `value` | `pto.vec(T, N)` | Vector value to write |
-| `ptr_or_ref` | Typed pointer or pointer view | Destination location |
-| `offset` | `Index` or `None` | First element to store; omit when the offset is already encoded in `ptr_or_ref` |
+| `ptr` | `PtrType` | Typed destination pointer |
+| `offset` | `Index` | First element to store |
 | `contiguous` | `int` or `None` | Optional width check; when provided, it must equal `N` |
 
 **Example**:

From e69811c2c813a4d375117648734a25f4fc08e0e0 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 10:03:39 +0800
Subject: [PATCH 19/37] docs(ptodsl): move builtin vector docs

---
 .../user_guide/06-scalar-and-pointer-ops.md   | 15 +------
 .../docs/user_guide/08-compute-operations.md  | 40 +++++++++++++++++++
 2 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
index b08ed57e6f..4db3e5acb6 100644
--- a/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
+++ b/ptodsl/docs/user_guide/06-scalar-and-pointer-ops.md
@@ -161,19 +161,8 @@ scalar.store(x4, ptr, offset, contiguous=4)  # optional width check
 ```
 
 `scalar.store(scalar_value, ptr, offset, contiguous=N)` is rejected because
-scalar values are not implicitly broadcast for vector stores. Use `pto.vec(...)`
-to build an explicit vector value first.
-
-#### `pto.vec(dtype, lanes, *, init=None)`
-
-`pto.vec(dtype, lanes)` names a builtin vector type such as `vector<4xf32>`. When `init` is provided, it constructs a vector value. A scalar initializer is broadcast to every lane:
-
-```python
-rstd4 = pto.vec(pto.f32, 4, init=rstd)
-y4 = x4 * rstd4
-```
-
-The initial vector arithmetic surface is intentionally narrow: multiplication of compatible `VecValue` operands lowers to elementwise `arith.mulf` on builtin vector types.
+scalar values are not implicitly broadcast for vector stores. To build an
+explicit broadcast vector, use `pto.vec(...)`; see Section 8.4.
 
 ### Scalar value adaptation
 
diff --git a/ptodsl/docs/user_guide/08-compute-operations.md b/ptodsl/docs/user_guide/08-compute-operations.md
index ac4e32a3a9..5c14332ca8 100644
--- a/ptodsl/docs/user_guide/08-compute-operations.md
+++ b/ptodsl/docs/user_guide/08-compute-operations.md
@@ -1864,3 +1864,43 @@ The `mte_l1_l0a`/`mte_l1_l0b` stage operands from the authored source tiles into
 | `pto.mad_mx_bias(lhs, rhs, dst, bias, m, n, k, **clauses)` | MX-format bias-init matmul |
 
 MX variants require MX-enabled dtypes (f8) and pre-loaded scale payloads. For most users, the standard `mad`, `mad_acc`, and `mad_bias` are the primary interface.
+
+---
+
+## 8.4 Builtin vector values
+
+Builtin vector values are small fixed-lane vectors used by contiguous scalar
+accesses and element-wise vector expressions. They are distinct from the
+`VRegType` values used inside `@pto.simd` kernels.
+
+#### `pto.vec(dtype, lanes, *, init=None)`
+
+**Description**: Names a builtin vector type. When `init` is provided,
+constructs a vector value. A scalar initializer is broadcast to every lane.
+
+**Parameters**:
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `dtype` | PTO dtype | Element type, such as `pto.f32` |
+| `lanes` | Positive Python `int` | Number of lanes |
+| `init` | Scalar value, vector value, or `None` | Optional initializer; scalar values are broadcast to all lanes |
+
+**Returns**:
+
+| Return Value | Type | Description |
+|--------------|------|-------------|
+| `result` | Vector type or `pto.vec(dtype, lanes)` value | Without `init`, returns a vector type descriptor; with `init`, returns a vector value |
+
+**Example**:
+
+<!-- ptodsl-doc-pending: {"reason":"illustrative fragment; covered by test_jit_compile scalar contiguous vector probes"} -->
+```python
+x4 = scalar.load(ptr, offset, contiguous=4)
+rstd4 = pto.vec(pto.f32, 4, init=rstd)
+y4 = x4 * rstd4
+scalar.store(y4, ptr, offset)
+```
+
+Use this form when a scalar value must participate in element-wise arithmetic
+with a vector value returned by `scalar.load(..., contiguous=N)`.

From 0e46cb4f3539030e086adbbc15355dace970ed9e Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 11:22:10 +0800
Subject: [PATCH 20/37] feat(ptodsl): support inline simt launch dimensions

---
 ptodsl/docs/user_guide/01-introduction.md     |  4 ++-
 .../03-kernel-entry-and-subkernels.md         | 11 ++++++
 ptodsl/docs/user_guide/13-simt-micro-ops.md   |  4 +--
 ptodsl/ptodsl/_subkernels.py                  | 21 ++++++++++-
 ptodsl/ptodsl/_tracing/session.py             | 25 ++++++++++---
 ptodsl/tests/test_jit_compile.py              | 36 +++++++++++++++++++
 6 files changed, 93 insertions(+), 8 deletions(-)

diff --git a/ptodsl/docs/user_guide/01-introduction.md b/ptodsl/docs/user_guide/01-introduction.md
index 12eba5ce58..d7d29e8857 100644
--- a/ptodsl/docs/user_guide/01-introduction.md
+++ b/ptodsl/docs/user_guide/01-introduction.md
@@ -257,7 +257,9 @@ These are hardware-bound compute sub-kernels, each mapped to a specific NPU comp
 
 Each can be invoked as a named decorated function (`@pto.cube` /
 `@pto.simd` / `@pto.simt`) or inline as a context manager
-(`with pto.cube():`, `with pto.simd():`, `with pto.simt():`).
+(`with pto.cube():`, `with pto.simd():`, `with pto.simt():`). Inline SIMT
+scopes can also spell launch dimensions directly with
+`with pto.simt(dim_x, dim_y, dim_z):`.
 
 The boundary contract is strict: vreg values do not escape a simd kernel, cube-local state does not leak into UB, and data crosses layer boundaries only through UB-backed tiles or typed UB pointers.
 
diff --git a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
index 7bb02647ca..3852149e28 100644
--- a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
+++ b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
@@ -1001,6 +1001,8 @@ In addition to the decorator form, each sub-kernel unit provides a context
 manager: `with pto.cube():`, `with pto.simd():`, and `with pto.simt():`. These
 open one-off anonymous sub-kernel bodies without requiring a separate named
 Python function. Inline scopes are supported in top-level `@pto.jit` bodies.
+SIMT inline scopes may also spell explicit launch dimensions as
+`with pto.simt(dim_x, dim_y, dim_z):`.
 
 ### Syntax
 
@@ -1022,6 +1024,12 @@ with pto.simt():
     scalar.store(o_next, o_next_tile[row, col])
 ```
 
+```python
+with pto.simt(128, 1, 1):
+    tid = pto.get_tid_x()
+    scalar.store(tid, scratch_ub, scalar.index_cast(tid))
+```
+
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.inline_cube_scope","symbol":"kernel_entry_inline_cube_scope_probe","compile":{"BLOCK_M":16,"BLOCK_K":16,"BLOCK_N":16}} -->
 ```python
 with pto.cube():
@@ -1041,6 +1049,9 @@ with pto.cube():
   / `pto.section.cube` bodies inside the outlined helper.
 - `with pto.simt():` preserves its scalar body inside one outlined
   `pto.simt_entry` helper, and the caller emits `pto.store_vfsimt_info`.
+- `with pto.simt(dim_x, dim_y, dim_z):` uses the same inline outlining and
+  automatic capture rules, but emits a caller-side explicit SIMT launch with
+  the authored dimensions.
 - Values defined inside the inline sub-kernel cannot escape the block directly.
   Use Tiles, typed pointers, or other mutable references to communicate results
   back to the caller.
diff --git a/ptodsl/docs/user_guide/13-simt-micro-ops.md b/ptodsl/docs/user_guide/13-simt-micro-ops.md
index 928cffeba9..18eb196014 100644
--- a/ptodsl/docs/user_guide/13-simt-micro-ops.md
+++ b/ptodsl/docs/user_guide/13-simt-micro-ops.md
@@ -10,8 +10,8 @@ scalar values loaded from tiles.
 #### `pto.store_vfsimt_info(dim_z, dim_y, dim_x) -> None`
 
 **Description**: Emits the low-level VPTO launch descriptor operation. Most
-code should use `body[dim_x, dim_y, dim_z](...)` or `pto.simt_launch(...)`
-instead.
+code should use `body[dim_x, dim_y, dim_z](...)`, `pto.simt_launch(...)`, or
+the inline form `with pto.simt(dim_x, dim_y, dim_z):` instead.
 
 **Parameters**:
 
diff --git a/ptodsl/ptodsl/_subkernels.py b/ptodsl/ptodsl/_subkernels.py
index 1777177074..0f080b500e 100644
--- a/ptodsl/ptodsl/_subkernels.py
+++ b/ptodsl/ptodsl/_subkernels.py
@@ -405,6 +405,7 @@ def __init__(
         ast_rewrite: bool = True,
         simt_max_threads: int | None = None,
         simt_max_regs: int | None = None,
+        simt_inline_dims: tuple | None = None,
     ):
         self._role = role
         self._name = name
@@ -412,9 +413,12 @@ def __init__(
         self._ast_rewrite = ast_rewrite
         self._simt_max_threads = simt_max_threads
         self._simt_max_regs = simt_max_regs
+        self._simt_inline_dims = simt_inline_dims
         self._session_cm = None
 
     def __call__(self, fn):
+        if self._simt_inline_dims is not None:
+            raise TypeError("pto.simt(dim_x, dim_y, dim_z) is only supported as an inline context manager")
         return SubkernelTemplate(
             SubkernelSpec(
                 role=self._role,
@@ -446,6 +450,7 @@ def __enter__(self):
             self._role.value,
             symbol_name,
             self._target,
+            simt_launch_dims=self._simt_inline_dims,
         )
         self._session_cm.__enter__()
         return None
@@ -465,6 +470,7 @@ def _subkernel_decorator(
     ast_rewrite: bool = True,
     simt_max_threads: int | None = None,
     simt_max_regs: int | None = None,
+    simt_inline_dims: tuple | None = None,
 ):
     return _SubkernelSurface(
         role,
@@ -473,6 +479,7 @@ def _subkernel_decorator(
         ast_rewrite=ast_rewrite,
         simt_max_threads=simt_max_threads,
         simt_max_regs=simt_max_regs,
+        simt_inline_dims=simt_inline_dims,
     )
 
 
@@ -485,6 +492,7 @@ def _decorate_subkernel(
     ast_rewrite: bool = True,
     simt_max_threads: int | None = None,
     simt_max_regs: int | None = None,
+    simt_inline_dims: tuple | None = None,
 ):
     if fn is not None:
         return _subkernel_decorator(
@@ -494,6 +502,7 @@ def _decorate_subkernel(
             ast_rewrite=ast_rewrite,
             simt_max_threads=simt_max_threads,
             simt_max_regs=simt_max_regs,
+            simt_inline_dims=simt_inline_dims,
         )(fn)
     return _subkernel_decorator(
         role,
@@ -502,6 +511,7 @@ def _decorate_subkernel(
         ast_rewrite=ast_rewrite,
         simt_max_threads=simt_max_threads,
         simt_max_regs=simt_max_regs,
+        simt_inline_dims=simt_inline_dims,
     )
 
 
@@ -527,7 +537,7 @@ def _validate_simt_resource_attr(name: str, value: int | None) -> int | None:
 
 def simt(
     fn=None,
-    *,
+    *dims,
     name: str | None = None,
     target: str = "a5",
     ast_rewrite: bool = True,
@@ -536,6 +546,14 @@ def simt(
 ):
     max_threads = _validate_simt_resource_attr("max_threads", max_threads)
     max_regs = _validate_simt_resource_attr("max_regs", max_regs)
+    simt_inline_dims = None
+    if fn is not None and not callable(fn):
+        dims = (fn, *dims)
+        fn = None
+    if dims:
+        if len(dims) != 3:
+            raise TypeError("pto.simt(dim_x, dim_y, dim_z) expects exactly three launch dimensions")
+        simt_inline_dims = tuple(dims)
     return _decorate_subkernel(
         KernelRole.SIMT,
         fn,
@@ -544,6 +562,7 @@ def simt(
         ast_rewrite=ast_rewrite,
         simt_max_threads=max_threads,
         simt_max_regs=max_regs,
+        simt_inline_dims=simt_inline_dims,
     )
 
 
diff --git a/ptodsl/ptodsl/_tracing/session.py b/ptodsl/ptodsl/_tracing/session.py
index 6bafea72d5..49f6b76b05 100644
--- a/ptodsl/ptodsl/_tracing/session.py
+++ b/ptodsl/ptodsl/_tracing/session.py
@@ -119,6 +119,7 @@ class InlineSubkernelOutlineFrame:
     owner_symbol_name: str
     wrapper_op: object
     body_block: object
+    simt_launch_dims: tuple | None = None
 
 
 class TraceSession:
@@ -317,7 +318,14 @@ def enter_subkernel_body(self, role: str, symbol_name: str, target: str):
                 raise RuntimeError("PTODSL trace-session subkernel stack corruption detected")
 
     @contextmanager
-    def enter_inline_subkernel(self, role: str, symbol_name: str, target: str):
+    def enter_inline_subkernel(
+        self,
+        role: str,
+        symbol_name: str,
+        target: str,
+        *,
+        simt_launch_dims: tuple | None = None,
+    ):
         """Capture one inline subkernel scope and outline it into a helper on exit."""
         frame = SubkernelTraceFrame(
             role=role,
@@ -331,6 +339,7 @@ def enter_inline_subkernel(self, role: str, symbol_name: str, target: str):
             owner_symbol_name=self.current_function_owner_symbol_name,
             wrapper_op=wrapper_op,
             body_block=body_block,
+            simt_launch_dims=simt_launch_dims,
         )
         self._subkernel_stack.append(frame)
         try:
@@ -441,9 +450,17 @@ def _outline_inline_subkernel(self, outline_frame: InlineSubkernelOutlineFrame)
             )
 
         with InsertionPoint(outline_frame.wrapper_op.operation):
-            if role == "simt":
-                self._emit_simt_helper_launch_metadata()
-            func.CallOp(helper_fn, list(captures))
+            if role == "simt" and outline_frame.simt_launch_dims is not None:
+                dim_x, dim_y, dim_z = _coerce_simt_launch_dims(outline_frame.simt_launch_dims)
+                Operation.create(
+                    "pto.simt_launch",
+                    attributes={"callee": FlatSymbolRefAttr.get(_symbol_name(helper_fn))},
+                    operands=[dim_x, dim_y, dim_z, *captures],
+                )
+            else:
+                if role == "simt":
+                    self._emit_simt_helper_launch_metadata()
+                func.CallOp(helper_fn, list(captures))
 
         entry_block = helper_fn.add_entry_block()
         with InsertionPoint(entry_block):
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index ed575acb0c..a2cdfa557e 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -694,6 +694,17 @@ def inline_subkernel_scope_probe(*, TRACE_TOKEN: pto.const_expr = 0):
         pto.pipe_barrier(pto.Pipe.ALL)
 
 
+@pto.jit(target="a5", mode="explicit")
+def inline_simt_launch_dims_probe(
+    gm: pto.ptr(pto.i32, "gm"),
+    *,
+    TRACE_TOKEN: pto.const_expr = 0,
+):
+    with pto.simt(32, 2, 1):
+        tid = pto.get_tid_x()
+        pto.stg(tid, gm, scalar.index_cast(tid))
+
+
 @pto.simt
 def simt_tid_probe():
     pto.get_tid_x()
@@ -3995,6 +4006,31 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
         "outlined inline helpers should preserve the authored SIMD/Cube sections and SIMT scalar ops",
     )
 
+    inline_simt_launch_text = inline_simt_launch_dims_probe.compile(TRACE_TOKEN=1).mlir_text()
+    expect_parse_roundtrip_and_verify(inline_simt_launch_text, "inline simt launch-dims specialization")
+    expect(
+        re.search(r"pto\.simt_launch @inline_simt_[0-9]+__ptodsl_[0-9a-f]+<<<", inline_simt_launch_text)
+        is not None,
+        "with pto.simt(dim_x, dim_y, dim_z) should emit VPTO simt_launch sugar",
+    )
+    expect(
+        "pto.store_vfsimt_info" not in inline_simt_launch_text,
+        "with pto.simt(dim_x, dim_y, dim_z) should leave launch metadata to simt_launch expansion",
+    )
+    expect(
+        re.search(
+            r"func\.func @inline_simt_[0-9]+__ptodsl_[0-9a-f]+\(%arg0: !pto\.ptr<i32, gm>\) attributes \{[^}]*pto\.simt_entry[^}]*\}",
+            inline_simt_launch_text,
+        )
+        is not None,
+        "inline SIMT launch-dims helper should capture enclosing values as helper arguments",
+    )
+    expect_raises(
+        TypeError,
+        lambda: pto.simt(32, 1),
+        "expects exactly three",
+    )
+
     simt_text = simt_helper_lowering_probe.compile(TRACE_TOKEN=1).mlir_text()
     expect_parse_roundtrip_and_verify(simt_text, "simt helper lowering specialization")
     expect(

From ff75684d452c706296ccedfc57fa4ad4eecc9d98 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 11:44:40 +0800
Subject: [PATCH 21/37] =?UTF-8?q?=E8=B4=B4=E8=BF=91golden=E6=96=B9?=
 =?UTF-8?q?=E4=BE=BF=E5=AF=B9=E6=AF=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ptodsl/README.md                             |  2 +-
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 11 +++++------
 ptodsl/tests/test_rmsnorm_example_compile.py |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/ptodsl/README.md b/ptodsl/README.md
index 43fcbdb5a5..b9dd53a5e4 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -165,7 +165,7 @@ python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x64 > /tmp/rmsnor
 ```
 
 Expected: MLIR containing `@rmsnorm_4096_alloc_buffer_simt_context_kernel`,
-`scf.for`, `vector<2xf32>` for `x128`, `vector<4xf32>` for `x64`, and the
+`scf.for`, `vector<4xf32>` for both `x128` and `x64`, and the
 `__tl_allreduce_sum` helper.
 
 ### Launch artifacts
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 26577907dd..4863fe955f 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -122,15 +122,14 @@ def rmsnorm_simt_token_body(
 @pto.jit(target="a5", mode="explicit")
 def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     X: pto.ptr(pto.f32, "gm"),
-    W: pto.ptr(pto.f32, "gm"),
     Y: pto.ptr(pto.f32, "gm"),
+    W: pto.ptr(pto.f32, "gm"),
     RSTD: pto.ptr(pto.f32, "gm"),
     eps: pto.f32,
-    batch: pto.i32,
     *,
     threads: pto.const_expr = 128,
-    rounds: pto.const_expr = 16,
-    lanes: pto.const_expr = 2,
+    rounds: pto.const_expr = 8,
+    lanes: pto.const_expr = 4,
     hidden_size: pto.const_expr = 4096,
     n_cores: pto.const_expr = 64,
     tokens_per_core: pto.const_expr = 64,
@@ -213,8 +212,8 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
 def build_x128():
     return rmsnorm_4096_alloc_buffer_simt_context_kernel.compile(
         threads=128,
-        rounds=16,
-        lanes=2,
+        rounds=8,
+        lanes=4,
         tokens_per_core=64,
     )
 
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 67dc979b87..f31f9ee53b 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -138,7 +138,7 @@ def main() -> None:
     check_variant(
         example.build_x128(),
         label="x128",
-        vector_type="vector<2xf32>",
+        vector_type="vector<4xf32>",
         helper_name_fragment="__tl_allreduce_sum_f32_t128_s1_o0",
         ub_size=82496,
     )

From 1339499ff9ae3f07b5954d09e5d95bbcafaabde3 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 12:54:42 +0800
Subject: [PATCH 22/37] Make rmsNorm align to MLIR golden for easier
 implementation comparison

---
 ptodsl/README.md                             |  7 ++--
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 42 +++++++++++++++-----
 ptodsl/tests/test_rmsnorm_example_compile.py | 16 ++++++++
 3 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/ptodsl/README.md b/ptodsl/README.md
index b9dd53a5e4..511db83824 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -156,8 +156,8 @@ python3 ptodsl/examples/flash_attention_softmax_launch.py
 
 Compile-only RMSNorm example for explicit-mode SIMT kernels. It exercises
 `pto.alloc_buffer(...)`, contiguous `scalar.load` / `scalar.store`, `pto.vec`,
-`pto.simt_allreduce_sum(...)`, and a runtime token loop that lowers to
-`scf.for`.
+`pto.simt_allreduce_sum(...)`, explicit pipe `set_flag` / `wait_flag` sync,
+and a runtime token loop that lowers to `scf.for`.
 
 ```bash
 python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x128 > /tmp/rmsnorm_x128.mlir
@@ -166,7 +166,8 @@ python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x64 > /tmp/rmsnor
 
 Expected: MLIR containing `@rmsnorm_4096_alloc_buffer_simt_context_kernel`,
 `scf.for`, `vector<4xf32>` for both `x128` and `x64`, and the
-`__tl_allreduce_sum` helper.
+`__tl_allreduce_sum` helper. The main token loop should also contain dynamic
+`pto.set_flag_dyn` / `pto.wait_flag_dyn` operations for the ping-pong events.
 
 ### Launch artifacts
 
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 4863fe955f..d727b21eda 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -65,7 +65,7 @@ def rmsnorm_simt_token_body(
     reduce_scratch,
     w_frag,
     eps: pto.f32,
-    ping: pto.i32,
+    pingpong: pto.i32,
     *,
     threads: pto.const_expr = 128,
     rounds: pto.const_expr = 16,
@@ -81,7 +81,7 @@ def rmsnorm_simt_token_body(
 
     with pto.for_(0, rounds, step=1) as r:
         lane_offset = r * threads * lanes + tx * lanes
-        x_offset = ping * hidden_size + lane_offset
+        x_offset = pingpong * hidden_size + lane_offset
         frag_offset = r * lanes
 
         x_vec = scalar.load(x_ub, x_offset, contiguous=lanes)
@@ -105,11 +105,11 @@ def rmsnorm_simt_token_body(
 
     rstd = 1.0 / scalar.sqrt(sum_sq / hidden_size + eps)
 
-    scalar.store(rstd, rstd_ub, ping * 8)
+    scalar.store(rstd, rstd_ub, pingpong * 8)
 
     with pto.for_(0, rounds, step=1) as r:
         lane_offset = r * threads * lanes + tx * lanes
-        y_offset = ping * hidden_size + lane_offset
+        y_offset = pingpong * hidden_size + lane_offset
         frag_offset = r * lanes
 
         x_vec = scalar.load(x_frag, frag_offset, contiguous=lanes)
@@ -157,8 +157,10 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
         hidden_size * f32_bytes,
         nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
     )
+    pto.set_flag("MTE2", "V", event_id=3)
+    pto.wait_flag("MTE2", "V", event_id=3)
 
-    with pto.simt():
+    with pto.simt(threads, 1, 1):
         init_weight_fragment_body(
             w_ub,
             w_frag,
@@ -167,19 +169,28 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
             lanes=lanes,
         )
 
+    pto.set_flag("V", "MTE2", event_id=0)
+    pto.set_flag("MTE3", "V", event_id=0)
+    pto.set_flag("V", "MTE2", event_id=1)
+    pto.set_flag("MTE3", "V", event_id=1)
+
     for local_token in range(0, tokens_per_core):
         token_id = local_token * n_cores + core_id
-        ping = local_token % 2
+        pingpong = local_token % 2
 
+        pto.wait_flag("V", "MTE2", event_id=pingpong)
         pto.mte_gm_ub(
             pto.addptr(X, token_id * hidden_size),
             x_ub,
-            ping * hidden_size * f32_bytes,
+            pingpong * hidden_size * f32_bytes,
             hidden_size * f32_bytes,
             nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
         )
+        pto.set_flag("MTE2", "V", event_id=pingpong)
 
-        with pto.simt():
+        pto.wait_flag("MTE2", "V", event_id=pingpong)
+        pto.wait_flag("MTE3", "V", event_id=pingpong)
+        with pto.simt(threads, 1, 1):
             rmsnorm_simt_token_body(
                 x_ub,
                 y_ub,
@@ -187,26 +198,35 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
                 reduce_scratch,
                 w_frag,
                 eps,
-                ping,
+                pingpong,
                 threads=threads,
                 rounds=rounds,
                 lanes=lanes,
                 hidden_size=hidden_size,
             )
+        pto.set_flag("V", "MTE2", event_id=pingpong)
+        pto.set_flag("V", "MTE3", event_id=pingpong)
 
+        pto.wait_flag("V", "MTE3", event_id=pingpong)
         pto.mte_ub_gm(
-            pto.addptr(y_ub, ping * hidden_size),
+            pto.addptr(y_ub, pingpong * hidden_size),
             pto.addptr(Y, token_id * hidden_size),
             hidden_size * f32_bytes,
             nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
         )
 
         pto.mte_ub_gm(
-            pto.addptr(rstd_ub, ping * 8),
+            pto.addptr(rstd_ub, pingpong * 8),
             pto.addptr(RSTD, token_id),
             f32_bytes,
             nburst=(1, f32_bytes, f32_bytes),
         )
+        pto.set_flag("MTE3", "V", event_id=pingpong)
+
+    pto.wait_flag("V", "MTE2", event_id=0)
+    pto.wait_flag("V", "MTE2", event_id=1)
+    pto.wait_flag("MTE3", "V", event_id=0)
+    pto.wait_flag("MTE3", "V", event_id=1)
 
 
 def build_x128():
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index f31f9ee53b..85014d6d48 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -73,6 +73,22 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
     expect(text.count("scf.for") >= 4, f"{label}: SIMT inner loops should lower to compact scf.for ops")
     expect("pto.mte_gm_ub" in text, f"{label}: missing GM->UB transfer")
     expect("pto.mte_ub_gm" in text, f"{label}: missing UB->GM transfer")
+    expect(text.count("pto.simt_launch @inline_simt_") == 2,
+           f"{label}: inline SIMT scopes should lower to explicit simt_launch ops")
+    expect("pto.store_vfsimt_info" not in text,
+           f"{label}: explicit simt_launch dims should not emit caller-side store_vfsimt_info")
+    expect("pto.set_flag[<PIPE_MTE2>, <PIPE_V>, <EVENT_ID3>]" in text,
+           f"{label}: W load should signal the SIMT initialization helper")
+    expect("pto.wait_flag[<PIPE_MTE2>, <PIPE_V>, <EVENT_ID3>]" in text,
+           f"{label}: SIMT initialization helper should wait for W load")
+    expect("pto.set_flag[<PIPE_V>, <PIPE_MTE2>, <EVENT_ID0>]" in text,
+           f"{label}: missing V->MTE2 ping-pong priming flag")
+    expect("pto.set_flag[<PIPE_MTE3>, <PIPE_V>, <EVENT_ID1>]" in text,
+           f"{label}: missing MTE3->V pong priming flag")
+    expect(text.count("pto.set_flag_dyn") == 4,
+           f"{label}: token loop should lower four dynamic set_flag ops")
+    expect(text.count("pto.wait_flag_dyn") == 4,
+           f"{label}: token loop should lower four dynamic wait_flag ops")
     expect(vector_type in text, f"{label}: missing contiguous vector access type {vector_type}")
     expect(helper_name_fragment in text, f"{label}: missing allreduce helper")
     expect("func.call @__tl_allreduce_sum" in text or "call @__tl_allreduce_sum" in text,

From 08185baf050178c8d94fadbae0bd06402cc0494b Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 14:45:01 +0800
Subject: [PATCH 23/37] Clarify inline SIMT launch context docs

---
 .../03-kernel-entry-and-subkernels.md          | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
index 3852149e28..b588e7dadd 100644
--- a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
+++ b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
@@ -736,8 +736,9 @@ two ways:
 
 1. **As decorated functions** — reusable, named sub-kernels called from
    `@pto.jit` entries and modules.
-2. **As context managers** (`with pto.cube():`, etc.) — inline blocks for
-   one-off snippets (see Section 3.8).
+2. **As context managers** (`with pto.cube():`, `with pto.simd():`,
+   `with pto.simt():`, and `with pto.simt(dim_x, dim_y, dim_z):`) — inline
+   blocks for one-off snippets (see Section 3.8).
 
 Named sub-kernel decorators use the same default AST rewrite model as
 `@pto.jit`: supported Python `if` and `for range(...)` statements lower to
@@ -997,12 +998,13 @@ Specific SIMT micro-op APIs are documented in Chapter 13.
 
 ## 3.8 Inline context manager syntax
 
-In addition to the decorator form, each sub-kernel unit provides a context
-manager: `with pto.cube():`, `with pto.simd():`, and `with pto.simt():`. These
-open one-off anonymous sub-kernel bodies without requiring a separate named
-Python function. Inline scopes are supported in top-level `@pto.jit` bodies.
-SIMT inline scopes may also spell explicit launch dimensions as
-`with pto.simt(dim_x, dim_y, dim_z):`.
+In addition to the decorator form, each sub-kernel unit provides an inline
+context manager form: `with pto.cube():`, `with pto.simd():`,
+`with pto.simt():`, and `with pto.simt(dim_x, dim_y, dim_z):`. These open
+one-off anonymous sub-kernel bodies without requiring a separate named Python
+function. Inline scopes are supported in top-level `@pto.jit` bodies. The
+dimensioned SIMT form uses the same inline body style while making the caller
+emit an explicit `pto.simt_launch`.
 
 ### Syntax
 

From 993ab62a69a6da17a4e73c74e2141aef64fa9634 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 16:35:36 +0800
Subject: [PATCH 24/37] refactor(ptodsl): use python loops in rmsnorm simt body

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index d727b21eda..b3dfd16ca0 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -15,7 +15,7 @@
 - contiguous scalar ``load`` / ``store`` vector accesses
 - ``pto.simt_allreduce_sum(...)`` for cross-workitem sum reduction
 - runtime ``range(...)`` for the token loop so the AST rewrite emits ``scf.for``
-- explicit ``pto.for_(...)`` loops inside SIMT helpers to avoid trace-time expansion
+- Python ``range(...)`` loops inside SIMT helpers to emit compact runtime loops
 
 Run this file directly to print the emitted MLIR for one specialization.
 """
@@ -79,7 +79,7 @@ def rmsnorm_simt_token_body(
 
     scalar.store(pto.const(0.0, dtype=pto.f32), sum_sq, 0)
 
-    with pto.for_(0, rounds, step=1) as r:
+    for r in range(0, rounds):
         lane_offset = r * threads * lanes + tx * lanes
         x_offset = pingpong * hidden_size + lane_offset
         frag_offset = r * lanes
@@ -107,7 +107,7 @@ def rmsnorm_simt_token_body(
 
     scalar.store(rstd, rstd_ub, pingpong * 8)
 
-    with pto.for_(0, rounds, step=1) as r:
+    for r in range(0, rounds):
         lane_offset = r * threads * lanes + tx * lanes
         y_offset = pingpong * hidden_size + lane_offset
         frag_offset = r * lanes

From f10301a2d3d7af2743799d225d53efc8fb3a5196 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 16:37:04 +0800
Subject: [PATCH 25/37] refactor(ptodsl): remove alloc buffer persistent flag

---
 .../user_guide/04-type-system-and-buffer.md   | 10 ++-
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py  | 69 ++++++-------------
 ptodsl/ptodsl/_ops.py                         | 17 +----
 ptodsl/ptodsl/_surface_values.py              |  3 -
 ptodsl/tests/test_jit_compile.py              | 17 +----
 ptodsl/tests/test_rmsnorm_example_compile.py  | 17 +++--
 6 files changed, 41 insertions(+), 92 deletions(-)

diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index f4b389312e..844cc0a49c 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -180,13 +180,13 @@ ptr_ub  = pto.ptr(pto.f16, pto.MemorySpace.UB)
 Allocate explicit scratch storage for pointer-style load, store, and data movement operations.
 
 ```text
-pto.alloc_buffer(shape, dtype, *, scope="ub", persistent=False)
+pto.alloc_buffer(shape, dtype, *, scope="ub")
 ```
 
 <!-- ptodsl-doc-pending: {"reason":"illustrative fragment; covered by test_jit_compile alloc_buffer probes"} -->
 ```python
 ub_scratch = pto.alloc_buffer((4096,), pto.f32, scope="ub")
-fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
+fragment = pto.alloc_buffer((32,), pto.f32, scope="local")
 ```
 
 | Parameter | Description |
@@ -194,16 +194,14 @@ fragment = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
 | `shape` | Static positive integer shape. Pass an `int`, `tuple[int, ...]`, or `list[int]`. |
 | `dtype` | Element type of the returned buffer, such as `pto.f32` or `pto.i32`. |
 | `scope` | Scratch storage kind. Use `"ub"` or `"local"`. |
-| `persistent` | Optional Boolean, either `True` or `False`; the default is `False`. It is frontend metadata and does not change the returned pointer type. |
 
 | Scope | Meaning | Returned value |
 |-------|---------|----------------|
 | `"ub"` | Function-level Unified Buffer scratch, typically used by data movement operations or shared SIMT scratch. | Typed UB pointer |
 | `"local"` | SIMT-helper local scratch for per-workitem temporary fragments. | Typed local pointer |
 
-A `"ub"` buffer is available throughout the generated kernel body, regardless
-of the `persistent` value. A `"local"` buffer is available only inside the SIMT
-helper invocation that allocates it.
+A `"ub"` buffer is available throughout the generated kernel body. A `"local"`
+buffer is available only inside the SIMT helper invocation that allocates it.
 
 ## 4.6 TensorView
 
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index b3dfd16ca0..ac2f81a90f 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -14,6 +14,7 @@
 - ``pto.alloc_buffer(...)`` for UB scratch and lane-local storage
 - contiguous scalar ``load`` / ``store`` vector accesses
 - ``pto.simt_allreduce_sum(...)`` for cross-workitem sum reduction
+- W stays in UB after the GM->UB preload and is read directly by the token SIMT body
 - runtime ``range(...)`` for the token loop so the AST rewrite emits ``scf.for``
 - Python ``range(...)`` loops inside SIMT helpers to emit compact runtime loops
 
@@ -40,30 +41,13 @@
 from ptodsl import pto, scalar
 
 
-def init_weight_fragment_body(
-    w_ub,
-    w_frag,
-    *,
-    threads: pto.const_expr = 128,
-    rounds: pto.const_expr = 16,
-    lanes: pto.const_expr = 2,
-):
-    tx = pto.get_tid_x()
-
-    with pto.for_(0, rounds, step=1) as r:
-        ub_offset = r * threads * lanes + tx * lanes
-        frag_offset = r * lanes
-
-        w_vec = scalar.load(w_ub, ub_offset, contiguous=lanes)
-        scalar.store(w_vec, w_frag, frag_offset)
-
-
+@pto.simt
 def rmsnorm_simt_token_body(
     x_ub,
     y_ub,
     rstd_ub,
     reduce_scratch,
-    w_frag,
+    w_ub,
     eps: pto.f32,
     pingpong: pto.i32,
     *,
@@ -108,12 +92,14 @@ def rmsnorm_simt_token_body(
     scalar.store(rstd, rstd_ub, pingpong * 8)
 
     for r in range(0, rounds):
-        lane_offset = r * threads * lanes + tx * lanes
-        y_offset = pingpong * hidden_size + lane_offset
+        round_offset = r * threads * lanes
+        thread_offset = tx * lanes
+        lane_base = round_offset + thread_offset
+        y_offset = pingpong * hidden_size + lane_base
         frag_offset = r * lanes
 
         x_vec = scalar.load(x_frag, frag_offset, contiguous=lanes)
-        w_vec = scalar.load(w_frag, frag_offset, contiguous=lanes)
+        w_vec = scalar.load(w_ub, lane_base, contiguous=lanes)
         rstd_vec = pto.vec(pto.f32, lanes, init=rstd)
         y_vec = x_vec * rstd_vec * w_vec
         scalar.store(y_vec, y_ub, y_offset)
@@ -140,7 +126,6 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     )
 
     core_id = pto.get_block_idx()
-    frag_elems: pto.const_expr = rounds * lanes
 
     w_ub = pto.alloc_buffer((hidden_size,), pto.f32, scope="ub")
     x_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
@@ -148,8 +133,6 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     rstd_ub = pto.alloc_buffer((2, 8), pto.f32, scope="ub")
     reduce_scratch = pto.alloc_buffer((threads,), pto.f32, scope="ub")
 
-    w_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local", persistent=True)
-
     pto.mte_gm_ub(
         W,
         w_ub,
@@ -160,15 +143,6 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     pto.set_flag("MTE2", "V", event_id=3)
     pto.wait_flag("MTE2", "V", event_id=3)
 
-    with pto.simt(threads, 1, 1):
-        init_weight_fragment_body(
-            w_ub,
-            w_frag,
-            threads=threads,
-            rounds=rounds,
-            lanes=lanes,
-        )
-
     pto.set_flag("V", "MTE2", event_id=0)
     pto.set_flag("MTE3", "V", event_id=0)
     pto.set_flag("V", "MTE2", event_id=1)
@@ -190,20 +164,19 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
 
         pto.wait_flag("MTE2", "V", event_id=pingpong)
         pto.wait_flag("MTE3", "V", event_id=pingpong)
-        with pto.simt(threads, 1, 1):
-            rmsnorm_simt_token_body(
-                x_ub,
-                y_ub,
-                rstd_ub,
-                reduce_scratch,
-                w_frag,
-                eps,
-                pingpong,
-                threads=threads,
-                rounds=rounds,
-                lanes=lanes,
-                hidden_size=hidden_size,
-            )
+        rmsnorm_simt_token_body[threads, 1, 1](
+            x_ub,
+            y_ub,
+            rstd_ub,
+            reduce_scratch,
+            w_ub,
+            eps,
+            pingpong,
+            threads=threads,
+            rounds=rounds,
+            lanes=lanes,
+            hidden_size=hidden_size,
+        )
         pto.set_flag("V", "MTE2", event_id=pingpong)
         pto.set_flag("V", "MTE3", event_id=pingpong)
 
diff --git a/ptodsl/ptodsl/_ops.py b/ptodsl/ptodsl/_ops.py
index 7c9942b9eb..1e08778e28 100644
--- a/ptodsl/ptodsl/_ops.py
+++ b/ptodsl/ptodsl/_ops.py
@@ -2319,7 +2319,7 @@ def _tile_transfer_partition(tv, tile, *, offsets=None, sizes=None, context: str
     return partition_view(tv, offsets=normalized_offsets, sizes=normalized_sizes)
 
 
-def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
+def alloc_buffer(shape, dtype, *, scope="ub"):
     """
     Allocate explicit scratch storage and return an address-like surface value.
 
@@ -2330,7 +2330,6 @@ def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
     """
     _require_explicit_mode("pto.alloc_buffer(...)")
     normalized_scope = _normalize_alloc_buffer_scope(scope)
-    persistent = _normalize_alloc_buffer_persistent(persistent)
     element_type = _resolve(dtype)
     element_count = _static_alloc_buffer_element_count(shape)
     elem_bytes = _element_bytewidth(element_type)
@@ -2343,7 +2342,6 @@ def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
             element_type,
             element_count,
             byte_size,
-            persistent=persistent,
         )
     if normalized_scope == "local":
         return _alloc_local_buffer(
@@ -2352,7 +2350,6 @@ def alloc_buffer(shape, dtype, *, scope="ub", persistent=False):
             element_type,
             element_count,
             byte_size,
-            persistent=persistent,
         )
     raise AssertionError(f"unhandled alloc_buffer scope {normalized_scope!r}")
 
@@ -2368,12 +2365,6 @@ def _normalize_alloc_buffer_scope(scope):
     raise ValueError("pto.alloc_buffer(..., scope=...) expects one of 'ub' or 'local'")
 
 
-def _normalize_alloc_buffer_persistent(persistent):
-    if not isinstance(persistent, bool):
-        raise TypeError("pto.alloc_buffer(..., persistent=...) expects True or False")
-    return persistent
-
-
 def _static_alloc_buffer_element_count(shape):
     if isinstance(shape, int):
         dims = (shape,)
@@ -2399,7 +2390,7 @@ def _static_alloc_buffer_element_count(shape):
     return count
 
 
-def _alloc_ub_buffer(shape, dtype, element_type, element_count, byte_size, *, persistent):
+def _alloc_ub_buffer(shape, dtype, element_type, element_count, byte_size):
     from ._tracing.active import current_session
 
     session = current_session()
@@ -2422,11 +2413,10 @@ def _alloc_ub_buffer(shape, dtype, element_type, element_count, byte_size, *, pe
         element_count=element_count,
         byte_size=byte_size,
         byte_offset=byte_offset,
-        persistent=persistent,
     )
 
 
-def _alloc_local_buffer(shape, dtype, element_type, element_count, byte_size, *, persistent):
+def _alloc_local_buffer(shape, dtype, element_type, element_count, byte_size):
     i32 = IntegerType.get_signless(32)
     count = _materialize_integer_literal(i32, element_count)
     llvm_ptr_type = Type.parse("!llvm.ptr")
@@ -2446,7 +2436,6 @@ def _alloc_local_buffer(shape, dtype, element_type, element_count, byte_size, *,
         element_type=element_type,
         element_count=element_count,
         byte_size=byte_size,
-        persistent=persistent,
     )
 
 
diff --git a/ptodsl/ptodsl/_surface_values.py b/ptodsl/ptodsl/_surface_values.py
index a1cb60a588..a05f2d85ee 100644
--- a/ptodsl/ptodsl/_surface_values.py
+++ b/ptodsl/ptodsl/_surface_values.py
@@ -359,7 +359,6 @@ def __init__(
         element_count,
         byte_size,
         byte_offset=None,
-        persistent=False,
     ):
         super().__init__(value)
         self.scope = scope
@@ -369,7 +368,6 @@ def __init__(
         self.element_count = element_count
         self.byte_size = byte_size
         self.byte_offset = byte_offset
-        self.persistent = bool(persistent)
 
     @property
     def surface_metadata(self):
@@ -381,7 +379,6 @@ def surface_metadata(self):
             "element_count": self.element_count,
             "byte_size": self.byte_size,
             "byte_offset": self.byte_offset,
-            "persistent": self.persistent,
         }
 
 
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index a2cdfa557e..34f96cbc83 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -887,7 +887,7 @@ def alloc_buffer_ub_probe(
 
 @pto.simt
 def alloc_buffer_local_helper():
-    _ = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
+    _ = pto.alloc_buffer((32,), pto.f32, scope="local")
 
 
 @pto.jit(target="a5", mode="explicit")
@@ -905,11 +905,6 @@ def alloc_buffer_private_scope_probe():
     _ = pto.alloc_buffer((1,), pto.f32, scope="private")
 
 
-@pto.jit(target="a5", mode="explicit")
-def alloc_buffer_non_bool_persistent_probe():
-    _ = pto.alloc_buffer((1,), pto.f32, scope="local", persistent=1)
-
-
 @pto.simt
 def rmsnorm_alloc_buffer_frag_helper(
     w_ub: pto.ptr(pto.f32, pto.MemorySpace.UB),
@@ -919,7 +914,7 @@ def rmsnorm_alloc_buffer_frag_helper(
     _ = w_ub
     _ = x_ub
     _ = pto.alloc_buffer((32,), pto.f32, scope="local")
-    _ = pto.alloc_buffer((32,), pto.f32, scope="local", persistent=True)
+    _ = pto.alloc_buffer((1,), pto.f32, scope="local")
 
 
 @pto.jit(target="a5", mode="explicit")
@@ -4102,12 +4097,6 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
         lambda: alloc_buffer_private_scope_probe.compile(),
         "expects one of 'ub' or 'local'",
     )
-    expect_raises(
-        TypeError,
-        lambda: alloc_buffer_non_bool_persistent_probe.compile(),
-        "expects True or False",
-    )
-
     rmsnorm_alloc_buffer_text = rmsnorm_alloc_buffer_layout_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm alloc_buffer layout specialization")
     expect(
@@ -4121,7 +4110,7 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
         )
     expect(
         rmsnorm_alloc_buffer_text.count("llvm.alloca") == 2,
-        "RMSNorm alloc_buffer fragment helper should allocate x_frag and persistent w_frag locally",
+        "RMSNorm alloc_buffer fragment helper should allocate x_frag and sum_sq locally",
     )
     expect(
         re.search(r"call @rmsnorm_alloc_buffer_frag_helper__simt_\d+\(", rmsnorm_alloc_buffer_text)
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 85014d6d48..eca4c387f5 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -70,17 +70,19 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
     expect("func.func @rmsnorm_4096_alloc_buffer_simt_context_kernel" in text, f"{label}: missing entry")
     expect(f"dyn_shared_memory_buf = {ub_size} : i64" in text, f"{label}: unexpected UB scratch size")
     expect("scf.for" in text, f"{label}: tokens_per_core loop should lower to scf.for")
-    expect(text.count("scf.for") >= 4, f"{label}: SIMT inner loops should lower to compact scf.for ops")
+    expect(text.count("scf.for") >= 3, f"{label}: SIMT inner loops should lower to compact scf.for ops")
     expect("pto.mte_gm_ub" in text, f"{label}: missing GM->UB transfer")
     expect("pto.mte_ub_gm" in text, f"{label}: missing UB->GM transfer")
-    expect(text.count("pto.simt_launch @inline_simt_") == 2,
-           f"{label}: inline SIMT scopes should lower to explicit simt_launch ops")
+    expect(text.count("pto.simt_launch @rmsnorm_simt_token_body__simt_") == 1,
+           f"{label}: indexed SIMT call should lower to one explicit token simt_launch op")
+    expect("pto.simt_launch @inline_simt_" not in text,
+           f"{label}: token SIMT body should be emitted as the named helper, not an inline helper")
     expect("pto.store_vfsimt_info" not in text,
            f"{label}: explicit simt_launch dims should not emit caller-side store_vfsimt_info")
     expect("pto.set_flag[<PIPE_MTE2>, <PIPE_V>, <EVENT_ID3>]" in text,
-           f"{label}: W load should signal the SIMT initialization helper")
+           f"{label}: W load should signal completion before token processing")
     expect("pto.wait_flag[<PIPE_MTE2>, <PIPE_V>, <EVENT_ID3>]" in text,
-           f"{label}: SIMT initialization helper should wait for W load")
+           f"{label}: token processing should start after the W load completes")
     expect("pto.set_flag[<PIPE_V>, <PIPE_MTE2>, <EVENT_ID0>]" in text,
            f"{label}: missing V->MTE2 ping-pong priming flag")
     expect("pto.set_flag[<PIPE_MTE3>, <PIPE_V>, <EVENT_ID1>]" in text,
@@ -110,10 +112,11 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
         text.count("pto.store ") <= 8,
         f"{label}: SIMT inner loops should not be trace-time expanded into many scalar stores",
     )
-    expect(text.count("llvm.alloca") == 3, f"{label}: expected w_frag plus x_frag/sum_sq local buffers")
+    expect(text.count("llvm.alloca") == 2, f"{label}: expected x_frag and sum_sq local buffers")
+    expect("w_frag" not in text, f"{label}: W should be read directly from UB, not from a local fragment")
     expect(
         re.search(
-            r"func\.func @inline_simt_1__ptodsl_[^{]+\{(?:(?!func\.func @).)*"
+            r"func\.func @rmsnorm_simt_token_body__simt_[^{]+\{(?:(?!func\.func @).)*"
             r"llvm\.alloca(?:(?!func\.func @).)*llvm\.alloca",
             text,
             re.S,

From 25d0bd5fdfbc806d6cb3143e1c8fc6780c5d5799 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Fri, 26 Jun 2026 16:41:28 +0800
Subject: [PATCH 26/37] Align RMSNorm SIMT loops with golden MLIR

---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 14 +++++++-------
 ptodsl/tests/test_rmsnorm_example_compile.py |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index ac2f81a90f..916aefc046 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -61,8 +61,6 @@ def rmsnorm_simt_token_body(
     x_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local")
     sum_sq = pto.alloc_buffer((1,), pto.f32, scope="local")
 
-    scalar.store(pto.const(0.0, dtype=pto.f32), sum_sq, 0)
-
     for r in range(0, rounds):
         lane_offset = r * threads * lanes + tx * lanes
         x_offset = pingpong * hidden_size + lane_offset
@@ -71,11 +69,13 @@ def rmsnorm_simt_token_body(
         x_vec = scalar.load(x_ub, x_offset, contiguous=lanes)
         scalar.store(x_vec, x_frag, frag_offset)
 
-        for lane in pto.static_range(0, lanes):
-            local_sum = scalar.load(sum_sq, 0)
-            x = scalar.load(x_frag, frag_offset + lane)
-            local_sum = local_sum + x * x
-            scalar.store(local_sum, sum_sq, 0)
+    scalar.store(pto.const(0.0, dtype=pto.f32), sum_sq, 0)
+
+    for i in range(0, frag_elems):
+        local_sum = scalar.load(sum_sq, 0)
+        x = scalar.load(x_frag, i)
+        local_sum = local_sum + x * x
+        scalar.store(local_sum, sum_sq, 0)
 
     local_sum = scalar.load(sum_sq, 0)
 
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index eca4c387f5..3bb6fd0740 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -70,7 +70,7 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
     expect("func.func @rmsnorm_4096_alloc_buffer_simt_context_kernel" in text, f"{label}: missing entry")
     expect(f"dyn_shared_memory_buf = {ub_size} : i64" in text, f"{label}: unexpected UB scratch size")
     expect("scf.for" in text, f"{label}: tokens_per_core loop should lower to scf.for")
-    expect(text.count("scf.for") >= 3, f"{label}: SIMT inner loops should lower to compact scf.for ops")
+    expect(text.count("scf.for") >= 4, f"{label}: SIMT inner loops should lower to compact scf.for ops")
     expect("pto.mte_gm_ub" in text, f"{label}: missing GM->UB transfer")
     expect("pto.mte_ub_gm" in text, f"{label}: missing UB->GM transfer")
     expect(text.count("pto.simt_launch @rmsnorm_simt_token_body__simt_") == 1,

From 1835bc14c6859dca4546b65269bb527b1117d783 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Sat, 27 Jun 2026 16:54:15 +0800
Subject: [PATCH 27/37] test(ptodsl): add rmsnorm launch validation script

---
 .../rmsnorm_alloc_buffer_simt_launch.py       | 225 ++++++++++++++++++
 1 file changed, 225 insertions(+)
 create mode 100644 ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py
new file mode 100644
index 0000000000..f271c10861
--- /dev/null
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+"""
+Launch and validate the RMSNorm alloc_buffer/SIMT example on an Ascend NPU.
+
+The test compares the kernel outputs written to GM against a NumPy RMSNorm
+reference. It also fills output buffers with sentinels and checks guard regions
+after the logical outputs, so missed writes and simple over-writes are caught by
+the same host-side validation.
+"""
+
+from __future__ import annotations
+
+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+import sys
+import time
+
+import numpy as np
+
+
+if __package__ in {None, ""}:
+    here = Path(__file__).resolve()
+    for candidate in here.parents:
+        if (candidate / "ptodsl" / "__init__.py").exists():
+            sys.path.insert(0, str(candidate))
+            break
+    else:
+        raise RuntimeError(
+            "Unable to locate the PTODSL Python package root from rmsnorm_alloc_buffer_simt_launch.py"
+        )
+
+
+from rmsnorm_alloc_buffer_simt import rmsnorm_4096_alloc_buffer_simt_context_kernel
+
+
+_DEVICE = "npu:0"
+_HIDDEN_SIZE = 4096
+_THREADS = 128
+_ROUNDS = 8
+_LANES = 4
+_EPS = np.float32(1.0e-6)
+_Y_GUARD_ELEMS = 1024
+_RSTD_GUARD_ELEMS = 64
+_SENTINEL = np.float32(123456.0)
+
+
+@dataclass(frozen=True)
+class Case:
+    name: str
+    n_cores: int
+    tokens_per_core: int
+    seed: int
+    rtol: float = 1.0e-4
+    y_atol: float = 1.0e-4
+    rstd_atol: float = 1.0e-5
+
+    @property
+    def tokens(self) -> int:
+        return self.n_cores * self.tokens_per_core
+
+
+CASES = [
+    Case("one_core_one_token", n_cores=1, tokens_per_core=1, seed=0x483001),
+    Case("one_core_four_tokens", n_cores=1, tokens_per_core=4, seed=0x483004),
+    Case("four_cores_two_tokens_each", n_cores=4, tokens_per_core=2, seed=0x483402),
+]
+
+FULL_CASE = Case("full_64_cores_64_tokens_each", n_cores=64, tokens_per_core=64, seed=0x483640)
+
+
+def init_runtime():
+    import torch
+    import torch_npu  # noqa: F401
+
+    torch.npu.config.allow_internal_format = False
+    torch_npu.npu.set_compile_mode(jit_compile=False)
+    torch.npu.set_device(_DEVICE)
+    return torch
+
+
+def npu_stream(torch):
+    return torch.npu.current_stream()._as_parameter_  # noqa: SLF001
+
+
+def make_inputs(case: Case) -> tuple[np.ndarray, np.ndarray]:
+    rng = np.random.RandomState(case.seed)
+    x = rng.uniform(-0.75, 0.75, size=(case.tokens, _HIDDEN_SIZE)).astype(np.float32)
+    w = rng.uniform(0.5, 1.5, size=(_HIDDEN_SIZE,)).astype(np.float32)
+
+    # Make token/core addressing mistakes obvious in the output comparison.
+    token_offsets = (np.arange(case.tokens, dtype=np.float32)[:, None] * np.float32(0.001))
+    x = (x + token_offsets).astype(np.float32)
+    return x, w
+
+
+def rmsnorm_reference(x: np.ndarray, w: np.ndarray, eps: np.float32) -> tuple[np.ndarray, np.ndarray]:
+    sum_sq = np.sum(x * x, axis=1, dtype=np.float32)
+    rstd = (np.float32(1.0) / np.sqrt(sum_sq / np.float32(x.shape[1]) + eps)).astype(np.float32)
+    y = (x * rstd[:, None] * w[None, :]).astype(np.float32)
+    return y, rstd
+
+
+def compile_kernel(case: Case):
+    return rmsnorm_4096_alloc_buffer_simt_context_kernel.compile(
+        threads=_THREADS,
+        rounds=_ROUNDS,
+        lanes=_LANES,
+        hidden_size=_HIDDEN_SIZE,
+        n_cores=case.n_cores,
+        tokens_per_core=case.tokens_per_core,
+    )
+
+
+def assert_guard_unchanged(name: str, guard: np.ndarray) -> None:
+    if not np.all(guard == _SENTINEL):
+        bad = np.nonzero(guard != _SENTINEL)[0]
+        first = int(bad[0])
+        raise AssertionError(
+            f"{name} guard overwritten at guard index {first}: got {guard[first]!r}, expected {_SENTINEL!r}"
+        )
+
+
+def run_case(case: Case, torch) -> None:
+    x, w = make_inputs(case)
+    y_ref, rstd_ref = rmsnorm_reference(x, w, _EPS)
+
+    x_t = torch.from_numpy(x).to(_DEVICE)
+    w_t = torch.from_numpy(w).to(_DEVICE)
+
+    y_storage = torch.full(
+        (case.tokens * _HIDDEN_SIZE + _Y_GUARD_ELEMS,),
+        float(_SENTINEL),
+        dtype=torch.float32,
+        device=_DEVICE,
+    )
+    rstd_storage = torch.full(
+        (case.tokens + _RSTD_GUARD_ELEMS,),
+        float(_SENTINEL),
+        dtype=torch.float32,
+        device=_DEVICE,
+    )
+
+    stream = npu_stream(torch)
+
+    t0 = time.perf_counter()
+    compiled = compile_kernel(case)
+    compile_s = time.perf_counter() - t0
+
+    t0 = time.perf_counter()
+    compiled[case.n_cores, stream](
+        x_t.data_ptr(),
+        y_storage.data_ptr(),
+        w_t.data_ptr(),
+        rstd_storage.data_ptr(),
+        float(_EPS),
+    )
+    torch.npu.synchronize()
+    launch_s = time.perf_counter() - t0
+
+    y_out = y_storage[: case.tokens * _HIDDEN_SIZE].cpu().numpy().reshape(case.tokens, _HIDDEN_SIZE)
+    rstd_out = rstd_storage[: case.tokens].cpu().numpy()
+    y_guard = y_storage[case.tokens * _HIDDEN_SIZE :].cpu().numpy()
+    rstd_guard = rstd_storage[case.tokens :].cpu().numpy()
+
+    np.testing.assert_allclose(rstd_out, rstd_ref, rtol=case.rtol, atol=case.rstd_atol)
+    np.testing.assert_allclose(y_out, y_ref, rtol=case.rtol, atol=case.y_atol)
+    assert_guard_unchanged("Y", y_guard)
+    assert_guard_unchanged("RSTD", rstd_guard)
+
+    y_diff = float(np.max(np.abs(y_out - y_ref))) if y_out.size else 0.0
+    rstd_diff = float(np.max(np.abs(rstd_out - rstd_ref))) if rstd_out.size else 0.0
+    print(
+        f"PASS {case.name}  "
+        f"grid={case.n_cores} tokens={case.tokens} "
+        f"compile={compile_s:.3f}s launch={launch_s:.3f}s "
+        f"max|Y|={y_diff:.3e} max|RSTD|={rstd_diff:.3e}"
+    )
+
+
+def emit_mlir(case: Case) -> str:
+    return compile_kernel(case).mlir_text()
+
+
+def main(argv=None) -> int:
+    global _DEVICE
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--device", default=_DEVICE, help="torch NPU device, default: npu:0")
+    parser.add_argument("--case", choices=[case.name for case in CASES] + [FULL_CASE.name, "all"], default="all")
+    parser.add_argument("--include-full", action="store_true", help="include the 64-core x 64-token full case")
+    parser.add_argument("--emit-mlir", action="store_true", help="print MLIR for the selected case and exit")
+    args = parser.parse_args(argv)
+
+    _DEVICE = args.device
+
+    selected = list(CASES)
+    if args.include_full:
+        selected.append(FULL_CASE)
+    if args.case != "all":
+        all_cases = {case.name: case for case in selected + [FULL_CASE]}
+        selected = [all_cases[args.case]]
+
+    if args.emit_mlir:
+        if len(selected) != 1:
+            parser.error("--emit-mlir expects one concrete --case")
+        print(emit_mlir(selected[0]))
+        return 0
+
+    torch = init_runtime()
+    for case in selected:
+        run_case(case, torch)
+    print("All RMSNorm cases passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From d1719fe79dd63679f65dd43057894a195032c493 Mon Sep 17 00:00:00 2001
From: default <default>
Date: Tue, 30 Jun 2026 02:16:07 +0800
Subject: [PATCH 28/37] fix(ptodsl): load rmsnorm pingpong tile via UB pointer

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 916aefc046..fec01095c9 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -155,8 +155,8 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
         pto.wait_flag("V", "MTE2", event_id=pingpong)
         pto.mte_gm_ub(
             pto.addptr(X, token_id * hidden_size),
-            x_ub,
-            pingpong * hidden_size * f32_bytes,
+            pto.addptr(x_ub, pingpong * hidden_size),
+            0,
             hidden_size * f32_bytes,
             nburst=(1, hidden_size * f32_bytes, hidden_size * f32_bytes),
         )

From 3cb1af617669b31d3f09aa1de2ba00df84d19828 Mon Sep 17 00:00:00 2001
From: default <default>
Date: Tue, 30 Jun 2026 11:24:01 +0800
Subject: [PATCH 29/37] fix(ptodsl): pass dynamic shared memory to runtime
 launch

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/ptodsl/_runtime/codegen.py      | 5 +++--
 ptodsl/ptodsl/_runtime/native_build.py | 9 +++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/ptodsl/ptodsl/_runtime/codegen.py b/ptodsl/ptodsl/_runtime/codegen.py
index ba93106c25..61b50a288c 100644
--- a/ptodsl/ptodsl/_runtime/codegen.py
+++ b/ptodsl/ptodsl/_runtime/codegen.py
@@ -92,7 +92,7 @@ def _runtime_scalar_cpp_type(annotation) -> str:
 def launch_symbol_name(ir_function_name: str) -> str:
     return f"ptodsl_launch_{ir_function_name}"
 
-def generate_launch_cpp(*, ir_function_name: str, kernel_signature) -> str:
+def generate_launch_cpp(*, ir_function_name: str, kernel_signature, dyn_shared_bytes: int = 0) -> str:
     """Return C++ source for one extern-C launch entry point."""
     gm_params = []
     host_params = []
@@ -125,7 +125,8 @@ def generate_launch_cpp(*, ir_function_name: str, kernel_signature) -> str:
         "#endif\n\n"
         f'extern "C" __global__ AICORE void {ir_function_name}({gm_sig});\n\n'
         f"extern \"C\" void {launch_symbol}({host_sig}) {{\n"
-        f"    {ir_function_name}<<<grid, nullptr, stream>>>({kernel_call});\n"
+        f"    constexpr uint32_t dynSharedBytes = {int(dyn_shared_bytes)};\n"
+        f"    {ir_function_name}<<<grid, dynSharedBytes, stream>>>({kernel_call});\n"
         "}\n"
     )
 
diff --git a/ptodsl/ptodsl/_runtime/native_build.py b/ptodsl/ptodsl/_runtime/native_build.py
index 777fa54201..9252a6c07d 100644
--- a/ptodsl/ptodsl/_runtime/native_build.py
+++ b/ptodsl/ptodsl/_runtime/native_build.py
@@ -10,6 +10,7 @@
 from __future__ import annotations
 
 import os
+import re
 import subprocess
 from pathlib import Path
 
@@ -30,6 +31,13 @@
 )
 
 
+def _extract_dyn_shared_memory_bytes(mlir_text: str) -> int:
+    match = re.search(r"dyn_shared_memory_buf\s*=\s*(\d+)\s*:\s*i64", mlir_text)
+    if match is None:
+        return 0
+    return int(match.group(1))
+
+
 def _run(cmd: list[str], *, cwd: Path | None = None) -> None:
     result = subprocess.run(cmd, cwd=str(cwd) if cwd else None, capture_output=True, text=True)
     if result.returncode != 0:
@@ -175,6 +183,7 @@ def build_native_library(
     launch_cpp_text = generate_launch_cpp(
         ir_function_name=ir_function_name,
         kernel_signature=kernel_signature,
+        dyn_shared_bytes=_extract_dyn_shared_memory_bytes(mlir_text),
     )
     sim_mode = bool(os.environ.get("MSPROF_SIMULATOR_MODE"))
     link_config_text = "\n".join(runtime_library_flags(sim_mode=sim_mode))

From ceb8e01414c6f7ebc75355e11601396c4d7ba07b Mon Sep 17 00:00:00 2001
From: default <default>
Date: Tue, 30 Jun 2026 14:42:34 +0800
Subject: [PATCH 30/37] fix(ptodsl): inline cross-warp allreduce path

Path 3 now emits the cross-warp allreduce body directly in the caller instead of returning through a SIMT helper call. Paths 1, 2, and 4 still use helper calls and should be converted to inline emission in a follow-up.

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/ptodsl/_allreduce.py | 127 ++++++++++++++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 4 deletions(-)

diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
index d841374827..02298d3f90 100644
--- a/ptodsl/ptodsl/_allreduce.py
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -316,10 +316,8 @@ def _dispatch_allreduce_helper(value, *, scratch,
 
     # ── Path 3: cross_warp_reduce ────────────────────────────────────────
     if scale <= 32 and _is_pow2(threads) and _is_pow2(scale):
-        return _invoke_helper(
-            name,
-            lambda hf: _emit_cross_warp_reduce(hf, **args),
-            value, scratch,
+        return _emit_cross_warp_reduce_inline(
+            raw_value, unwrap_surface_value(scratch), **args,
         )
 
     # ── Path 4: ub_reduce fallback (threads > 32, anything else) ─────────
@@ -330,6 +328,127 @@ def _dispatch_allreduce_helper(value, *, scratch,
     )
 
 
+def _emit_cross_warp_reduce_inline(x, scratch, *,
+                                   dtype, threads, scale, thread_offset):
+    """Emit cross-warp all-reduce directly at the current insertion point."""
+    num_warps = threads // 32
+    scalar_t = _mlir_scalar_type(dtype)
+    identity_val = _IDENTITY[dtype]
+
+    i32 = IntegerType.get_signless(32)
+    idx_t = IndexType.get()
+
+    c0_i32 = arith.ConstantOp(i32, 0).result
+    c5_i32 = arith.ConstantOp(i32, 5).result
+    c31_i32 = arith.ConstantOp(i32, 31).result
+    c32_i32 = arith.ConstantOp(i32, 32).result
+    c_scale = arith.ConstantOp(i32, scale).result
+    c_num_warps = arith.ConstantOp(i32, num_warps).result
+    c_offset = arith.ConstantOp(i32, thread_offset).result
+    c_identity = arith.ConstantOp(scalar_t, identity_val).result
+
+    tid_x = _pto.GetTidXOp().result
+    if thread_offset:
+        tx = arith.SubIOp(tid_x, c_offset).result
+        wid = arith.ShRUIOp(tx, c5_i32).result
+        lid = arith.AndIOp(tx, c31_i32).result
+    else:
+        tx = tid_x
+        wid = arith.ShRUIOp(tx, c5_i32).result
+        lid = _pto.GetLaneIdOp().result
+
+    if scale == 1:
+        warp_val = _REDUX_OP(x).result
+    else:
+        warp_val = _emit_butterfly(
+            x, threads=32, scale=scale,
+        )
+
+    is_writer = arith.CmpIOp(arith.CmpIPredicate.ult, lid, c_scale).result
+    write_if = scf.IfOp(is_writer, hasElse=False)
+    with InsertionPoint(write_if.then_block):
+        slot = arith.AddIOp(
+            arith.MulIOp(wid, c_scale).result, lid).result
+        slot_idx = arith.IndexCastOp(idx_t, slot).result
+        _emit_store(scratch, slot_idx, warp_val)
+        scf.YieldOp([])
+
+    _pto.SyncthreadsOp()
+
+    is_leader_warp = arith.CmpIOp(
+        arith.CmpIPredicate.ult, tx, c32_i32).result
+    outer_if = scf.IfOp(is_leader_warp, [scalar_t], hasElse=True)
+
+    with InsertionPoint(outer_if.then_block):
+        if scale == 1:
+            need_load = arith.CmpIOp(
+                arith.CmpIPredicate.ult, lid, c_num_warps).result
+            inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
+            with InsertionPoint(inner_if.then_block):
+                lid_idx = arith.IndexCastOp(idx_t, lid).result
+                tmp = _emit_load(scalar_t, scratch, lid_idx)
+                scf.YieldOp([tmp])
+            with InsertionPoint(inner_if.else_block):
+                scf.YieldOp([c_identity])
+            loaded = inner_if.results[0]
+            stage4_result = _REDUX_OP(loaded).result
+        elif scale * num_warps <= 32:
+            total = scale * num_warps
+            c_total = arith.ConstantOp(i32, total).result
+            need_load = arith.CmpIOp(
+                arith.CmpIPredicate.ult, lid, c_total).result
+            inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
+            with InsertionPoint(inner_if.then_block):
+                lid_idx = arith.IndexCastOp(idx_t, lid).result
+                tmp = _emit_load(scalar_t, scratch, lid_idx)
+                scf.YieldOp([tmp])
+            with InsertionPoint(inner_if.else_block):
+                scf.YieldOp([c_identity])
+            loaded = inner_if.results[0]
+            stage4_result = _emit_butterfly(
+                loaded,
+                threads=total, scale=scale,
+            )
+        else:
+            is_reducer = arith.CmpIOp(
+                arith.CmpIPredicate.ult, lid, c_scale).result
+            result = c_identity
+            my_slot = arith.RemUIOp(lid, c_scale).result
+            for w in range(num_warps):
+                c_w = arith.ConstantOp(i32, w).result
+                idx_val = arith.AddIOp(
+                    arith.MulIOp(c_w, c_scale).result, my_slot).result
+                slot_idx = arith.IndexCastOp(idx_t, idx_val).result
+                loaded_v = _emit_load(
+                    scalar_t, scratch, slot_idx)
+                result = _apply_sum(result, loaded_v)
+            stage4_result = arith.SelectOp(
+                is_reducer, result, c_identity).result
+
+        scf.YieldOp([stage4_result])
+
+    with InsertionPoint(outer_if.else_block):
+        scf.YieldOp([c_identity])
+
+    partial_reduced = outer_if.results[0]
+
+    is_global_leader = arith.CmpIOp(
+        arith.CmpIPredicate.ult, tx, c_scale).result
+    write_result_if = scf.IfOp(is_global_leader, hasElse=False)
+    with InsertionPoint(write_result_if.then_block):
+        tx_idx = arith.IndexCastOp(idx_t, tx).result
+        _emit_store(scratch, tx_idx, partial_reduced)
+        scf.YieldOp([])
+
+    _pto.SyncthreadsOp()
+    my_slot = arith.RemUIOp(tx, c_scale).result
+    load_idx = arith.IndexCastOp(idx_t, my_slot).result
+    result = _emit_load(scalar_t, scratch, load_idx)
+
+    _pto.SyncthreadsOp()
+    return wrap_surface_value(result)
+
+
 # ═══════════════════════════════════════════════════════════════════════════════
 # emitter: warp_reduce  (Path 1: threads ≤ 32, pow2, pow2 scale)
 # ═══════════════════════════════════════════════════════════════════════════════

From 3f8afd40e505d5ba440bb1b0c535323a1a4d358a Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Tue, 30 Jun 2026 15:10:17 +0800
Subject: [PATCH 31/37] fix(ptodsl): use pto sqrt in rmsnorm simt example

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index fec01095c9..0bee1b0df7 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -87,7 +87,7 @@ def rmsnorm_simt_token_body(
         thread_offset=0,
     )
 
-    rstd = 1.0 / scalar.sqrt(sum_sq / hidden_size + eps)
+    rstd = 1.0 / pto.sqrt(sum_sq / hidden_size + eps)
 
     scalar.store(rstd, rstd_ub, pingpong * 8)
 

From 28fa9efc5284bea4f491a92601bfb7faa1b4169f Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Tue, 30 Jun 2026 16:19:52 +0800
Subject: [PATCH 32/37] refactor(ptodsl): make alloc_buffer local-only

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/README.md                              |  3 +-
 .../user_guide/04-type-system-and-buffer.md   | 20 ++---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py  | 20 ++---
 ptodsl/ptodsl/_jit.py                         | 16 ++++
 ptodsl/ptodsl/_ops.py                         | 81 +++++--------------
 ptodsl/ptodsl/_surface_values.py              |  6 --
 ptodsl/ptodsl/_tracing/module_builder.py      |  1 +
 ptodsl/ptodsl/_tracing/session.py             | 37 +--------
 ptodsl/ptodsl/scalar.py                       |  2 +-
 ptodsl/tests/test_allreduce.py                | 18 -----
 ptodsl/tests/test_jit_compile.py              | 78 ++++--------------
 ptodsl/tests/test_rmsnorm_example_compile.py  |  2 +-
 12 files changed, 79 insertions(+), 205 deletions(-)

diff --git a/ptodsl/README.md b/ptodsl/README.md
index 511db83824..30a76b8dd0 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -155,7 +155,8 @@ python3 ptodsl/examples/flash_attention_softmax_launch.py
 ### `rmsnorm_alloc_buffer_simt.py`
 
 Compile-only RMSNorm example for explicit-mode SIMT kernels. It exercises
-`pto.alloc_buffer(...)`, contiguous `scalar.load` / `scalar.store`, `pto.vec`,
+SIMT-local `pto.alloc_buffer(...)`, hand-authored dynamic UB scratch offsets,
+contiguous `scalar.load` / `scalar.store`, `pto.vec`,
 `pto.simt_allreduce_sum(...)`, explicit pipe `set_flag` / `wait_flag` sync,
 and a runtime token loop that lowers to `scf.for`.
 
diff --git a/ptodsl/docs/user_guide/04-type-system-and-buffer.md b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
index 844cc0a49c..3c0506bb77 100644
--- a/ptodsl/docs/user_guide/04-type-system-and-buffer.md
+++ b/ptodsl/docs/user_guide/04-type-system-and-buffer.md
@@ -177,31 +177,27 @@ ptr_ub  = pto.ptr(pto.f16, pto.MemorySpace.UB)
 
 ## 4.5 Explicit scratch buffers
 
-Allocate explicit scratch storage for pointer-style load, store, and data movement operations.
+Allocate SIMT lane-local scratch storage for pointer-style load and store
+operations inside a SIMT helper.
 
 ```text
-pto.alloc_buffer(shape, dtype, *, scope="ub")
+pto.alloc_buffer(shape, dtype)
 ```
 
 <!-- ptodsl-doc-pending: {"reason":"illustrative fragment; covered by test_jit_compile alloc_buffer probes"} -->
 ```python
-ub_scratch = pto.alloc_buffer((4096,), pto.f32, scope="ub")
-fragment = pto.alloc_buffer((32,), pto.f32, scope="local")
+scratch = pto.alloc_buffer((32,), pto.f32)
 ```
 
 | Parameter | Description |
 |-----------|-------------|
 | `shape` | Static positive integer shape. Pass an `int`, `tuple[int, ...]`, or `list[int]`. |
 | `dtype` | Element type of the returned buffer, such as `pto.f32` or `pto.i32`. |
-| `scope` | Scratch storage kind. Use `"ub"` or `"local"`. |
 
-| Scope | Meaning | Returned value |
-|-------|---------|----------------|
-| `"ub"` | Function-level Unified Buffer scratch, typically used by data movement operations or shared SIMT scratch. | Typed UB pointer |
-| `"local"` | SIMT-helper local scratch for per-workitem temporary fragments. | Typed local pointer |
-
-A `"ub"` buffer is available throughout the generated kernel body. A `"local"`
-buffer is available only inside the SIMT helper invocation that allocates it.
+The returned pointer names a local allocation in the SIMT helper invocation
+that allocates it. Use this for per-workitem temporary fragments, scalar
+scratch values, or staged values that are accessed through pointer-style loads
+and stores.
 
 ## 4.6 TensorView
 
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 0bee1b0df7..4c53b37a2e 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -11,7 +11,8 @@
 
 The example exercises the PTODSL surfaces needed by the RMSNorm SimtVF kernel:
 
-- ``pto.alloc_buffer(...)`` for UB scratch and lane-local storage
+- ``pto.alloc_buffer(...)`` for lane-local SIMT fragment storage
+- hand-authored dynamic UB scratch layout via ``pto.castptr`` / ``pto.addptr``
 - contiguous scalar ``load`` / ``store`` vector accesses
 - ``pto.simt_allreduce_sum(...)`` for cross-workitem sum reduction
 - W stays in UB after the GM->UB preload and is read directly by the token SIMT body
@@ -58,8 +59,8 @@ def rmsnorm_simt_token_body(
 ):
     tx = pto.get_tid_x()
     frag_elems: pto.const_expr = rounds * lanes
-    x_frag = pto.alloc_buffer((frag_elems,), pto.f32, scope="local")
-    sum_sq = pto.alloc_buffer((1,), pto.f32, scope="local")
+    x_frag = pto.alloc_buffer((frag_elems,), pto.f32)
+    sum_sq = pto.alloc_buffer((1,), pto.f32)
 
     for r in range(0, rounds):
         lane_offset = r * threads * lanes + tx * lanes
@@ -105,7 +106,7 @@ def rmsnorm_simt_token_body(
         scalar.store(y_vec, y_ub, y_offset)
 
 
-@pto.jit(target="a5", mode="explicit")
+@pto.jit(target="a5", mode="explicit", dyn_shared_memory_buf=82496)
 def rmsnorm_4096_alloc_buffer_simt_context_kernel(
     X: pto.ptr(pto.f32, "gm"),
     Y: pto.ptr(pto.f32, "gm"),
@@ -127,11 +128,12 @@ def rmsnorm_4096_alloc_buffer_simt_context_kernel(
 
     core_id = pto.get_block_idx()
 
-    w_ub = pto.alloc_buffer((hidden_size,), pto.f32, scope="ub")
-    x_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
-    y_ub = pto.alloc_buffer((2, hidden_size), pto.f32, scope="ub")
-    rstd_ub = pto.alloc_buffer((2, 8), pto.f32, scope="ub")
-    reduce_scratch = pto.alloc_buffer((threads,), pto.f32, scope="ub")
+    ub_base = pto.castptr(pto.const(0, dtype=pto.ui64), pto.ptr(pto.f32, "ub"))
+    w_ub = pto.addptr(ub_base, 0)
+    reduce_scratch = pto.addptr(ub_base, hidden_size)
+    x_ub = pto.addptr(ub_base, hidden_size + 128)
+    y_ub = pto.addptr(ub_base, hidden_size + 128 + 2 * hidden_size)
+    rstd_ub = pto.addptr(ub_base, hidden_size + 128 + 4 * hidden_size)
 
     pto.mte_gm_ub(
         W,
diff --git a/ptodsl/ptodsl/_jit.py b/ptodsl/ptodsl/_jit.py
index 5c23bc97f0..dbfe9756c2 100644
--- a/ptodsl/ptodsl/_jit.py
+++ b/ptodsl/ptodsl/_jit.py
@@ -76,6 +76,16 @@ def _normalize_backend(backend: str, *, fn=None) -> str:
     return backend
 
 
+def _normalize_dyn_shared_memory_buf(value):
+    if value is None:
+        return None
+    if isinstance(value, bool) or not isinstance(value, int):
+        raise TypeError("@pto.jit dyn_shared_memory_buf must be a non-negative integer byte count")
+    if value < 0:
+        raise ValueError("@pto.jit dyn_shared_memory_buf must be non-negative")
+    return value
+
+
 def _module_attr_map(module):
     attrs = module.operation.attributes
     return {name: str(attrs[name]) for name in _MODULE_ATTRS if name in attrs}
@@ -167,6 +177,7 @@ def jit(
     entry: bool = True,
     mode: str = "auto",
     insert_sync: bool | None = None,
+    dyn_shared_memory_buf: int | None = None,
     ast_rewrite: bool | None = None,
     frontend_options: Mapping | None = None,
 ):
@@ -187,6 +198,9 @@ def jit(
     insert_sync: ``True``/``False`` to explicitly control PTOAS sync insertion
                  for launch builds. ``None`` keeps the mode-based default
                  behavior.
+    dyn_shared_memory_buf:
+                 Dynamic UB scratch byte count to attach to the entry function
+                 and pass to native launch code.
     ast_rewrite:
                  ``True`` enables AST rewriting of Python ``if`` /
                  ``for range(...)`` into device-side PTODSL control flow.
@@ -208,6 +222,7 @@ def jit(
         ast_rewrite=ast_rewrite,
         frontend_options=frontend_options,
     )
+    normalized_dyn_shared_memory_buf = _normalize_dyn_shared_memory_buf(dyn_shared_memory_buf)
 
     def decorator(fn):
         fn_name = name or fn.__name__
@@ -229,6 +244,7 @@ def decorator(fn):
                 entry=entry,
                 mode=normalized_mode,
                 insert_sync=insert_sync,
+                dyn_shared_memory_buf=normalized_dyn_shared_memory_buf,
                 module_style=ModuleStyle.BACKEND_PARTITIONED,
                 source_file=source_file,
                 source_line=getattr(fn.__code__, "co_firstlineno", None),
diff --git a/ptodsl/ptodsl/_ops.py b/ptodsl/ptodsl/_ops.py
index 1e08778e28..c232dccec1 100644
--- a/ptodsl/ptodsl/_ops.py
+++ b/ptodsl/ptodsl/_ops.py
@@ -2319,50 +2319,34 @@ def _tile_transfer_partition(tv, tile, *, offsets=None, sizes=None, context: str
     return partition_view(tv, offsets=normalized_offsets, sizes=normalized_sizes)
 
 
-def alloc_buffer(shape, dtype, *, scope="ub"):
+def alloc_buffer(shape, dtype, **kwargs):
     """
-    Allocate explicit scratch storage and return an address-like surface value.
+    Allocate SIMT lane-local scratch storage and return an address-like value.
 
-    ``scope="ub"`` reserves a byte range in the function-level UB scratch area
-    and returns a typed PTO pointer. ``scope="local"`` emits an LLVM stack
-    allocation for SIMT lane-local fragment storage. Access lowering for local
-    buffers is intentionally left to the scalar/vector load-store surfaces.
+    The allocation emits an LLVM stack allocation in the surrounding SIMT
+    helper. UB scratch uses explicit ``pto.castptr`` / ``pto.addptr`` pointer
+    authoring and ``@pto.jit(dyn_shared_memory_buf=...)`` launch metadata.
     """
+    if kwargs:
+        unexpected = ", ".join(sorted(kwargs))
+        raise TypeError(
+            f"pto.alloc_buffer(...) does not accept keyword argument(s): {unexpected}. "
+            "It only allocates SIMT local buffers; author UB scratch explicitly with "
+            "pto.castptr/pto.addptr and @pto.jit(dyn_shared_memory_buf=...)."
+        )
     _require_explicit_mode("pto.alloc_buffer(...)")
-    normalized_scope = _normalize_alloc_buffer_scope(scope)
     element_type = _resolve(dtype)
     element_count = _static_alloc_buffer_element_count(shape)
     elem_bytes = _element_bytewidth(element_type)
     byte_size = element_count * elem_bytes
 
-    if normalized_scope == "ub":
-        return _alloc_ub_buffer(
-            shape,
-            dtype,
-            element_type,
-            element_count,
-            byte_size,
-        )
-    if normalized_scope == "local":
-        return _alloc_local_buffer(
-            shape,
-            dtype,
-            element_type,
-            element_count,
-            byte_size,
-        )
-    raise AssertionError(f"unhandled alloc_buffer scope {normalized_scope!r}")
-
-
-def _normalize_alloc_buffer_scope(scope):
-    if not isinstance(scope, str):
-        raise TypeError("pto.alloc_buffer(..., scope=...) expects 'ub' or 'local'")
-    normalized = scope.strip().lower()
-    if normalized == "ub":
-        return "ub"
-    if normalized == "local":
-        return "local"
-    raise ValueError("pto.alloc_buffer(..., scope=...) expects one of 'ub' or 'local'")
+    return _alloc_local_buffer(
+        shape,
+        dtype,
+        element_type,
+        element_count,
+        byte_size,
+    )
 
 
 def _static_alloc_buffer_element_count(shape):
@@ -2390,32 +2374,6 @@ def _static_alloc_buffer_element_count(shape):
     return count
 
 
-def _alloc_ub_buffer(shape, dtype, element_type, element_count, byte_size):
-    from ._tracing.active import current_session
-
-    session = current_session()
-    if session is None:
-        raise RuntimeError("pto.alloc_buffer(scope='ub') may only be used while tracing a PTODSL kernel")
-
-    byte_offset = session.allocate_ub_scratch(byte_size, alignment=32)
-    ub_base_i8 = wrap_surface_value(session.get_or_create_ub_base_i8_ptr())
-    if byte_offset:
-        ptr_i8_value = addptr(ub_base_i8, arith.ConstantOp(IndexType.get(), byte_offset).result)
-    else:
-        ptr_i8_value = ub_base_i8
-    ptr_value = castptr(ptr_i8_value, ptr(element_type, "ub"))
-    return AllocatedBufferValue(
-        unwrap_surface_value(ptr_value),
-        scope="ub",
-        shape=_normalize_alloc_buffer_shape_metadata(shape),
-        dtype=dtype,
-        element_type=element_type,
-        element_count=element_count,
-        byte_size=byte_size,
-        byte_offset=byte_offset,
-    )
-
-
 def _alloc_local_buffer(shape, dtype, element_type, element_count, byte_size):
     i32 = IntegerType.get_signless(32)
     count = _materialize_integer_literal(i32, element_count)
@@ -2430,7 +2388,6 @@ def _alloc_local_buffer(shape, dtype, element_type, element_count, byte_size):
     ).results[0]
     return AllocatedBufferValue(
         alloca,
-        scope="local",
         shape=_normalize_alloc_buffer_shape_metadata(shape),
         dtype=dtype,
         element_type=element_type,
diff --git a/ptodsl/ptodsl/_surface_values.py b/ptodsl/ptodsl/_surface_values.py
index a05f2d85ee..e829ad5cfe 100644
--- a/ptodsl/ptodsl/_surface_values.py
+++ b/ptodsl/ptodsl/_surface_values.py
@@ -352,33 +352,27 @@ def __init__(
         self,
         value,
         *,
-        scope,
         shape,
         dtype,
         element_type,
         element_count,
         byte_size,
-        byte_offset=None,
     ):
         super().__init__(value)
-        self.scope = scope
         self.shape = tuple(shape)
         self.dtype = dtype
         self.element_type = element_type
         self.element_count = element_count
         self.byte_size = byte_size
-        self.byte_offset = byte_offset
 
     @property
     def surface_metadata(self):
         return {
-            "scope": self.scope,
             "shape": self.shape,
             "dtype": self.dtype,
             "element_type": self.element_type,
             "element_count": self.element_count,
             "byte_size": self.byte_size,
-            "byte_offset": self.byte_offset,
         }
 
 
diff --git a/ptodsl/ptodsl/_tracing/module_builder.py b/ptodsl/ptodsl/_tracing/module_builder.py
index f15f62e398..715f45b082 100644
--- a/ptodsl/ptodsl/_tracing/module_builder.py
+++ b/ptodsl/ptodsl/_tracing/module_builder.py
@@ -35,6 +35,7 @@ class KernelModuleSpec:
     entry: bool = True
     mode: str = "auto"
     insert_sync: bool | None = None
+    dyn_shared_memory_buf: int | None = None
     module_style: ModuleStyle = ModuleStyle.NESTED
     source_file: str | None = None
     source_line: int | None = None
diff --git a/ptodsl/ptodsl/_tracing/session.py b/ptodsl/ptodsl/_tracing/session.py
index 49f6b76b05..0738afcc9a 100644
--- a/ptodsl/ptodsl/_tracing/session.py
+++ b/ptodsl/ptodsl/_tracing/session.py
@@ -153,8 +153,6 @@ def __init__(self, module_spec, module, entry_function):
         }
         self._subkernel_stack: list[SubkernelTraceFrame] = []
         self._carry_loop_stack = []
-        self._ub_base_i8_ptr = None
-        self._ub_scratch_next_byte = 0
         self._inline_subkernel_counter = 0
         self._escaped_inline_values: dict[object, tuple[str, str]] = {}
 
@@ -207,32 +205,6 @@ def bind_entry_block(self, entry_block) -> None:
         """Record the root entry block for the active trace."""
         self.entry_block = entry_block
 
-    @property
-    def ub_scratch_size(self) -> int:
-        return self._ub_scratch_next_byte
-
-    def get_or_create_ub_base_i8_ptr(self):
-        """Return the shared UB byte-base pointer for explicit scratch buffers."""
-        if self._ub_base_i8_ptr is not None:
-            return self._ub_base_i8_ptr
-        from .._ops import castptr
-        from .._types import int8, ptr
-
-        i64 = IntegerType.get_signless(64)
-        zero = arith.ConstantOp(i64, 0).result
-        self._ub_base_i8_ptr = castptr(zero, ptr(int8, "ub")).value
-        return self._ub_base_i8_ptr
-
-    def allocate_ub_scratch(self, byte_size: int, *, alignment: int = 32) -> int:
-        """Reserve one aligned byte range in the function-level UB scratch area."""
-        if not isinstance(byte_size, int) or byte_size <= 0:
-            raise ValueError(f"UB scratch allocation expects a positive byte size, got {byte_size!r}")
-        if not isinstance(alignment, int) or alignment <= 0:
-            raise ValueError(f"UB scratch allocation expects a positive alignment, got {alignment!r}")
-        offset = _align_up(self._ub_scratch_next_byte, alignment)
-        self._ub_scratch_next_byte = offset + byte_size
-        return offset
-
     def validate_surface_value_access(self, value) -> None:
         """Reject inline-subkernel SSA values that escaped their outlined helper body."""
         record = self._escaped_inline_values.get(value)
@@ -878,18 +850,15 @@ def validate_final_state(self) -> None:
             raise RuntimeError("PTODSL trace-session exited with an open subkernel lowering frame")
         if self._carry_loop_stack:
             raise RuntimeError("PTODSL trace-session exited with an open loop-carry lowering frame")
-        if self._ub_scratch_next_byte:
+        dyn_shared_memory_buf = getattr(self.module_spec, "dyn_shared_memory_buf", None)
+        if dyn_shared_memory_buf:
             i64 = IntegerType.get_signless(64)
             self.entry_function.attributes["dyn_shared_memory_buf"] = IntegerAttr.get(
                 i64,
-                _align_up(self._ub_scratch_next_byte, 32),
+                dyn_shared_memory_buf,
             )
 
 
-def _align_up(value: int, alignment: int) -> int:
-    return ((value + alignment - 1) // alignment) * alignment
-
-
 def _coerce_simt_launch_dims(dims):
     if not isinstance(dims, (tuple, list)) or len(dims) != 3:
         raise TypeError("pto.simt_launch(..., dims=...) expects a 3-item (dim_x, dim_y, dim_z) tuple")
diff --git a/ptodsl/ptodsl/scalar.py b/ptodsl/ptodsl/scalar.py
index 10132294ad..ce9ed453b5 100644
--- a/ptodsl/ptodsl/scalar.py
+++ b/ptodsl/ptodsl/scalar.py
@@ -202,7 +202,7 @@ def _allocated_buffer_target(target):
 
 
 def _is_local_allocated_buffer(allocated_buffer) -> bool:
-    return allocated_buffer is not None and allocated_buffer.scope == "local"
+    return allocated_buffer is not None
 
 
 def _infer_buffer_element_type(buffer_type, *, allocated_buffer=None):
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
index c427a96314..f9262bda0c 100644
--- a/ptodsl/tests/test_allreduce.py
+++ b/ptodsl/tests/test_allreduce.py
@@ -310,24 +310,6 @@ def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled.verify()
 
-    # ── issue 483 integration: alloc_buffer(scope="ub") scratch ─────────────
-    @pto.jit(target="a5", mode="explicit")
-    def kernel_alloc_buffer_scratch():
-        reduce_scratch = pto.alloc_buffer((128,), pto.f32, scope="ub")
-        with pto.simt():
-            x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, reduce_scratch, threads=128, scale=1)
-
-    compiled_alloc = kernel_alloc_buffer_scratch.compile()
-    mlir_alloc = compiled_alloc.mlir_text()
-    expect("dyn_shared_memory_buf = 512 : i64" in mlir_alloc,
-           "IR: alloc_buffer scratch reserves 128 f32 elements in UB")
-    expect("call @__tl_allreduce_sum_f32_t128_s1_o0" in mlir_alloc,
-           "IR: alloc_buffer scratch can be passed to simt_allreduce_sum")
-    expect("!pto.ptr<f32, ub>" in mlir_alloc,
-           "IR: allreduce scratch keeps typed UB pointer")
-    compiled_alloc.verify()
-
     # ── cross_warp: sum, f32, t=64 (2 warps) ────────────────────────────────
     @pto.jit(target="a5")
     def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index 34f96cbc83..dd759bcf0a 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -875,19 +875,9 @@ def simt_helper_lowering_probe(*, TRACE_TOKEN: pto.const_expr = 0):
     simt_tid_probe()
 
 
-@pto.jit(target="a5", mode="explicit")
-def alloc_buffer_ub_probe(
-    A_ptr: pto.ptr(pto.f32, "gm"),
-    O_ptr: pto.ptr(pto.f32, "gm"),
-):
-    scratch = pto.alloc_buffer((64,), pto.f32, scope="ub")
-    pto.mte_gm_ub(A_ptr, scratch, 0, 256, nburst=(1, 0, 0))
-    pto.mte_ub_gm(scratch, O_ptr, 256, nburst=(1, 0, 0))
-
-
 @pto.simt
 def alloc_buffer_local_helper():
-    _ = pto.alloc_buffer((32,), pto.f32, scope="local")
+    _ = pto.alloc_buffer((32,), pto.f32)
 
 
 @pto.jit(target="a5", mode="explicit")
@@ -895,16 +885,6 @@ def alloc_buffer_local_probe():
     alloc_buffer_local_helper()
 
 
-@pto.jit(target="a5", mode="explicit")
-def alloc_buffer_vec_scope_probe():
-    _ = pto.alloc_buffer((1,), pto.f32, scope="vec")
-
-
-@pto.jit(target="a5", mode="explicit")
-def alloc_buffer_private_scope_probe():
-    _ = pto.alloc_buffer((1,), pto.f32, scope="private")
-
-
 @pto.simt
 def rmsnorm_alloc_buffer_frag_helper(
     w_ub: pto.ptr(pto.f32, pto.MemorySpace.UB),
@@ -913,22 +893,23 @@ def rmsnorm_alloc_buffer_frag_helper(
     _ = pto.get_tid_x()
     _ = w_ub
     _ = x_ub
-    _ = pto.alloc_buffer((32,), pto.f32, scope="local")
-    _ = pto.alloc_buffer((1,), pto.f32, scope="local")
+    _ = pto.alloc_buffer((32,), pto.f32)
+    _ = pto.alloc_buffer((1,), pto.f32)
 
 
-@pto.jit(target="a5", mode="explicit")
+@pto.jit(target="a5", mode="explicit", dyn_shared_memory_buf=82496)
 def rmsnorm_alloc_buffer_layout_probe(
     X: pto.ptr(pto.f32, "gm"),
     W: pto.ptr(pto.f32, "gm"),
     Y: pto.ptr(pto.f32, "gm"),
     RSTD: pto.ptr(pto.f32, "gm"),
 ):
-    w_ub = pto.alloc_buffer((4096,), pto.f32, scope="ub")
-    x_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
-    y_ub = pto.alloc_buffer((2, 4096), pto.f32, scope="ub")
-    rstd_ub = pto.alloc_buffer((2, 8), pto.f32, scope="ub")
-    reduce_scratch = pto.alloc_buffer((128,), pto.f32, scope="ub")
+    ub_base = pto.castptr(pto.const(0, dtype=pto.ui64), pto.ptr(pto.f32, "ub"))
+    w_ub = pto.addptr(ub_base, 0)
+    reduce_scratch = pto.addptr(ub_base, 4096)
+    x_ub = pto.addptr(ub_base, 4224)
+    y_ub = pto.addptr(ub_base, 12416)
+    rstd_ub = pto.addptr(ub_base, 20608)
 
     pto.mte_gm_ub(W, w_ub, 0, 4096 * 4, nburst=(1, 0, 0))
     pto.mte_gm_ub(X, x_ub, 0, 4096 * 4, nburst=(1, 0, 0))
@@ -1649,7 +1630,7 @@ def scalar_contiguous_vector_probe():
 
 @pto.simt
 def scalar_contiguous_local_alloc_buffer_helper():
-    data = pto.alloc_buffer((16,), pto.f32, scope="local")
+    data = pto.alloc_buffer((16,), pto.f32)
     x4 = scalar.load(data, 0, contiguous=4)
     scale4 = pto.vec(pto.f32, 4, init=1.0)
     y4 = x4 * scale4
@@ -4058,26 +4039,11 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     expect("pto.get_tid_y" in simt_text, "SIMT helper body should contain pto.get_tid_y")
     expect("pto.get_tid_z" in simt_text, "SIMT helper body should contain pto.get_tid_z")
 
-    alloc_buffer_ub_text = alloc_buffer_ub_probe.compile().mlir_text()
-    expect_parse_roundtrip_and_verify(alloc_buffer_ub_text, "alloc_buffer UB specialization")
-    expect(
-        "dyn_shared_memory_buf = 256 : i64" in alloc_buffer_ub_text,
-        "alloc_buffer(scope='ub') should size the function-level UB scratch area",
-    )
-    expect(
-        "pto.castptr %c0_i64" in alloc_buffer_ub_text and "!pto.ptr<i8, ub>" in alloc_buffer_ub_text,
-        "alloc_buffer(scope='ub') should materialize a shared UB byte-base pointer",
-    )
-    expect(
-        "pto.mte_gm_ub" in alloc_buffer_ub_text and "pto.mte_ub_gm" in alloc_buffer_ub_text,
-        "alloc_buffer(scope='ub') result should be accepted by explicit MTE helpers",
-    )
-
     alloc_buffer_local_text = alloc_buffer_local_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(alloc_buffer_local_text, "alloc_buffer local specialization")
     expect(
         "llvm.alloca" in alloc_buffer_local_text and "x f32" in alloc_buffer_local_text,
-        "alloc_buffer(scope='local') should lower to an LLVM stack allocation in the SIMT helper",
+        "alloc_buffer should lower to an LLVM stack allocation in the SIMT helper",
     )
     expect(
         re.search(
@@ -4085,28 +4051,18 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
             alloc_buffer_local_text,
         )
         is not None,
-        "alloc_buffer(scope='local') probe should keep allocation inside the SIMT helper body",
-    )
-    expect_raises(
-        ValueError,
-        lambda: alloc_buffer_vec_scope_probe.compile(),
-        "expects one of 'ub' or 'local'",
-    )
-    expect_raises(
-        ValueError,
-        lambda: alloc_buffer_private_scope_probe.compile(),
-        "expects one of 'ub' or 'local'",
+        "alloc_buffer probe should keep allocation inside the SIMT helper body",
     )
     rmsnorm_alloc_buffer_text = rmsnorm_alloc_buffer_layout_probe.compile().mlir_text()
-    expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm alloc_buffer layout specialization")
+    expect_parse_roundtrip_and_verify(rmsnorm_alloc_buffer_text, "RMSNorm hand-authored UB layout specialization")
     expect(
         "dyn_shared_memory_buf = 82496 : i64" in rmsnorm_alloc_buffer_text,
-        "RMSNorm alloc_buffer layout should reserve the same UB scratch size as the expanded RMSNorm kernel",
+        "RMSNorm hand-authored UB layout should declare the expanded RMSNorm kernel scratch size",
     )
-    for expected_offset in (16384, 49152, 81920, 81984):
+    for expected_offset in (4096, 4224, 12416, 20608):
         expect(
             f"arith.constant {expected_offset} : index" in rmsnorm_alloc_buffer_text,
-            f"RMSNorm alloc_buffer layout should materialize UB byte offset {expected_offset}",
+            f"RMSNorm hand-authored UB layout should materialize f32 offset {expected_offset}",
         )
     expect(
         rmsnorm_alloc_buffer_text.count("llvm.alloca") == 2,
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 3bb6fd0740..6fb4973ebc 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -166,7 +166,7 @@ def main() -> None:
         label="x64",
         vector_type="vector<4xf32>",
         helper_name_fragment="__tl_allreduce_sum_f32_t64_s1_o0",
-        ub_size=82240,
+        ub_size=82496,
     )
 
     print("ptodsl_rmsnorm_example_compile: PASS")

From 102e0083043ae1213e83f7b26bf311628aaf44ac Mon Sep 17 00:00:00 2001
From: default <default>
Date: Tue, 30 Jun 2026 20:04:04 +0800
Subject: [PATCH 33/37] test(ptodsl): add manual dyn-UB RMSNorm launch

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ...rmsnorm_alloc_buffer_simt_manual_launch.py | 293 ++++++++++++++++++
 1 file changed, 293 insertions(+)
 create mode 100644 ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py
new file mode 100644
index 0000000000..4c4386a085
--- /dev/null
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py
@@ -0,0 +1,293 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+"""
+Launch and validate the RMSNorm alloc_buffer/SIMT example with a hand-written
+host wrapper that passes dynamic UB bytes explicitly.
+
+This is intentionally a bypass of PTODSL's ``compiled[grid, stream](...)``
+runtime launch path. The PTODSL kernel is still compiled to MLIR, then this
+script builds a custom ``launch.cpp`` containing:
+
+    kernel<<<grid, 82496, stream>>>(...)
+
+Use it to validate the kernel in environments where the generated PTODSL
+runtime wrapper does not yet consume ``dyn_shared_memory_buf``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import ctypes
+import hashlib
+from pathlib import Path
+import sys
+import time
+
+import numpy as np
+
+
+if __package__ in {None, ""}:
+    here = Path(__file__).resolve()
+    for candidate in here.parents:
+        if (candidate / "ptodsl" / "__init__.py").exists():
+            sys.path.insert(0, str(candidate))
+            break
+    else:
+        raise RuntimeError(
+            "Unable to locate the PTODSL Python package root from "
+            "rmsnorm_alloc_buffer_simt_manual_launch.py"
+        )
+
+
+from ptodsl._runtime.native_build import (  # noqa: E402
+    _compile_launch_cpp,
+    _effective_insert_sync,
+    _link_shared_library,
+    _run_ptoas,
+)
+
+from rmsnorm_alloc_buffer_simt_launch import (  # noqa: E402
+    _DEVICE,
+    _EPS,
+    _HIDDEN_SIZE,
+    _RSTD_GUARD_ELEMS,
+    _SENTINEL,
+    _Y_GUARD_ELEMS,
+    CASES,
+    FULL_CASE,
+    Case,
+    assert_guard_unchanged,
+    compile_kernel,
+    init_runtime,
+    make_inputs,
+    npu_stream,
+    rmsnorm_reference,
+)
+
+
+_DYN_SHARED_BYTES = 82496
+
+
+def _manual_launch_cpp(*, ir_function_name: str, launch_symbol: str, dyn_shared_bytes: int) -> str:
+    return f"""#include <stdint.h>
+
+#ifndef AICORE
+#define AICORE [aicore]
+#endif
+
+extern "C" __global__ AICORE void {ir_function_name}(
+    __gm__ float *X,
+    __gm__ float *Y,
+    __gm__ float *W,
+    __gm__ float *RSTD,
+    float eps);
+
+extern "C" void {launch_symbol}(
+    uint32_t grid,
+    void *stream,
+    float *X,
+    float *Y,
+    float *W,
+    float *RSTD,
+    float eps) {{
+  constexpr uint32_t dynSharedBytes = {int(dyn_shared_bytes)};
+  {ir_function_name}<<<grid, dynSharedBytes, stream>>>(
+      (__gm__ float *)X,
+      (__gm__ float *)Y,
+      (__gm__ float *)W,
+      (__gm__ float *)RSTD,
+      eps);
+}}
+"""
+
+
+def _manual_cache_dir(compiled, launch_cpp_text: str) -> Path:
+    payload = "\n".join([
+        compiled.mlir_text(),
+        launch_cpp_text,
+        repr(compiled.specialization_key),
+    ]).encode("utf-8")
+    digest = hashlib.sha256(payload).hexdigest()[:16]
+    return Path.home() / ".cache" / "ptodsl" / f"{compiled._py_name}_manual_dynub_{digest}"
+
+
+def build_manual_library(compiled, *, dyn_shared_bytes: int = _DYN_SHARED_BYTES) -> tuple[Path, str]:
+    module_spec = compiled._module_spec
+    ir_function_name = module_spec.function_name
+    launch_symbol = f"ptodsl_manual_launch_{ir_function_name}"
+
+    declared_dyn_shared = getattr(module_spec, "dyn_shared_memory_buf", None)
+    if declared_dyn_shared != dyn_shared_bytes:
+        raise RuntimeError(
+            f"expected @pto.jit dyn_shared_memory_buf={dyn_shared_bytes}, "
+            f"got {declared_dyn_shared!r}"
+        )
+
+    launch_cpp_text = _manual_launch_cpp(
+        ir_function_name=ir_function_name,
+        launch_symbol=launch_symbol,
+        dyn_shared_bytes=dyn_shared_bytes,
+    )
+    cache_dir = _manual_cache_dir(compiled, launch_cpp_text)
+    mlir_path = cache_dir / "kernel.mlir"
+    kernel_object = cache_dir / "kernel.o"
+    launch_cpp = cache_dir / "manual_launch.cpp"
+    launch_object = cache_dir / "manual_launch.o"
+    shared_library = cache_dir / f"lib{ir_function_name}_manual_dynub.so"
+
+    if shared_library.is_file():
+        return shared_library, launch_symbol
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    mlir_path.write_text(compiled.mlir_text(), encoding="utf-8")
+    launch_cpp.write_text(launch_cpp_text, encoding="utf-8")
+
+    _run_ptoas(
+        mlir_path,
+        kernel_object,
+        target_arch=module_spec.target_arch,
+        insert_sync=_effective_insert_sync(
+            mode=module_spec.mode,
+            insert_sync=module_spec.insert_sync,
+        ),
+    )
+    _compile_launch_cpp(
+        launch_cpp,
+        launch_object,
+        kernel_kind=module_spec.kernel_kind,
+        export_macro=f"{ir_function_name}_EXPORTS",
+    )
+    _link_shared_library(
+        launch_object,
+        kernel_object,
+        shared_library,
+        kernel_kind=module_spec.kernel_kind,
+    )
+    return shared_library, launch_symbol
+
+
+def _manual_launch(compiled, *, grid: int, stream, x_ptr: int, y_ptr: int, w_ptr: int, rstd_ptr: int, eps: float):
+    lib_path, launch_symbol = build_manual_library(compiled)
+    lib = ctypes.CDLL(str(lib_path))
+    launch = getattr(lib, launch_symbol)
+    launch.argtypes = [
+        ctypes.c_uint32,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_float,
+    ]
+    launch.restype = None
+    launch(
+        ctypes.c_uint32(grid),
+        ctypes.c_void_p(int(getattr(stream, "value", stream))),
+        ctypes.c_void_p(x_ptr),
+        ctypes.c_void_p(y_ptr),
+        ctypes.c_void_p(w_ptr),
+        ctypes.c_void_p(rstd_ptr),
+        ctypes.c_float(eps),
+    )
+
+
+def run_case_manual(case: Case, torch) -> None:
+    x, w = make_inputs(case)
+    y_ref, rstd_ref = rmsnorm_reference(x, w, _EPS)
+
+    x_t = torch.from_numpy(x).to(_DEVICE)
+    w_t = torch.from_numpy(w).to(_DEVICE)
+
+    y_storage = torch.full(
+        (case.tokens * _HIDDEN_SIZE + _Y_GUARD_ELEMS,),
+        float(_SENTINEL),
+        dtype=torch.float32,
+        device=_DEVICE,
+    )
+    rstd_storage = torch.full(
+        (case.tokens + _RSTD_GUARD_ELEMS,),
+        float(_SENTINEL),
+        dtype=torch.float32,
+        device=_DEVICE,
+    )
+
+    stream = npu_stream(torch)
+
+    t0 = time.perf_counter()
+    compiled = compile_kernel(case)
+    compile_s = time.perf_counter() - t0
+
+    t0 = time.perf_counter()
+    _manual_launch(
+        compiled,
+        grid=case.n_cores,
+        stream=stream,
+        x_ptr=x_t.data_ptr(),
+        y_ptr=y_storage.data_ptr(),
+        w_ptr=w_t.data_ptr(),
+        rstd_ptr=rstd_storage.data_ptr(),
+        eps=float(_EPS),
+    )
+    torch.npu.synchronize()
+    launch_s = time.perf_counter() - t0
+
+    y_out = y_storage[: case.tokens * _HIDDEN_SIZE].cpu().numpy().reshape(case.tokens, _HIDDEN_SIZE)
+    rstd_out = rstd_storage[: case.tokens].cpu().numpy()
+    y_guard = y_storage[case.tokens * _HIDDEN_SIZE :].cpu().numpy()
+    rstd_guard = rstd_storage[case.tokens :].cpu().numpy()
+
+    np.testing.assert_allclose(rstd_out, rstd_ref, rtol=case.rtol, atol=case.rstd_atol)
+    np.testing.assert_allclose(y_out, y_ref, rtol=case.rtol, atol=case.y_atol)
+    assert_guard_unchanged("Y", y_guard)
+    assert_guard_unchanged("RSTD", rstd_guard)
+
+    y_diff = float(np.max(np.abs(y_out - y_ref))) if y_out.size else 0.0
+    rstd_diff = float(np.max(np.abs(rstd_out - rstd_ref))) if rstd_out.size else 0.0
+    simt_config = getattr(case, "simt_config", "threads=128 rounds=8 lanes=4")
+    print(
+        f"PASS {case.name} manual-dynub  "
+        f"grid={case.n_cores} tokens={case.tokens} {simt_config} "
+        f"dynSharedBytes={_DYN_SHARED_BYTES} "
+        f"compile={compile_s:.3f}s launch={launch_s:.3f}s "
+        f"max|Y|={y_diff:.3e} max|RSTD|={rstd_diff:.3e}"
+    )
+
+
+def main(argv=None) -> int:
+    import rmsnorm_alloc_buffer_simt_launch as base_launch
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--device", default=_DEVICE, help="torch NPU device, default: npu:0")
+    parser.add_argument(
+        "--case",
+        choices=[case.name for case in CASES] + [FULL_CASE.name, "all"],
+        default="all",
+    )
+    parser.add_argument("--include-full", action="store_true", help="include the 64-core x 64-token full case")
+    args = parser.parse_args(argv)
+
+    base_launch._DEVICE = args.device
+    globals()["_DEVICE"] = args.device
+
+    selected = list(CASES)
+    if args.include_full:
+        selected.append(FULL_CASE)
+    if args.case != "all":
+        all_cases = {case.name: case for case in selected + [FULL_CASE]}
+        selected = [all_cases[args.case]]
+
+    torch = init_runtime()
+    for case in selected:
+        run_case_manual(case, torch)
+    print("All RMSNorm manual dynamic-UB cases passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 59ac739cca65022f2ef81952af34d342ba5c0b4f Mon Sep 17 00:00:00 2001
From: kuri780 <185585386+kuri780@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:16:01 +0800
Subject: [PATCH 34/37] feat(ptodsl): inline SIMT allreduce implementation

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/examples/rmsnorm_alloc_buffer_simt.py  |   2 +-
 ptodsl/ptodsl/_allreduce.py                   | 896 +++++-------------
 ptodsl/ptodsl/pto.py                          |   2 +-
 ptodsl/tests/test_allreduce.py                | 321 +++++--
 ptodsl/tests/test_rmsnorm_example_compile.py  |  13 +-
 .../simt/allreduce_cross_max/compare.py       |  15 +
 .../simt/allreduce_cross_max/golden.py        |  22 +
 .../simt/allreduce_cross_max/kernel.pto       |  65 ++
 .../simt/allreduce_cross_max/launch.cpp       |  11 +
 .../simt/allreduce_cross_max/main.cpp         |  43 +
 .../simt/allreduce_cross_min/compare.py       |  15 +
 .../simt/allreduce_cross_min/golden.py        |  22 +
 .../simt/allreduce_cross_min/kernel.pto       |  65 ++
 .../simt/allreduce_cross_min/launch.cpp       |  11 +
 .../simt/allreduce_cross_min/main.cpp         |  43 +
 .../simt/allreduce_cross_sum/compare.py       |  15 +
 .../simt/allreduce_cross_sum/golden.py        |  22 +
 .../simt/allreduce_cross_sum/kernel.pto       |  65 ++
 .../simt/allreduce_cross_sum/launch.cpp       |  11 +
 .../simt/allreduce_cross_sum/main.cpp         |  43 +
 .../simt/allreduce_warp_max/compare.py        |  15 +
 .../simt/allreduce_warp_max/golden.py         |  22 +
 .../simt/allreduce_warp_max/kernel.pto        |  21 +
 .../simt/allreduce_warp_max/launch.cpp        |  11 +
 .../micro-op/simt/allreduce_warp_max/main.cpp |  43 +
 .../simt/allreduce_warp_min/compare.py        |  15 +
 .../simt/allreduce_warp_min/golden.py         |  22 +
 .../simt/allreduce_warp_min/kernel.pto        |  21 +
 .../simt/allreduce_warp_min/launch.cpp        |  11 +
 .../micro-op/simt/allreduce_warp_min/main.cpp |  43 +
 .../simt/allreduce_warp_sum/compare.py        |  15 +
 .../simt/allreduce_warp_sum/golden.py         |  22 +
 .../simt/allreduce_warp_sum/kernel.pto        |  21 +
 .../simt/allreduce_warp_sum/launch.cpp        |  11 +
 .../micro-op/simt/allreduce_warp_sum/main.cpp |  43 +
 35 files changed, 1296 insertions(+), 742 deletions(-)
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
 create mode 100644 test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp

diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
index 4c53b37a2e..eae64dde01 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
+++ b/ptodsl/examples/rmsnorm_alloc_buffer_simt.py
@@ -82,10 +82,10 @@ def rmsnorm_simt_token_body(
 
     sum_sq = pto.simt_allreduce_sum(
         local_sum,
-        reduce_scratch,
         threads=threads,
         scale=1,
         thread_offset=0,
+        scratch=reduce_scratch,
     )
 
     rstd = 1.0 / pto.sqrt(sum_sq / hidden_size + eps)
diff --git a/ptodsl/ptodsl/_allreduce.py b/ptodsl/ptodsl/_allreduce.py
index 02298d3f90..3c6637bdb4 100644
--- a/ptodsl/ptodsl/_allreduce.py
+++ b/ptodsl/ptodsl/_allreduce.py
@@ -6,250 +6,272 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 """
-SIMT cross-workitem all-reduce helpers.
+SIMT cross-workitem all-reduce.
 
-Implements ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``
-as PTO IR helper functions that are lazily emitted into the trace module.
+All-reduce ops are emitted **inline** at the current insertion point.
+Three reducer variants: ``simt_allreduce_sum``, ``simt_allreduce_max``, ``simt_allreduce_min``.
 
-Public entry point: ``simt_allreduce_sum(value, scratch=None, *, threads, scale, thread_offset)``,
-callable from within a ``@pto.simt`` context.
+Dispatch tree (compile-time, since *threads* / *scale* are Python ints)::
 
-Dispatch tree (mirrors the C++ compile-time dispatch in ``reduce.h``)::
-
-    threads <= scale                                    → identity
-    threads ≤ 32,  pow2(threads), pow2(scale)           → warp_reduce
-    threads ≤ 32                                        → ub_reduce
-    threads > 32, pow2(threads), scale ≤ 32, pow2(scale) → cross_warp_reduce
-    otherwise                                           → ub_reduce
+    threads <= scale                                       →  identity
+    threads ≤ 32,  pow2(threads), pow2(scale)              →  warp_reduce
+    threads ≤ 32                                           →  ub_reduce
+    threads > 32,  pow2(threads), scale≤32, pow2(scale)   →  cross_warp_reduce
+    otherwise                                              →  ub_reduce (fallback)
 """
 
 from __future__ import annotations
 
-from ._surface_values import unwrap_surface_value, wrap_surface_value
-from ._tracing.active import require_active_session
-from ._tracing.session import HelperFunctionSpec
+from . import scalar
+from ._control_flow import if_, for_
+from ._ops import const as _const, get_laneid, get_tid_x, redux_add, redux_max, redux_min, shuffle_bfly, syncthreads
+from ._surface_values import unwrap_surface_value
+from ._types import _resolve, float16 as _f16_dtype, float32 as _f32_dtype
 
-from mlir.dialects import arith, func, scf
-from mlir.dialects import pto as _pto
-from mlir.ir import F16Type, F32Type, IndexType, InsertionPoint, IntegerType, Operation, UnitAttr
+from mlir.ir import F16Type, F32Type
 
 
-# ═══════════════════════════════════════════════════════════════════════════════
-# helpers
-# ═══════════════════════════════════════════════════════════════════════════════
+# ── helpers ────────────────────────────────────────────────────────────────────
 
 def _is_pow2(n: int) -> bool:
+    """Compile-time power-of-two check."""
     return n > 0 and (n & (n - 1)) == 0
 
 
-def _helper_name(dtype: str, threads: int, scale: int, thread_offset: int) -> str:
-    """Canonical helper symbol name for a specific all-reduce instance.
+# ── reducer dispatch tables ────────────────────────────────────────────────────
+
+_REDUCER_IDENTITY = {
+    "sum": {"f32": 0.0, "f16": 0.0},
+    "max": {"f32": float("-inf"), "f16": float("-inf")},
+    "min": {"f32": float("inf"), "f16": float("inf")},
+}
 
-    Example: ``__tl_allreduce_sum_f32_t128_s1_o0``.
-    """
-    return f"__tl_allreduce_sum_{dtype}_t{threads}_s{scale}_o{thread_offset}"
+_REDUCER_COMBINE = {
+    "sum": lambda a, b: a + b,
+    "max": scalar.max,
+    "min": scalar.min,
+}
 
+_REDUCER_REDUX = {
+    "sum": redux_add,
+    "max": redux_max,
+    "min": redux_min,
+}
+
+
+# ── butterfly  ──────────────────────────────────────────────────────────────────
+
+def _emit_butterfly(v, *, threads: int, scale: int, reducer: str):
+    """Unrolled butterfly shuffle reduce."""
+    combine = _REDUCER_COMBINE[reducer]
+    cur = threads
+    while cur > scale:
+        offset = cur // 2
+        v = combine(v, shuffle_bfly(v, offset))
+        cur //= 2
+    return v
 
-def _dtype_to_str(mlir_type) -> str:
-    """Map an MLIR scalar type to a canonical dtype string."""
-    if mlir_type == F32Type.get():
-        return "f32"
-    if mlir_type == F16Type.get():
-        return "f16"
-    raise NotImplementedError(
-        f"all_reduce: unsupported dtype {mlir_type}"
-    )
 
+# ── warp_hw_reduce  ────────────────────────────────────────────────────────────
 
-def _mlir_scalar_type(dtype: str):
-    """Map a canonical dtype string back to an MLIR scalar type."""
-    if dtype == "f32":
-        return F32Type.get()
-    if dtype == "f16":
-        return F16Type.get()
-    raise NotImplementedError(
-        f"all_reduce: unsupported dtype {dtype!r}"
+def _emit_warp_hw_reduce(x, *, threads: int, lane_in_warp, dtype: str, reducer: str):
+    """Warp-level hardware reduce with group masking."""
+    redux_fn = _REDUCER_REDUX[reducer]
+    groups = 32 // threads
+
+    if groups == 1:
+        return redux_fn(x)
+
+    c_identity = _const(
+        _REDUCER_IDENTITY[reducer][dtype],
+        dtype=_resolve(_f32_dtype if dtype == "f32" else _f16_dtype),
     )
+    my_group = lane_in_warp // threads
 
+    for g in range(groups):
+        in_group = my_group == g
+        masked = scalar.select(in_group, x, c_identity)
+        reduced = redux_fn(masked)
+        x = scalar.select(in_group, reduced, x)
+    return x
 
-# ── compile-time parameter tables ──────────────────────────────────────────
 
-_IDENTITY = {
-    "f32": 0.0,
-    "f16": 0.0,
-}
-"""Identity element for sum reduction (0.0 for both f32 and f16)."""
-
-_REDUX_OP = _pto.ReduxAddOp
-"""Reduction operator (hardware redux_add)."""
-
-
-# ── scratch validation ────────────────────────────────────────────────────
-
-def _validate_scratch(scratch, expected_mlir_type, *, context: str):
-    """Verify *scratch* is a ``!pto.ptr<expected_mlir_type, ub>`` buffer."""
-    raw_scratch = unwrap_surface_value(scratch)
-    try:
-        ptr_type = _pto.PtrType(raw_scratch.type)
-    except Exception:
-        raise TypeError(
-            f"all_reduce {context}: scratch must be a !pto.ptr buffer, "
-            f"got {raw_scratch.type}"
-        ) from None
-    vec_attr = _pto.AddressSpaceAttr.get(_pto.AddressSpace.VEC)
-    if ptr_type.memory_space != vec_attr:
-        raise TypeError(
-            f"all_reduce {context}: scratch must be in UB memory space, "
-            f"got {ptr_type.memory_space}"
-        )
-    if ptr_type.element_type != expected_mlir_type:
-        raise TypeError(
-            f"all_reduce {context}: scratch element type mismatch: "
-            f"expected {expected_mlir_type}, got {ptr_type.element_type}"
-        )
+# ── warp_reduce  ───────────────────────────────────────────────────────────────
+
+def _emit_warp_reduce(x, *,
+                      dtype, threads, scale, thread_offset, reducer):
+    """Single-warp all-reduce."""
+    extent = threads // scale
+    if extent <= 1:
+        return x
 
+    if thread_offset:
+        lane_in_warp = (get_tid_x() - thread_offset) & 31
+    else:
+        lane_in_warp = get_laneid()
 
-# ── shared helper-emission utility ─────────────────────────────────────────
+    if extent >= 16 and scale == 1:
+        return _emit_warp_hw_reduce(
+            x, threads=threads,
+            lane_in_warp=lane_in_warp, dtype=dtype, reducer=reducer,
+        )
+    return _emit_butterfly(x, threads=threads, scale=scale, reducer=reducer)
 
-def _invoke_helper(helper_name, emit_fn, *surface_args):
-    """Look up or lazily create *helper_name*, then ``func.call`` it.
 
-    *emit_fn(helper_fn)* is called exactly once per trace session — on the
-    first invocation for this *helper_name*.
-    """
-    session = require_active_session("simt_allreduce_sum")
-    raw_args = [unwrap_surface_value(a) for a in surface_args]
-    arg_types = tuple(a.type for a in raw_args)
+# ── cross_warp_reduce  ─────────────────────────────────────────────────────────
 
-    helper_spec = HelperFunctionSpec(
-        symbol_name=helper_name,
-        arg_types=arg_types,
-        result_types=(arg_types[0],),
-        attributes=(("pto.simt_entry", UnitAttr.get()),),
+def _emit_cross_warp_reduce(x, scratch, *,
+                            dtype, threads, scale, thread_offset, reducer):
+    """Cross-warp all-reduce (threads > 32)."""
+    num_warps = threads // 32
+    c_identity = _const(
+        _REDUCER_IDENTITY[reducer][dtype],
+        dtype=_resolve(_f32_dtype if dtype == "f32" else _f16_dtype),
     )
-    helper_fn, created = session.get_or_create_helper_function(helper_spec)
-    if created:
-        emit_fn(helper_fn)
-    call = func.CallOp(helper_fn, raw_args)
-    return wrap_surface_value(call.result)
+    combine = _REDUCER_COMBINE[reducer]
+    redux_fn = _REDUCER_REDUX[reducer]
 
+    # ── thread indexing ──────────────────────────────────────────────────
+    tid_x = get_tid_x()
+    if thread_offset:
+        tx = tid_x - thread_offset
+        wid = tx // 32
+        lid = tx & 31
+    else:
+        tx = tid_x
+        wid = tx // 32
+        lid = get_laneid()
 
-# ── reduction operator application ─────────────────────────────────────────
+    # ── per-warp reduce ──────────────────────────────────────────────────
+    if scale == 1:
+        warp_val = redux_fn(x)
+    else:
+        warp_val = _emit_butterfly(x, threads=32, scale=scale, reducer=reducer)
 
-def _emit_store(buffer, offset, value):
-    """Emit ``pto.store`` — accepts Ptr and any MemRef (including UB/VEC).
+    # ── warp leaders write partial results ───────────────────────────────
+    is_writer = lid < scale
+    with if_(is_writer) as br:
+        with br.then_:
+            slot = wid * scale + lid
+            scalar.store(warp_val, scratch, scalar.index_cast(slot))
 
-    Unlike ``pto.store_scalar`` (which rejects VEC memrefs), ``pto.store``
-    uses ``PTO_BufferLikeType`` and survives the Ptr→MemRef type conversion
-    pass during lowering.
-    """
-    Operation.create(
-        "pto.store",
-        operands=[buffer, offset, value],
-    )
+    syncthreads()
 
+    # ── leader warp reduces partial sums ─────────────────────────────────
+    is_leader_warp = tx < 32
+    with if_(is_leader_warp) as br:
+        with br.then_:
+            if scale == 1:
+                need_load = lid < num_warps
+                with if_(need_load) as inner_br:
+                    with inner_br.then_:
+                        tmp = scalar.load(scratch, scalar.index_cast(lid))
+                        inner_br.assign(loaded=tmp)
+                    with inner_br.else_:
+                        inner_br.assign(loaded=c_identity)
+                loaded = inner_br.loaded
+                stage4_result = redux_fn(loaded)
+            elif scale * num_warps <= 32:
+                total = scale * num_warps
+                need_load = lid < total
+                with if_(need_load) as inner_br:
+                    with inner_br.then_:
+                        tmp = scalar.load(scratch, scalar.index_cast(lid))
+                        inner_br.assign(loaded=tmp)
+                    with inner_br.else_:
+                        inner_br.assign(loaded=c_identity)
+                loaded = inner_br.loaded
+                stage4_result = _emit_butterfly(
+                    loaded, threads=total, scale=scale, reducer=reducer,
+                )
+            else:
+                is_reducer = lid < scale
+                reduced = c_identity
+                my_slot = lid % scale
+                for w in range(num_warps):
+                    idx_val = w * scale + my_slot
+                    loaded_v = scalar.load(scratch, scalar.index_cast(idx_val))
+                    reduced = combine(reduced, loaded_v)
+                stage4_result = scalar.select(is_reducer, reduced, c_identity)
 
-def _emit_load(result_type, buffer, offset):
-    """Emit ``pto.load`` — accepts Ptr and any MemRef (including UB/VEC).
+            br.assign(stage4_result=stage4_result)
+        with br.else_:
+            br.assign(stage4_result=c_identity)
 
-    Counterpart to ``_emit_store``.  Returns the loaded SSA value.
-    """
-    return Operation.create(
-        "pto.load",
-        results=[result_type],
-        operands=[buffer, offset],
-    ).results[0]
+    partial_reduced = br.stage4_result
 
+    # ── global leader writes result ──────────────────────────────────────
+    is_global_leader = tx < scale
+    with if_(is_global_leader) as br5:
+        with br5.then_:
+            scalar.store(partial_reduced, scratch, scalar.index_cast(tx))
 
-def _apply_sum(a, b):
-    """Emit ``a = a + b`` (float addition)."""
-    return arith.AddFOp(a, b).result
+    # ── broadcast ────────────────────────────────────────────────────────
+    syncthreads()
+    result = scalar.load(scratch, scalar.index_cast(tx % scale))
+    syncthreads()
 
+    return result
 
-def _emit_butterfly(v, *, threads: int, scale: int):
-    """Emit unrolled butterfly shuffle reduce.
 
-    Implements::
+# ── ub_reduce  ─────────────────────────────────────────────────────────────────
 
-        cur = threads
-        while cur > scale:
-            x = op(x, shfl_xor(x, cur/2))
-            cur /= 2
+def _emit_ub_reduce(x, scratch, *,
+                    dtype, threads, scale, thread_offset, reducer):
+    """UB-scratch all-reduce (fallback for non-pow2 or general case)."""
+    combine = _REDUCER_COMBINE[reducer]
 
-    All loops are unrolled at emission time.  Caller must have set the
-    insertion point.
-    """
-    i32 = IntegerType.get_signless(32)
-    cur = threads
-    while cur > scale:
-        offset = cur // 2
-        c_offset = arith.ConstantOp(i32, offset).result
-        shfl = _pto.ShuffleBflyOp(v, c_offset).result
-        v = _apply_sum(v, shfl)
-        cur //= 2
-    return v
+    # ── thread indexing ──────────────────────────────────────────────────
+    tid_x = get_tid_x()
+    tx = (tid_x - thread_offset) if thread_offset else tid_x
+    group = tx // threads
+    lane = tx % threads
 
+    # ── each lane writes x → scratch[tx] ─────────────────────────────────
+    scalar.store(x, scratch, scalar.index_cast(tx))
+    syncthreads()
 
-def _emit_warp_hw_reduce(x, *, threads: int,
-                         lane_in_warp, c_identity, i32):
-    """Emit warp-level hardware reduce.
+    # ── reducers sequentially combine ────────────────────────────────────
+    is_reducer = lane < scale
+    with if_(is_reducer) as br:
+        with br.then_:
+            group_offset = group * threads
+            first_elem = group_offset + lane
+            acc = scalar.load(scratch, scalar.index_cast(first_elem))
 
-    When *threads* == 32 ("groups" == 1): a single ``pto.redux_*``.
-    When *threads* < 32 ("groups" > 1): one ``pto.redux_*`` per group,
-    with identity masking for lanes outside the group.
+            carry_loop = for_(scale, threads, step=scale).carry(acc=acc)
+            with carry_loop:
+                prev = carry_loop.acc
+                elem = first_elem + carry_loop.iv
+                loaded = scalar.load(scratch, elem)
+                carry_loop.update(acc=combine(prev, loaded))
+            acc = carry_loop.final("acc")
 
-    Caller must have set the insertion point.
-    """
-    groups = 32 // threads
+            br.assign(flag=acc)
+        with br.else_:
+            br.assign(flag=x)
 
-    if groups == 1:
-        return _REDUX_OP(x).result
+    flag = br.flag
+    syncthreads()
 
-    c_threads = arith.ConstantOp(i32, threads).result
-    my_group = arith.DivUIOp(lane_in_warp, c_threads).result
+    # ── per-class leader writes back ─────────────────────────────────────
+    is_leader = lane < scale
+    with if_(is_leader) as br5:
+        with br5.then_:
+            scalar.store(flag, scratch, scalar.index_cast(group * threads + lane))
 
-    for g in range(groups):
-        c_g = arith.ConstantOp(i32, g).result
-        in_group = arith.CmpIOp(arith.CmpIPredicate.eq, my_group, c_g).result
-        masked = arith.SelectOp(in_group, x, c_identity).result
-        reduced = _REDUX_OP(masked).result
-        x = arith.SelectOp(in_group, reduced, x).result
-    return x
+    # ── broadcast ────────────────────────────────────────────────────────
+    syncthreads()
+    result = scalar.load(scratch, scalar.index_cast(group * threads + (tx % scale)))
+    syncthreads()
 
+    return result
 
-# ═══════════════════════════════════════════════════════════════════════════════
-# public API
-# ═══════════════════════════════════════════════════════════════════════════════
-
-def simt_allreduce_sum(value, scratch=None, *,
-               threads: int,
-               scale: int = 1,
-               thread_offset: int = 0):
-    """Cross-workitem all-reduce for SIMT VF context.
-
-    Dispatch logic mirrors the compile-time tree in
-    ``AscendAllReduce<Reducer, threads, scale, thread_offset>::run()``.
-
-    Args:
-        value: Lane-local scalar (f32 or f16).
-        threads: Number of workitems.  Must satisfy ``threads % scale == 0``.
-        scale: Scale factor (must divide *threads*).  Defaults to 1.
-        thread_offset: Thread offset.  Defaults to 0.
-        scratch: UB scratch buffer (``!pto.ptr<dtype, ub>``).  Required for
-            ``cross_warp_reduce`` and ``ub_reduce`` paths.  Defaults to None.
-
-    Returns:
-        Lane-uniform scalar (same type as *value*) — the reduced sum.
-    """
-    return _dispatch_allreduce_helper(
-        value, scratch=scratch,
-        threads=threads, scale=scale, thread_offset=thread_offset,
-    )
 
+# ── public API  ────────────────────────────────────────────────────────────────
 
-def _dispatch_allreduce_helper(value, *, scratch,
-                                threads, scale, thread_offset):
-    # ── parameter validation (before identity shortcut) ───────────────────
+def _check_params(*, threads, scale, thread_offset):
+    """Validate allreduce parameters (compile-time checks)."""
     for name, val in (("threads", threads), ("scale", scale),
                        ("thread_offset", thread_offset)):
         if not isinstance(val, int):
@@ -271,497 +293,63 @@ def _dispatch_allreduce_helper(value, *, scratch,
             f"got threads={threads}, scale={scale}"
         )
 
-    # ── Path 0: identity ──────────────────────────────────────────────────
+
+def _simt_allreduce(value, *, threads, scale, thread_offset, scratch, reducer):
+    """Unified allreduce dispatch tree."""
+    _check_params(threads=threads, scale=scale, thread_offset=thread_offset)
+
     if threads <= scale:
         return value
 
-    # ── dtype validation ─────────────────────────────────────────────────
     raw_value = unwrap_surface_value(value)
-    dtype = _dtype_to_str(raw_value.type)
-    if dtype not in ("f32", "f16"):
-        raise NotImplementedError(
-            f"all_reduce only supports f32/f16, got {dtype}"
-        )
+    if raw_value.type == F32Type.get():
+        dtype = "f32"
+    elif raw_value.type == F16Type.get():
+        dtype = "f16"
+    else:
+        raise NotImplementedError(f"all_reduce: unsupported dtype {raw_value.type}")
 
-    name = _helper_name(dtype, threads, scale, thread_offset)
     args = dict(dtype=dtype, threads=threads, scale=scale,
-                thread_offset=thread_offset)
+                thread_offset=thread_offset, reducer=reducer)
 
-    # ── Path 1: warp_reduce ───────────────────────────────────────────────
     if threads <= 32 and _is_pow2(threads) and _is_pow2(scale):
-        return _invoke_helper(
-            name,
-            lambda hf: _emit_warp_reduce(hf, **args),
-            value,
-        )
+        return _emit_warp_reduce(value, **args)
 
-    # ── All paths below require a scratch buffer ──────────────────────────
     if scratch is None:
         raise ValueError(
-            f"all_reduce sum/{dtype}/t{threads}/s{scale}/o{thread_offset} "
+            f"all_reduce {reducer}/{dtype}/t{threads}/s{scale}/o{thread_offset} "
             "requires a UB scratch buffer"
         )
-    _validate_scratch(
-        scratch, raw_value.type,
-        context=f"sum/{dtype}/t{threads}/s{scale}/o{thread_offset}",
-    )
 
-    # ── Path 2: ub_reduce (threads ≤ 32, non-pow2) ──────────────────────
     if threads <= 32:
-        return _invoke_helper(
-            name,
-            lambda hf: _emit_ub_reduce(hf, **args),
-            value, scratch,
-        )
+        return _emit_ub_reduce(value, scratch, **args)
 
-    # ── Path 3: cross_warp_reduce ────────────────────────────────────────
     if scale <= 32 and _is_pow2(threads) and _is_pow2(scale):
-        return _emit_cross_warp_reduce_inline(
-            raw_value, unwrap_surface_value(scratch), **args,
-        )
+        return _emit_cross_warp_reduce(value, scratch, **args)
 
-    # ── Path 4: ub_reduce fallback (threads > 32, anything else) ─────────
-    return _invoke_helper(
-        name,
-        lambda hf: _emit_ub_reduce(hf, **args),
-        value, scratch,
-    )
+    return _emit_ub_reduce(value, scratch, **args)
 
 
-def _emit_cross_warp_reduce_inline(x, scratch, *,
-                                   dtype, threads, scale, thread_offset):
-    """Emit cross-warp all-reduce directly at the current insertion point."""
-    num_warps = threads // 32
-    scalar_t = _mlir_scalar_type(dtype)
-    identity_val = _IDENTITY[dtype]
-
-    i32 = IntegerType.get_signless(32)
-    idx_t = IndexType.get()
-
-    c0_i32 = arith.ConstantOp(i32, 0).result
-    c5_i32 = arith.ConstantOp(i32, 5).result
-    c31_i32 = arith.ConstantOp(i32, 31).result
-    c32_i32 = arith.ConstantOp(i32, 32).result
-    c_scale = arith.ConstantOp(i32, scale).result
-    c_num_warps = arith.ConstantOp(i32, num_warps).result
-    c_offset = arith.ConstantOp(i32, thread_offset).result
-    c_identity = arith.ConstantOp(scalar_t, identity_val).result
-
-    tid_x = _pto.GetTidXOp().result
-    if thread_offset:
-        tx = arith.SubIOp(tid_x, c_offset).result
-        wid = arith.ShRUIOp(tx, c5_i32).result
-        lid = arith.AndIOp(tx, c31_i32).result
-    else:
-        tx = tid_x
-        wid = arith.ShRUIOp(tx, c5_i32).result
-        lid = _pto.GetLaneIdOp().result
+def simt_allreduce_sum(value, *, threads, scale=1, thread_offset=0, scratch=None):
+    """Sum reduce across SIMT work-items."""
+    return _simt_allreduce(value, threads=threads, scale=scale,
+                           thread_offset=thread_offset, scratch=scratch, reducer="sum")
 
-    if scale == 1:
-        warp_val = _REDUX_OP(x).result
-    else:
-        warp_val = _emit_butterfly(
-            x, threads=32, scale=scale,
-        )
-
-    is_writer = arith.CmpIOp(arith.CmpIPredicate.ult, lid, c_scale).result
-    write_if = scf.IfOp(is_writer, hasElse=False)
-    with InsertionPoint(write_if.then_block):
-        slot = arith.AddIOp(
-            arith.MulIOp(wid, c_scale).result, lid).result
-        slot_idx = arith.IndexCastOp(idx_t, slot).result
-        _emit_store(scratch, slot_idx, warp_val)
-        scf.YieldOp([])
-
-    _pto.SyncthreadsOp()
-
-    is_leader_warp = arith.CmpIOp(
-        arith.CmpIPredicate.ult, tx, c32_i32).result
-    outer_if = scf.IfOp(is_leader_warp, [scalar_t], hasElse=True)
-
-    with InsertionPoint(outer_if.then_block):
-        if scale == 1:
-            need_load = arith.CmpIOp(
-                arith.CmpIPredicate.ult, lid, c_num_warps).result
-            inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
-            with InsertionPoint(inner_if.then_block):
-                lid_idx = arith.IndexCastOp(idx_t, lid).result
-                tmp = _emit_load(scalar_t, scratch, lid_idx)
-                scf.YieldOp([tmp])
-            with InsertionPoint(inner_if.else_block):
-                scf.YieldOp([c_identity])
-            loaded = inner_if.results[0]
-            stage4_result = _REDUX_OP(loaded).result
-        elif scale * num_warps <= 32:
-            total = scale * num_warps
-            c_total = arith.ConstantOp(i32, total).result
-            need_load = arith.CmpIOp(
-                arith.CmpIPredicate.ult, lid, c_total).result
-            inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
-            with InsertionPoint(inner_if.then_block):
-                lid_idx = arith.IndexCastOp(idx_t, lid).result
-                tmp = _emit_load(scalar_t, scratch, lid_idx)
-                scf.YieldOp([tmp])
-            with InsertionPoint(inner_if.else_block):
-                scf.YieldOp([c_identity])
-            loaded = inner_if.results[0]
-            stage4_result = _emit_butterfly(
-                loaded,
-                threads=total, scale=scale,
-            )
-        else:
-            is_reducer = arith.CmpIOp(
-                arith.CmpIPredicate.ult, lid, c_scale).result
-            result = c_identity
-            my_slot = arith.RemUIOp(lid, c_scale).result
-            for w in range(num_warps):
-                c_w = arith.ConstantOp(i32, w).result
-                idx_val = arith.AddIOp(
-                    arith.MulIOp(c_w, c_scale).result, my_slot).result
-                slot_idx = arith.IndexCastOp(idx_t, idx_val).result
-                loaded_v = _emit_load(
-                    scalar_t, scratch, slot_idx)
-                result = _apply_sum(result, loaded_v)
-            stage4_result = arith.SelectOp(
-                is_reducer, result, c_identity).result
-
-        scf.YieldOp([stage4_result])
-
-    with InsertionPoint(outer_if.else_block):
-        scf.YieldOp([c_identity])
-
-    partial_reduced = outer_if.results[0]
-
-    is_global_leader = arith.CmpIOp(
-        arith.CmpIPredicate.ult, tx, c_scale).result
-    write_result_if = scf.IfOp(is_global_leader, hasElse=False)
-    with InsertionPoint(write_result_if.then_block):
-        tx_idx = arith.IndexCastOp(idx_t, tx).result
-        _emit_store(scratch, tx_idx, partial_reduced)
-        scf.YieldOp([])
-
-    _pto.SyncthreadsOp()
-    my_slot = arith.RemUIOp(tx, c_scale).result
-    load_idx = arith.IndexCastOp(idx_t, my_slot).result
-    result = _emit_load(scalar_t, scratch, load_idx)
-
-    _pto.SyncthreadsOp()
-    return wrap_surface_value(result)
-
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# emitter: warp_reduce  (Path 1: threads ≤ 32, pow2, pow2 scale)
-# ═══════════════════════════════════════════════════════════════════════════════
-
-def _emit_warp_reduce(helper_fn, *,
-                      dtype, threads, scale, thread_offset):
-    """Build the body of a single-warp all-reduce helper.
-
-    Dispatches to:
-
-    * ``warp_hw_reduce`` when ``extent >= 16`` and ``scale == 1``
-      (fast hardware redux, with group masking for threads < 32).
-    * ``butterfly`` otherwise (software shuffle via ``pto.shuffle_bfly``).
-    """
-    extent = threads // scale
-    scalar_t = _mlir_scalar_type(dtype)
-    identity_val = _IDENTITY[dtype]
-    i32 = IntegerType.get_signless(32)
-
-    entry = helper_fn.add_entry_block()
-    with InsertionPoint(entry):
-        x = entry.arguments[0]
-
-        c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_identity = arith.ConstantOp(scalar_t, identity_val).result
-
-        if thread_offset:
-            # lane_in_warp = (tid_x - offset) & 31
-            tid_x = _pto.GetTidXOp().result
-            tx = arith.SubIOp(tid_x, c_offset).result
-            lane_in_warp = arith.AndIOp(tx, arith.ConstantOp(i32, 31).result).result
-        else:
-            lane_in_warp = _pto.GetLaneIdOp().result
-
-        if extent >= 16 and scale == 1:
-            result = _emit_warp_hw_reduce(
-                x, threads=threads,
-                lane_in_warp=lane_in_warp, c_identity=c_identity, i32=i32,
-            )
-        else:
-            result = _emit_butterfly(
-                x, threads=threads, scale=scale,
-            )
-
-        func.ReturnOp([result])
-
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# emitter: cross_warp_reduce  (Path 3: threads > 32)
-# ═══════════════════════════════════════════════════════════════════════════════
 
-def _emit_cross_warp_reduce(helper_fn, *,
-                            dtype, threads, scale, thread_offset):
-    """Build the body of a cross-warp all-reduce helper.
+def simt_allreduce_max(value, *, threads, scale=1, thread_offset=0, scratch=None):
+    """Max reduce across SIMT work-items."""
+    return _simt_allreduce(value, threads=threads, scale=scale,
+                           thread_offset=thread_offset, scratch=scratch, reducer="max")
 
-    Algorithm overview:
 
-    1. *num_warps* subgroups of 32 lanes each do a per-warp reduce.
-    2. Warp leaders (lid < scale) write → scratch[wid * scale + lid].
-    3. ``pto.syncthreads``.
-    4. Leader warp (lanes with ``tx < 32``) reduces the partial sums:
-       - scale == 1:  ``hw_reduce`` across leader warp.
-       - scale * num_warps ≤ 32:  ``butterfly<total, scale, 0>``.
-       - otherwise:  manual loop over warps.
-    5. Global leader (tx < scale) writes result → scratch[tx].
-    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
-    7. Extra ``pto.syncthreads`` to fence scratch reuse.
-    """
-    num_warps = threads // 32
-    scalar_t = _mlir_scalar_type(dtype)
-    identity_val = _IDENTITY[dtype]
-
-    i32 = IntegerType.get_signless(32)
-    idx_t = IndexType.get()
-
-    entry = helper_fn.add_entry_block()
-    with InsertionPoint(entry):
-        x = entry.arguments[0]
-        scratch = entry.arguments[1]
-
-        # ── constants ────────────────────────────────────────────────────
-        c0_i32 = arith.ConstantOp(i32, 0).result
-        c5_i32 = arith.ConstantOp(i32, 5).result
-        c31_i32 = arith.ConstantOp(i32, 31).result
-        c32_i32 = arith.ConstantOp(i32, 32).result
-        c_scale = arith.ConstantOp(i32, scale).result
-        c_num_warps = arith.ConstantOp(i32, num_warps).result
-        c_offset = arith.ConstantOp(i32, thread_offset).result
-        c_identity = arith.ConstantOp(scalar_t, identity_val).result
-
-        # ── thread indexing ──────────────────────────────────────────────
-        tid_x = _pto.GetTidXOp().result
-        if thread_offset:
-            tx = arith.SubIOp(tid_x, c_offset).result
-            wid = arith.ShRUIOp(tx, c5_i32).result
-            lid = arith.AndIOp(tx, c31_i32).result
-        else:
-            tx = tid_x
-            wid = arith.ShRUIOp(tx, c5_i32).result
-            lid = _pto.GetLaneIdOp().result
-
-        # ── Stage 1: per-warp reduce ─────────────────────────────────────
-        if scale == 1:
-            warp_val = _REDUX_OP(x).result
-        else:
-            warp_val = _emit_butterfly(
-                x, threads=32, scale=scale,
-            )
-
-        # ── Stage 2: warp leaders write partial results ──────────────────
-        is_writer = arith.CmpIOp(arith.CmpIPredicate.ult, lid, c_scale).result
-        write_if = scf.IfOp(is_writer, hasElse=False)
-        with InsertionPoint(write_if.then_block):
-            slot = arith.AddIOp(
-                arith.MulIOp(wid, c_scale).result, lid).result
-            slot_idx = arith.IndexCastOp(idx_t, slot).result
-            _emit_store(scratch, slot_idx, warp_val)
-            scf.YieldOp([])
-
-        # ── Stage 3: sync before reading partial results ─────────────────
-        _pto.SyncthreadsOp()
-
-        # ── Stage 4: leader warp reduces partial sums ────────────────────
-        is_leader_warp = arith.CmpIOp(
-            arith.CmpIPredicate.ult, tx, c32_i32).result
-        outer_if = scf.IfOp(is_leader_warp, [scalar_t], hasElse=True)
-
-        with InsertionPoint(outer_if.then_block):
-            if scale == 1:
-                # ── scale == 1: hw_reduce across leader warp ────────────
-                need_load = arith.CmpIOp(
-                    arith.CmpIPredicate.ult, lid, c_num_warps).result
-                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
-                with InsertionPoint(inner_if.then_block):
-                    lid_idx = arith.IndexCastOp(idx_t, lid).result
-                    tmp = _emit_load(scalar_t, scratch, lid_idx)
-                    scf.YieldOp([tmp])
-                with InsertionPoint(inner_if.else_block):
-                    scf.YieldOp([c_identity])
-                loaded = inner_if.results[0]
-                stage4_result = _REDUX_OP(loaded).result
-            elif scale * num_warps <= 32:
-                # ── scale > 1, fits in one warp: butterfly ──────────────
-                total = scale * num_warps
-                c_total = arith.ConstantOp(i32, total).result
-                need_load = arith.CmpIOp(
-                    arith.CmpIPredicate.ult, lid, c_total).result
-                inner_if = scf.IfOp(need_load, [scalar_t], hasElse=True)
-                with InsertionPoint(inner_if.then_block):
-                    lid_idx = arith.IndexCastOp(idx_t, lid).result
-                    tmp = _emit_load(scalar_t, scratch, lid_idx)
-                    scf.YieldOp([tmp])
-                with InsertionPoint(inner_if.else_block):
-                    scf.YieldOp([c_identity])
-                loaded = inner_if.results[0]
-                stage4_result = _emit_butterfly(
-                    loaded,
-                    threads=total, scale=scale,
-                )
-            else:
-                # ── manual loop: lid < scale lanes each reduce num_warps
-                is_reducer = arith.CmpIOp(
-                    arith.CmpIPredicate.ult, lid, c_scale).result
-                result = c_identity
-                my_slot = arith.RemUIOp(lid, c_scale).result
-                for w in range(num_warps):
-                    c_w = arith.ConstantOp(i32, w).result
-                    idx_val = arith.AddIOp(
-                        arith.MulIOp(c_w, c_scale).result, my_slot).result
-                    slot_idx = arith.IndexCastOp(idx_t, idx_val).result
-                    loaded_v = _emit_load(
-                        scalar_t, scratch, slot_idx)
-                    result = _apply_sum(result, loaded_v)
-                stage4_result = arith.SelectOp(
-                    is_reducer, result, c_identity).result
-
-            scf.YieldOp([stage4_result])
-
-        with InsertionPoint(outer_if.else_block):
-            scf.YieldOp([c_identity])
-
-        partial_reduced = outer_if.results[0]
-
-        # ── Stage 5: global leader writes result to scratch ──────────────
-        is_global_leader = arith.CmpIOp(
-            arith.CmpIPredicate.ult, tx, c_scale).result
-        write_result_if = scf.IfOp(is_global_leader, hasElse=False)
-        with InsertionPoint(write_result_if.then_block):
-            tx_idx = arith.IndexCastOp(idx_t, tx).result
-            _emit_store(scratch, tx_idx, partial_reduced)
-            scf.YieldOp([])
-
-        # ── Stage 6: sync + broadcast load scratch[tx % scale] ───────────
-        _pto.SyncthreadsOp()
-        my_slot = arith.RemUIOp(tx, c_scale).result
-        load_idx = arith.IndexCastOp(idx_t, my_slot).result
-        result = _emit_load(scalar_t, scratch, load_idx)
-
-        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
-        _pto.SyncthreadsOp()
-
-        func.ReturnOp([result])
-
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# emitter: ub_reduce  (Paths 2 & 4: fallback via UB scratch)
-# ═══════════════════════════════════════════════════════════════════════════════
-
-def _emit_ub_reduce(helper_fn, *,
-                    dtype, threads, scale, thread_offset):
-    """Build the body of a UB-scratch all-reduce helper.
-
-    Algorithm:
-
-    1. Each lane writes x → scratch[tx].
-    2. ``pto.syncthreads``.
-    3. Lanes with ``lane % scale == 0`` sequentially reduce scratch slots.
-    4. ``pto.syncthreads``.
-    5. Global leader (lane % scale == 0, lane / scale == 0) writes back.
-    6. ``pto.syncthreads`` + broadcast: each lane reads scratch[tx % scale].
-    7. ``pto.syncthreads`` to fence scratch reuse.
-    """
-    scalar_t = _mlir_scalar_type(dtype)
-    i32 = IntegerType.get_signless(32)
-    idx_t = IndexType.get()
-
-    entry = helper_fn.add_entry_block()
-    with InsertionPoint(entry):
-        x = entry.arguments[0]
-        scratch = entry.arguments[1]
-
-        # ── constants ────────────────────────────────────────────────────
-        c0_i32 = arith.ConstantOp(i32, 0).result
-        c_threads = arith.ConstantOp(i32, threads).result
-        c_scale = arith.ConstantOp(i32, scale).result
-        c_offset = arith.ConstantOp(i32, thread_offset).result
-
-        # ── thread indexing ──────────────────────────────────────────────
-        tid_x = _pto.GetTidXOp().result
-        tx = arith.SubIOp(tid_x, c_offset).result if thread_offset else tid_x
-        group = arith.DivUIOp(tx, c_threads).result
-        lane = arith.RemUIOp(tx, c_threads).result
-        lane_mod = arith.RemUIOp(lane, c_scale).result
-
-        # ── Stage 1: each lane writes x → scratch[tx] ───────────────────
-        tx_idx = arith.IndexCastOp(idx_t, tx).result
-        _emit_store(scratch, tx_idx, x)
-
-        # ── Stage 2: sync ────────────────────────────────────────────────
-        _pto.SyncthreadsOp()
-
-        # ── Stage 3: reducers sequentially combine ───────────────────────
-        # lane < scale gives exactly one reducer per residue class
-        is_reducer = arith.CmpIOp(
-            arith.CmpIPredicate.ult, lane, c_scale).result
-        reduce_if = scf.IfOp(is_reducer, [scalar_t], hasElse=True)
-
-        with InsertionPoint(reduce_if.then_block):
-            # initial: load scratch[group * threads + lane]
-            group_offset = arith.MulIOp(group, c_threads).result
-            first_elem = arith.AddIOp(group_offset, lane).result
-            first_idx = arith.IndexCastOp(idx_t, first_elem).result
-            acc = _emit_load(scalar_t, scratch, first_idx)
-
-            # scf.for i = scale to threads step scale
-            lb = arith.ConstantOp(idx_t, scale).result
-            ub = arith.ConstantOp(idx_t, threads).result
-            step = arith.ConstantOp(idx_t, scale).result
-            for_op = scf.ForOp(lb, ub, step, [acc])
-            with InsertionPoint(for_op.body):
-                i = for_op.induction_variable
-                prev = for_op.inner_iter_args[0]
-                elem = arith.AddIOp(first_idx, i).result
-                loaded = _emit_load(
-                    scalar_t, scratch, elem)
-                new_acc = _apply_sum(prev, loaded)
-                scf.YieldOp([new_acc])
-            scf.YieldOp([for_op.results[0]])
-
-        with InsertionPoint(reduce_if.else_block):
-            scf.YieldOp([x])
-
-        flag = reduce_if.results[0]
-
-        # ── Stage 4: sync ────────────────────────────────────────────────
-        _pto.SyncthreadsOp()
-
-        # ── Stage 5: per-class leader writes reduced value ───────────────
-        # leader lanes 0..scale-1 each write their residue class result
-        is_leader = arith.CmpIOp(
-            arith.CmpIPredicate.ult, lane, c_scale).result
-        write_if = scf.IfOp(is_leader, hasElse=False)
-        with InsertionPoint(write_if.then_block):
-            dst_offset = arith.AddIOp(
-                arith.MulIOp(group, c_threads).result, lane).result
-            dst_idx = arith.IndexCastOp(idx_t, dst_offset).result
-            _emit_store(scratch, dst_idx, flag)
-            scf.YieldOp([])
-
-        # ── Stage 6: sync + broadcast scratch[group*threads + tx%scale] ──
-        _pto.SyncthreadsOp()
-        my_slot = arith.AddIOp(
-            arith.MulIOp(group, c_threads).result,
-            arith.RemUIOp(tx, c_scale).result).result
-        load_idx = arith.IndexCastOp(idx_t, my_slot).result
-        result = _emit_load(scalar_t, scratch, load_idx)
-
-        # ── Stage 7: extra sync to fence scratch reuse ───────────────────
-        _pto.SyncthreadsOp()
-
-        func.ReturnOp([result])
+def simt_allreduce_min(value, *, threads, scale=1, thread_offset=0, scratch=None):
+    """Min reduce across SIMT work-items."""
+    return _simt_allreduce(value, threads=threads, scale=scale,
+                           thread_offset=thread_offset, scratch=scratch, reducer="min")
 
 
 __all__ = [
     "simt_allreduce_sum",
+    "simt_allreduce_max",
+    "simt_allreduce_min",
 ]
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index d870cc3d5c..b499ea1055 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -145,7 +145,7 @@
 )
 
 # ── All-reduce ─────────────────────────────────────────────────────────────────
-from ._allreduce import simt_allreduce_sum  # noqa: F401
+from ._allreduce import simt_allreduce_max, simt_allreduce_min, simt_allreduce_sum  # noqa: F401
 
 # ── Decorator ─────────────────────────────────────────────────────────────────
 from ._jit import jit, KernelHandle, merge_jit_modules      # noqa: F401
diff --git a/ptodsl/tests/test_allreduce.py b/ptodsl/tests/test_allreduce.py
index f9262bda0c..3b8ca120b2 100644
--- a/ptodsl/tests/test_allreduce.py
+++ b/ptodsl/tests/test_allreduce.py
@@ -21,19 +21,7 @@ def expect(condition: bool, message: str) -> None:
 
 
 def main():
-    from ptodsl._allreduce import _helper_name, simt_allreduce_sum
-
-    # ══════════════════════════════════════════════════════════════════════════
-    # helper name format
-    # ══════════════════════════════════════════════════════════════════════════
-    expect(
-        _helper_name("f32", 128, 1, 0) == "__tl_allreduce_sum_f32_t128_s1_o0",
-        "helper name format (sum/f32/t128/s1/o0)",
-    )
-    expect(
-        _helper_name("f16", 32, 2, 4) == "__tl_allreduce_sum_f16_t32_s2_o4",
-        "helper name format (f16/t32/s2/o4)",
-    )
+    from ptodsl._allreduce import simt_allreduce_sum, simt_allreduce_max, simt_allreduce_min
 
     # ══════════════════════════════════════════════════════════════════════════
     # Path 0: identity (threads <= scale)
@@ -100,8 +88,6 @@ def kernel_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp = kernel_warp.compile()
     mlir_warp = compiled_warp.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t32_s1_o0" in mlir_warp,
-           "IR: warp_reduce helper name")
     expect("pto.redux_add" in mlir_warp,
            "IR: redux_add in warp_reduce helper")
     expect("pto.syncthreads" not in mlir_warp,
@@ -124,8 +110,6 @@ def kernel_warp_t16(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_t16 = kernel_warp_t16.compile()
     mlir_warp_t16 = compiled_warp_t16.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o0" in mlir_warp_t16,
-           "IR: warp_reduce t=16 helper name")
     expect("pto.redux_add" in mlir_warp_t16,
            "IR: redux_add for groups>1")
     expect("arith.select" in mlir_warp_t16,
@@ -148,8 +132,6 @@ def kernel_warp_t8(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_t8 = kernel_warp_t8.compile()
     mlir_warp_t8 = compiled_warp_t8.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t8_s1_o0" in mlir_warp_t8,
-           "IR: warp_reduce t=8 butterfly helper name (sum)")
     expect("pto.shuffle_bfly" in mlir_warp_t8,
            "IR: shuffle_bfly for butterfly path")
     expect("pto.redux_add" not in mlir_warp_t8,
@@ -172,8 +154,6 @@ def kernel_warp_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_s2 = kernel_warp_s2.compile()
     mlir_warp_s2 = compiled_warp_s2.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t32_s2_o0" in mlir_warp_s2,
-           "IR: warp_reduce s=2 butterfly helper name (sum)")
     expect("pto.shuffle_bfly" in mlir_warp_s2,
            "IR: shuffle_bfly for butterfly (scale>1)")
     expect("pto.redux_add" not in mlir_warp_s2,
@@ -191,8 +171,6 @@ def kernel_warp_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
 
     compiled_warp_o4 = kernel_warp_o4.compile()
     mlir_warp_o4 = compiled_warp_o4.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t16_s1_o4" in mlir_warp_o4,
-           "IR: warp_reduce o=4 helper name")
     expect("pto.get_tid_x" in mlir_warp_o4,
            "IR: warp_reduce o=4 uses get_tid_x (not raw get_laneid)")
     expect("arith.subi" in mlir_warp_o4,
@@ -211,12 +189,10 @@ def kernel_ub6(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=6, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1)
 
     compiled_ub6 = kernel_ub6.compile()
     mlir_ub6 = compiled_ub6.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o0" in mlir_ub6,
-           "IR: ub_reduce t=6 helper name")
     expect("pto.syncthreads" in mlir_ub6,
            "IR: ub_reduce has syncthreads")
     expect("pto.store" in mlir_ub6,
@@ -235,12 +211,10 @@ def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=6, scale=2)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=2)
 
     compiled_ub6s2 = kernel_ub6s2.compile()
     mlir_ub6s2 = compiled_ub6s2.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t6_s2_o0" in mlir_ub6s2,
-           "IR: ub_reduce t=6 s=2 helper name")
     expect("pto.syncthreads" in mlir_ub6s2,
            "IR: ub_reduce t=6 s=2 has syncthreads")
     expect("pto.store" in mlir_ub6s2,
@@ -254,8 +228,8 @@ def kernel_ub6s2(scratch_gm: pto.ptr(pto.f32, "gm")):
     expect("pto.shuffle_bfly" not in mlir_ub6s2,
            "IR: ub_reduce t=6 s=2 has no butterfly shuffle")
     # scale>1 fixes: reducer uses lane < scale (ult), not lane_mod == 0
-    expect("arith.cmpi ult" in mlir_ub6s2,
-           "IR: ub_reduce t=6 s=2 reducer uses ult (lane < scale)")
+    expect("arith.cmpi slt" in mlir_ub6s2 or "arith.cmpi ult" in mlir_ub6s2,
+           "IR: ub_reduce t=6 s=2 reducer uses lane < scale")
     compiled_ub6s2.verify()
 
     # ── ub_reduce: sum, f32, t=6, s=1, o=4 (non-zero thread_offset) ─────────
@@ -265,13 +239,11 @@ def kernel_ub_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=6, scale=1,
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=6, scale=1,
                                      thread_offset=4)
 
     compiled_ub_o4 = kernel_ub_o4.compile()
     mlir_ub_o4 = compiled_ub_o4.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t6_s1_o4" in mlir_ub_o4,
-           "IR: ub_reduce o=4 helper name")
     expect("arith.subi" in mlir_ub_o4,
            "IR: ub_reduce o=4 uses subi for tx = tid_x - offset")
     compiled_ub_o4.verify()
@@ -286,17 +258,13 @@ def kernel_128(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
 
     compiled = kernel_128.compile()
     mlir = compiled.mlir_text()
 
-    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
-           "IR: helper function definition")
     expect("pto.simt_entry" in mlir,
            "IR: helper carries pto.simt_entry")
-    expect("call @__tl_allreduce_sum_f32_t128_s1_o0" in mlir,
-           "IR: func.call to helper")
 
     for op_name in (
         "pto.redux_add", "pto.syncthreads", "pto.store", "pto.load",
@@ -317,12 +285,10 @@ def kernel_64(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=64, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=64, scale=1)
 
     compiled_64 = kernel_64.compile()
     mlir_64 = compiled_64.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t64_s1_o0" in mlir_64,
-           "IR: helper for t=64")
     compiled_64.verify()
 
     # ── cross_warp: sum, f32, t=256 (8 warps) ───────────────────────────────
@@ -332,12 +298,10 @@ def kernel_256(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=256, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=256, scale=1)
 
     compiled_256 = kernel_256.compile()
     mlir_256 = compiled_256.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t256_s1_o0" in mlir_256,
-           "IR: helper for t=256")
     compiled_256.verify()
 
     # ══════════════════════════════════════════════════════════════════════════
@@ -351,12 +315,10 @@ def kernel_cw_s2(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=2)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=2)
 
     compiled_cw_s2 = kernel_cw_s2.compile()
     mlir_cw_s2 = compiled_cw_s2.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t128_s2_o0" in mlir_cw_s2,
-           "IR: cross_warp s=2 helper name")
     expect("pto.shuffle_bfly" in mlir_cw_s2,
            "IR: cross_warp s=2 has shuffle_bfly (butterfly for per-warp + leader)")
     expect("pto.syncthreads" in mlir_cw_s2,
@@ -375,12 +337,10 @@ def kernel_cw_s16(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=16)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=16)
 
     compiled_cw_s16 = kernel_cw_s16.compile()
     mlir_cw_s16 = compiled_cw_s16.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t128_s16_o0" in mlir_cw_s16,
-           "IR: cross_warp s=16 manual helper name")
     expect("pto.syncthreads" in mlir_cw_s16,
            "IR: cross_warp s=16 has syncthreads")
     compiled_cw_s16.verify()
@@ -392,13 +352,11 @@ def kernel_cw_o4(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=128, scale=1,
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1,
                                      thread_offset=4)
 
     compiled_cw_o4 = kernel_cw_o4.compile()
     mlir_cw_o4 = compiled_cw_o4.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t128_s1_o4" in mlir_cw_o4,
-           "IR: cross_warp o=4 helper name")
     expect("pto.get_tid_x" in mlir_cw_o4,
            "IR: cross_warp o=4 uses get_tid_x")
     expect("arith.subi" in mlir_cw_o4,
@@ -416,12 +374,10 @@ def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_scratch, threads=48, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=48, scale=1)
 
     compiled_ub48 = kernel_ub48.compile()
     mlir_ub48 = compiled_ub48.mlir_text()
-    expect("func.func @__tl_allreduce_sum_f32_t48_s1_o0" in mlir_ub48,
-           "IR: ub_reduce fallback t=48 helper name")
     expect("pto.syncthreads" in mlir_ub48,
            "IR: ub_reduce fallback has syncthreads")
     expect("pto.store" in mlir_ub48,
@@ -431,7 +387,6 @@ def kernel_ub48(scratch_gm: pto.ptr(pto.f32, "gm")):
     compiled_ub48.verify()
 
     # ══════════════════════════════════════════════════════════════════════════
-    # helper deduplication across multiple calls
     # ══════════════════════════════════════════════════════════════════════════
 
     @pto.jit(target="a5")
@@ -440,18 +395,13 @@ def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
         with pto.simt():
             x1 = pto.const(1.0, dtype=pto.f32)
-            _r1 = pto.simt_allreduce_sum(x1, ub_scratch, threads=128, scale=1)
+            _r1 = pto.simt_allreduce_sum(x1, scratch=ub_scratch, threads=128, scale=1)
             x2 = pto.const(2.0, dtype=pto.f32)
-            _r2 = pto.simt_allreduce_sum(x2, ub_scratch, threads=128, scale=1)
+            _r2 = pto.simt_allreduce_sum(x2, scratch=ub_scratch, threads=128, scale=1)
 
     compiled2 = kernel_reuse.compile()
     mlir2 = compiled2.mlir_text()
 
-    definitions = mlir2.count("func.func @__tl_allreduce_sum_f32_t128_s1_o0")
-    expect(definitions == 1,
-           f"IR: helper defined {definitions} times, expected 1")
-    calls = mlir2.count("call @__tl_allreduce_sum_f32_t128_s1_o0")
-    expect(calls == 2, f"IR: expected 2 call sites, got {calls}")
     compiled2.verify()
 
 
@@ -465,7 +415,7 @@ def kernel_reuse(scratch_gm: pto.ptr(pto.f32, "gm")):
     def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, None, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=128, scale=1)
 
     try:
         kernel_no_scratch_cw.compile()
@@ -479,7 +429,7 @@ def kernel_no_scratch_cw(scratch_gm: pto.ptr(pto.f32, "gm")):
     def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, None, threads=6, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=None, threads=6, scale=1)
 
     try:
         kernel_no_scratch_ub.compile()
@@ -488,26 +438,34 @@ def kernel_no_scratch_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
         expect("requires a UB scratch buffer" in str(e),
                f"error message should mention scratch (ub_reduce), got: {e}")
 
-    # scratch must be a pto.ptr type
+    # scratch must be a pto.ptr type — PTODSL scalar.load/store catch this
+    @pto.jit(target="a5")
+    def kernel_non_ptr():
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            not_ptr = pto.const(0, dtype=pto.i32)
+            _result = pto.simt_allreduce_sum(x, scratch=not_ptr, threads=6, scale=1)
+
     try:
-        simt_allreduce_sum(1.0, "not_a_ptr", threads=6, scale=1)
-        raise AssertionError("expected TypeError for non-ptr scratch")
-    except (TypeError, AttributeError):
-        pass
+        kernel_non_ptr.compile()
+        raise AssertionError("expected error for non-ptr scratch")
+    except Exception:
+        pass  # PTODSL scalar.store / resolve_address_access catches this
 
     # cross_warp: gm scratch (wrong memory space) should be rejected
     @pto.jit(target="a5")
     def kernel_gm_scratch(scratch_gm: pto.ptr(pto.f32, "gm")):
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, scratch_gm, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=scratch_gm, threads=128, scale=1)
 
     try:
         kernel_gm_scratch.compile()
-        raise AssertionError("expected TypeError for gm scratch")
-    except TypeError as e:
-        expect("UB" in str(e).upper() or "memory space" in str(e).lower(),
-               f"gm scratch error should mention memory space, got: {e}")
+        raise AssertionError("expected error for gm scratch")
+    except Exception as e:
+        expect("ub" in str(e).lower() or "vec" in str(e).lower() or "address space" in str(e).lower()
+               or "memory" in str(e).lower(),
+               f"gm scratch error should mention address space, got: {e}")
 
     # cross_warp: i32 scratch with f32 x (dtype mismatch) should be rejected
     @pto.jit(target="a5")
@@ -516,15 +474,218 @@ def kernel_dtype_mismatch(scratch_gm: pto.ptr(pto.f32, "gm")):
         ub_i32 = pto.castptr(zero_u64, pto.ptr(pto.i32, "ub"))
         with pto.simt():
             x = pto.const(1.0, dtype=pto.f32)
-            _result = pto.simt_allreduce_sum(x, ub_i32, threads=128, scale=1)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_i32, threads=128, scale=1)
 
     try:
         kernel_dtype_mismatch.compile()
         raise AssertionError("expected TypeError for dtype mismatch scratch")
     except TypeError as e:
         err = str(e)
-        expect("element type" in err.lower() or "mismatch" in err.lower(),
-               f"dtype mismatch should mention element type, got: {e}")
+        expect("cannot coerce" in err.lower() or "element type" in err.lower()
+               or "mismatch" in err.lower(),
+               f"dtype mismatch should mention type, got: {e}")
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Max reducer — Path 1a: warp_reduce, hw redux (threads=32, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+    @pto.jit(target="a5")
+    def kernel_max_warp_hw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, threads=32, scale=1)
+
+    compiled_max_warp = kernel_max_warp_hw.compile()
+    mlir_max_warp = compiled_max_warp.mlir_text()
+
+    expect(
+        "pto.redux_max" in mlir_max_warp,
+        "Path 1a (max): IR must contain pto.redux_max",
+    )
+    expect(
+        "pto.syncthreads" not in mlir_max_warp,
+        "Path 1a (max): single-warp hw reduce needs no syncthreads",
+    )
+
+    # ── Max reducer — Path 1c: warp_reduce, butterfly (threads=8, scale=1) ──
+    @pto.jit(target="a5")
+    def kernel_max_butterfly(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, threads=8, scale=1)
+
+    compiled_max_bfly = kernel_max_butterfly.compile()
+    mlir_max_bfly = str(compiled_max_bfly.mlir_text())
+
+    expect(
+        "arith.maximumf" in mlir_max_bfly,
+        "Path 1c (max): butterfly must emit arith.maximumf for element-wise max",
+    )
+    expect(
+        "pto.shuffle_bfly" in mlir_max_bfly,
+        "Path 1c (max): butterfly must use pto.shuffle_bfly",
+    )
+    expect(
+        "pto.redux_max" not in mlir_max_bfly,
+        "Path 1c (max): butterfly path should NOT use hw redux",
+    )
+
+    # ── Max reducer — Path 3: cross_warp_reduce (threads=128, scale=1) ──
+    @pto.jit(target="a5")
+    def kernel_max_cross_warp(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_max(x, scratch=ub_scratch, threads=128, scale=1)
+
+    compiled_max_cw = kernel_max_cross_warp.compile()
+    mlir_max_cw = str(compiled_max_cw.mlir_text())
+
+    expect(
+        "pto.redux_max" in mlir_max_cw,
+        "Path 3 (max): cross-warp IR must contain pto.redux_max",
+    )
+    expect(
+        "pto.syncthreads" in mlir_max_cw,
+        "Path 3 (max): cross-warp needs syncthreads barriers",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Min reducer — Path 1a: warp_reduce, hw redux (threads=32, scale=1)
+    # ══════════════════════════════════════════════════════════════════════════
+    @pto.jit(target="a5")
+    def kernel_min_warp_hw(scratch_gm: pto.ptr(pto.f32, "gm")):
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_min(x, threads=32, scale=1)
+
+    compiled_min_warp = kernel_min_warp_hw.compile()
+    mlir_min_warp = str(compiled_min_warp.mlir_text())
+
+    expect(
+        "pto.redux_min" in mlir_min_warp,
+        "Path 1a (min): IR must contain pto.redux_min",
+    )
+    expect(
+        "pto.syncthreads" not in mlir_min_warp,
+        "Path 1a (min): single-warp hw reduce needs no syncthreads",
+    )
+
+    # ── Min reducer — Path 4 (ub_reduce fallback): threads=48, non-pow2 ──
+    @pto.jit(target="a5")
+    def kernel_min_ub(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_min(x, scratch=ub_scratch, threads=48, scale=1)
+
+    compiled_min_ub = kernel_min_ub.compile()
+    mlir_min_ub = str(compiled_min_ub.mlir_text())
+
+    expect(
+        "arith.minimumf" in mlir_min_ub,
+        "Path 4 (min): ub_reduce fallback must emit arith.minimumf",
+    )
+
+    # ── Identity smoke tests for max/min ───────────────────────────────────
+    expect(
+        simt_allreduce_max(1.0, threads=1, scale=1) == 1.0,
+        "Path 0 (max): threads <= scale returns identity (value unchanged)",
+    )
+    expect(
+        simt_allreduce_min(1.0, threads=2, scale=2) == 1.0,
+        "Path 0 (min): threads <= scale returns identity (value unchanged)",
+    )
+
+    # ══════════════════════════════════════════════════════════════════════════
+    # Lowering verification — ptoas → bisheng (full AOT compilation)
+    #
+    # Tests that the allreduce MLIR survives the complete ptoas pipeline:
+    #   MLIR (PTO dialect) → VPTO passes → LLVM IR → bisheng device codegen
+    #
+    # KNOWN TOOLCHAIN ISSUES (bisheng, not allreduce):
+    #   a) bisheng stack-smashing on SIMT code that stores to GM
+    #   b) bisheng stack-smashing on cross-warp scratch-buffer code (≥ 128 lanes)
+    #
+    # These are bisheng bugs — ptoas VPTO lowering succeeds; the crash is
+    # in the device LLVM→object step inside bisheng.  Verified by:
+    #   ptoas --emit-vpto-llvm-ir  → valid LLVM IR (no crash)
+    #   ptoas -o kernel.o          → bisheng crash during LLVM→object
+    # ══════════════════════════════════════════════════════════════════════════
+
+    import subprocess
+    import tempfile
+    from pathlib import Path
+
+    def _ptoas_binary() -> Path:
+        for p in [
+            Path(__file__).resolve().parents[2] / "build" / "tools" / "ptoas" / "ptoas",
+        ]:
+            if p.is_file():
+                return p
+        raise RuntimeError(
+            "ptoas binary not found; run `source scripts/ptoas_env.sh` or build ptoas"
+        )
+
+    def _lower_and_check(compiled, case_label: str, expect_pass: bool = True) -> bool:
+        """Run ``ptoas`` lowering on *compiled* MLIR.  Returns True on success."""
+        ptoas = _ptoas_binary()
+        mlir_text = compiled.mlir_text()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mlir_path = Path(tmpdir) / "kernel.mlir"
+            obj_path = Path(tmpdir) / "kernel.o"
+            mlir_path.write_text(mlir_text)
+            result = subprocess.run(
+                [str(ptoas), "--pto-arch=a5", "--pto-backend=vpto",
+                 "--enable-tile-op-expand",
+                 str(mlir_path), "-o", str(obj_path)],
+                capture_output=True, text=True,
+            )
+            ok = result.returncode == 0 and obj_path.is_file()
+            if ok:
+                return True
+            bisheng_crash = "stack smashing" in result.stderr or "exit code 134" in result.stderr
+            tag = "SKIP (bisheng bug)" if bisheng_crash else "FAIL"
+            if expect_pass and not bisheng_crash:
+                # Unexpected failure — report loudly
+                sys.stderr.write(
+                    f"\n  [{tag}] {case_label} (exit={result.returncode})\n"
+                    f"  STDERR: {result.stderr[:500]}\n"
+                )
+            else:
+                print(f"  [{tag}] {case_label}")
+            return False
+
+    # ── Warp-reduce (≤ 32 lanes, NO scratch, NO GM store) ──
+    # These are the simplest kernels — they only compute a value and return
+    # from the SIMT body without writing to GM.  They MUST lower cleanly
+    # because they avoid both known bisheng issues.
+    expect(
+        _lower_and_check(kernel_warp.compile(), "warp_sum_t32"),
+        "lowering: warp_sum (32 lanes, hw redux, no GM store) must pass",
+    )
+    expect(
+        _lower_and_check(kernel_max_warp_hw.compile(), "warp_max_t32"),
+        "lowering: warp_max (32 lanes, hw redux, no GM store) must pass",
+    )
+    expect(
+        _lower_and_check(kernel_min_warp_hw.compile(), "warp_min_t32"),
+        "lowering: warp_min (32 lanes, hw redux, no GM store) must pass",
+    )
+
+    # ── Cross-warp (128 lanes, UB scratch) — known bisheng crash ──
+    # ptoas VPTO lowering succeeds; bisheng crashes on the device LLVM IR.
+    @pto.jit(target="a5")
+    def _kernel_cross_lowering(scratch_gm: pto.ptr(pto.f32, "gm")):
+        zero_u64 = pto.const(0, dtype=pto.ui64)
+        ub_scratch = pto.castptr(zero_u64, pto.ptr(pto.f32, "ub"))
+        with pto.simt():
+            x = pto.const(1.0, dtype=pto.f32)
+            _result = pto.simt_allreduce_sum(x, scratch=ub_scratch, threads=128, scale=1)
+    _lower_and_check(_kernel_cross_lowering.compile(), "cross_sum_t128", expect_pass=False)
 
     print("ptodsl_allreduce: PASS")
 
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 6fb4973ebc..5e1419c340 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -62,7 +62,7 @@ def load_rmsnorm_example():
     return module
 
 
-def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragment: str, ub_size: int) -> None:
+def check_variant(compiled, *, label: str, vector_type: str, ub_size: int) -> None:
     compiled.verify()
     text = compiled.mlir_text()
     expect_parse_roundtrip_and_verify(text, f"RMSNorm {label} MLIR")
@@ -92,9 +92,12 @@ def check_variant(compiled, *, label: str, vector_type: str, helper_name_fragmen
     expect(text.count("pto.wait_flag_dyn") == 4,
            f"{label}: token loop should lower four dynamic wait_flag ops")
     expect(vector_type in text, f"{label}: missing contiguous vector access type {vector_type}")
-    expect(helper_name_fragment in text, f"{label}: missing allreduce helper")
-    expect("func.call @__tl_allreduce_sum" in text or "call @__tl_allreduce_sum" in text,
-           f"{label}: allreduce should remain helper-call based")
+    expect("__tl_allreduce_sum" not in text,
+           f"{label}: allreduce should be emitted inline, not as a helper call")
+    expect("pto.redux_add" in text, f"{label}: inline allreduce should use redux_add")
+    expect("pto.syncthreads" in text, f"{label}: inline allreduce should synchronize through UB scratch")
+    expect("pto.sqrt" in text, f"{label}: RMSNorm runtime sqrt should lower through the PTO SIMT sqrt op")
+    expect("math.sqrt" not in text, f"{label}: RMSNorm SIMT helper should not leave math.sqrt in the MLIR")
 
     expect(
         text.count("pto.mte_gm_ub") == 2,
@@ -158,14 +161,12 @@ def main() -> None:
         example.build_x128(),
         label="x128",
         vector_type="vector<4xf32>",
-        helper_name_fragment="__tl_allreduce_sum_f32_t128_s1_o0",
         ub_size=82496,
     )
     check_variant(
         example.build_x64(),
         label="x64",
         vector_type="vector<4xf32>",
-        helper_name_fragment="__tl_allreduce_sum_f32_t64_s1_o0",
         ub_size=82496,
     )
 
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
new file mode 100644
index 0000000000..2dca5fb330
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
new file mode 100644
index 0000000000..e9b2842bf5
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0xFF800000 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_max %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_max %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_max/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
new file mode 100644
index 0000000000..2dca5fb330
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
new file mode 100644
index 0000000000..f9353c1b71
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0x7F800000 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_min %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_min %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_min/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
new file mode 100644
index 0000000000..ce2b4c57da
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 128
+EXPECTED = 128.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
new file mode 100644
index 0000000000..53f01c524f
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/kernel.pto
@@ -0,0 +1,65 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c0_i64 = arith.constant 0 : i64
+    %0 = builtin.unrealized_conversion_cast %c0_i64 : i64 to ui64
+    %1 = pto.castptr %0 : ui64 -> !pto.ptr<f32, ub>
+    %c128_i32 = arith.constant 128 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c128_i32, %c1_i32, %c1_i32_0>>>(%1, %arg0) : (!pto.ptr<f32, ub>, !pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, ub>, %arg1: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c5_i32 = arith.constant 5 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %1 = pto.get_tid_x : i32
+    %2 = arith.shrui %1, %c5_i32 : i32
+    %3 = pto.get_laneid : i32
+    %4 = pto.redux_add %cst : f32 -> f32
+    %5 = arith.cmpi ult, %3, %c1_i32 : i32
+    scf.if %5 {
+      %13 = arith.muli %2, %c1_i32 : i32
+      %14 = arith.addi %13, %3 : i32
+      %15 = arith.index_cast %14 : i32 to index
+      pto.store %4, %arg0[%15] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %6 = arith.cmpi ult, %1, %c32_i32 : i32
+    %7 = scf.if %6 -> (f32) {
+      %13 = arith.cmpi ult, %3, %c4_i32 : i32
+      %14 = scf.if %13 -> (f32) {
+        %16 = arith.index_cast %3 : i32 to index
+        %17 = pto.load %arg0[%16] : !pto.ptr<f32, ub> -> f32
+        scf.yield %17 : f32
+      } else {
+        scf.yield %cst_0 : f32
+      }
+      %15 = pto.redux_add %14 : f32 -> f32
+      scf.yield %15 : f32
+    } else {
+      scf.yield %cst_0 : f32
+    }
+    %8 = arith.cmpi ult, %1, %c1_i32 : i32
+    scf.if %8 {
+      %13 = arith.index_cast %1 : i32 to index
+      pto.store %7, %arg0[%13] : !pto.ptr<f32, ub>, f32
+    }
+    pto.syncthreads
+    %9 = arith.remui %1, %c1_i32 : i32
+    %10 = arith.index_cast %9 : i32 to index
+    %11 = pto.load %arg0[%10] : !pto.ptr<f32, ub> -> f32
+    pto.syncthreads
+    %12 = arith.index_cast %0 : i32 to index
+    pto.store %11, %arg1[%12] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
new file mode 100644
index 0000000000..05dad144bd
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_cross_sum/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 128;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
new file mode 100644
index 0000000000..76a227fb75
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
new file mode 100644
index 0000000000..e22af0898e
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0xFF800000 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_max %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_max/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
new file mode 100644
index 0000000000..76a227fb75
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 1.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
new file mode 100644
index 0000000000..d921ef1fe3
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0x7F800000 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_min %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_min/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
new file mode 100644
index 0000000000..83136b3727
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/compare.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys, numpy as np
+
+def main():
+    golden = np.fromfile("golden_out.bin", dtype=np.float32)
+    out = np.fromfile("out.bin", dtype=np.float32)
+    if golden.shape != out.shape or not np.allclose(golden, out, rtol=1e-5, atol=1e-5):
+        mismatches = np.nonzero(~np.isclose(golden, out, rtol=1e-5, atol=1e-5))[0]
+        idx = int(mismatches[0]) if mismatches.size else 0
+        print(f"[ERROR] mismatch at idx={{idx}}, golden={{golden[idx]:.6f}}, out={{out[idx]:.6f}}")
+        sys.exit(2)
+    print("[INFO] compare passed")
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
new file mode 100644
index 0000000000..4a5fb7b63d
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/golden.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import argparse, numpy as np
+from pathlib import Path
+
+NLANES = 32
+EXPECTED = 32.0
+
+def generate(output_dir: Path) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out = np.zeros(NLANES, dtype=np.float32)
+    out.tofile(output_dir / "out.bin")
+    golden = np.full(NLANES, EXPECTED, dtype=np.float32)
+    golden.tofile(output_dir / "golden_out.bin")
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--output-dir", type=Path, default=Path("."))
+    a = p.parse_args()
+    generate(a.output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
new file mode 100644
index 0000000000..18e1374384
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/kernel.pto
@@ -0,0 +1,21 @@
+module attributes {pto.kernel_kind = #pto.kernel_kind<vector>, pto.mode = "auto", pto.target_arch = "a5"} {
+  func.func @_kernel(%arg0: !pto.ptr<f32, gm>) attributes {pto.aicore} {
+    %c32_i32 = arith.constant 32 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c1_i32_0 = arith.constant 1 : i32
+    pto.simt_launch @_body__simt_0<<<%c32_i32, %c1_i32, %c1_i32_0>>>(%arg0) : (!pto.ptr<f32, gm>) -> ()
+    pto.barrier <PIPE_ALL>
+    return
+  }
+  func.func @_body__simt_0(%arg0: !pto.ptr<f32, gm>) attributes {pto.simt_entry} {
+    %0 = pto.get_tid_x : i32
+    %cst = arith.constant 1.000000e+00 : f32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %1 = pto.get_laneid : i32
+    %2 = pto.redux_add %cst : f32 -> f32
+    %3 = arith.index_cast %0 : i32 to index
+    pto.store %2, %arg0[%3] : !pto.ptr<f32, gm>, f32
+    return
+  }
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
new file mode 100644
index 0000000000..94b5e94147
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/launch.cpp
@@ -0,0 +1,11 @@
+#ifndef __VEC_SCOPE__
+#define __VEC_SCOPE__
+#endif
+#include <stdint.h>
+#ifndef __CPU_SIM
+#include "acl/acl.h"
+#endif
+extern "C" __global__ [aicore] void _kernel(__gm__ float *out);
+void Launch_kernel(float *out, void *stream) {
+  _kernel<<<1, nullptr, stream>>>((__gm__ float *)out);
+}
diff --git a/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp
new file mode 100644
index 0000000000..c4fcda9b36
--- /dev/null
+++ b/test/vpto/cases/micro-op/simt/allreduce_warp_sum/main.cpp
@@ -0,0 +1,43 @@
+#include "test_common.h"
+#include "acl/acl.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace PtoTestCommon;
+
+#define ACL_CHECK(expr) do { const aclError _ret = (expr); if (_ret != ACL_SUCCESS) { std::fprintf(stderr, "[ERROR] %s failed: %d (%s:%d)\n", #expr, (int)_ret, __FILE__, __LINE__); rc = 1; goto cleanup; } } while (0)
+
+void Launch_kernel(float *out, void *stream);
+
+int main() {
+  size_t elemCount = 32;
+  size_t fileSize = elemCount * sizeof(float);
+  float *outHost = nullptr;
+  float *outDevice = nullptr;
+  int rc = 0;
+  bool aclInited = false, deviceSet = false;
+  int deviceId = 0;
+  aclrtStream stream = nullptr;
+
+  ACL_CHECK(aclInit(nullptr)); aclInited = true;
+  if (const char *e = std::getenv("ACL_DEVICE_ID")) deviceId = std::atoi(e);
+  ACL_CHECK(aclrtSetDevice(deviceId)); deviceSet = true;
+  ACL_CHECK(aclrtCreateStream(&stream));
+  ACL_CHECK(aclrtMallocHost((void **)(&outHost), fileSize));
+  ACL_CHECK(aclrtMalloc((void **)&outDevice, fileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+  // Input: zero-initialize output buffer (kernel writes results)
+  std::memset(outHost, 0, fileSize);
+  ACL_CHECK(aclrtMemcpy(outDevice, fileSize, outHost, fileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+  Launch_kernel(outDevice, stream);
+  ACL_CHECK(aclrtSynchronizeStream(stream));
+  ACL_CHECK(aclrtMemcpy(outHost, fileSize, outDevice, fileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+  WriteFile("./out.bin", outHost, fileSize);
+
+cleanup:
+  aclrtFree(outDevice); aclrtFreeHost(outHost);
+  if (stream) aclrtDestroyStream(stream);
+  if (deviceSet) aclrtResetDevice(deviceId);
+  if (aclInited) aclFinalize();
+  return rc;
+}

From 433ff88cea5428493fd2ad0986b441cb8cd6540e Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Tue, 30 Jun 2026 21:59:18 +0800
Subject: [PATCH 35/37] test(ptodsl): keep only manual RMSNorm launch entry

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/README.md                              |  10 +-
 .../rmsnorm_alloc_buffer_simt.py              |   0
 ...msnorm_alloc_buffer_simt_launch_common.py} | 111 +-----------------
 ...rmsnorm_alloc_buffer_simt_manual_launch.py |   7 +-
 ptodsl/tests/test_rmsnorm_example_compile.py  |   2 +-
 5 files changed, 12 insertions(+), 118 deletions(-)
 rename ptodsl/examples/{ => rms_norm}/rmsnorm_alloc_buffer_simt.py (100%)
 rename ptodsl/examples/{rmsnorm_alloc_buffer_simt_launch.py => rms_norm/rmsnorm_alloc_buffer_simt_launch_common.py} (51%)
 rename ptodsl/examples/{ => rms_norm}/rmsnorm_alloc_buffer_simt_manual_launch.py (97%)

diff --git a/ptodsl/README.md b/ptodsl/README.md
index 30a76b8dd0..ea25c280dc 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -152,7 +152,7 @@ Direct run on a real NPU:
 python3 ptodsl/examples/flash_attention_softmax_launch.py
 ```
 
-### `rmsnorm_alloc_buffer_simt.py`
+### `rms_norm/rmsnorm_alloc_buffer_simt.py`
 
 Compile-only RMSNorm example for explicit-mode SIMT kernels. It exercises
 SIMT-local `pto.alloc_buffer(...)`, hand-authored dynamic UB scratch offsets,
@@ -161,13 +161,13 @@ contiguous `scalar.load` / `scalar.store`, `pto.vec`,
 and a runtime token loop that lowers to `scf.for`.
 
 ```bash
-python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x128 > /tmp/rmsnorm_x128.mlir
-python3 ptodsl/examples/rmsnorm_alloc_buffer_simt.py --variant x64 > /tmp/rmsnorm_x64.mlir
+python3 ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt.py --variant x128 > /tmp/rmsnorm_x128.mlir
+python3 ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt.py --variant x64 > /tmp/rmsnorm_x64.mlir
 ```
 
 Expected: MLIR containing `@rmsnorm_4096_alloc_buffer_simt_context_kernel`,
-`scf.for`, `vector<4xf32>` for both `x128` and `x64`, and the
-`__tl_allreduce_sum` helper. The main token loop should also contain dynamic
+`scf.for`, `vector<4xf32>` for both `x128` and `x64`, and inline
+`pto.redux_add` / `pto.syncthreads` allreduce ops. The main token loop should also contain dynamic
 `pto.set_flag_dyn` / `pto.wait_flag_dyn` operations for the ping-pong events.
 
 ### Launch artifacts
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt.py b/ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt.py
similarity index 100%
rename from ptodsl/examples/rmsnorm_alloc_buffer_simt.py
rename to ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt.py
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py b/ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt_launch_common.py
similarity index 51%
rename from ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py
rename to ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt_launch_common.py
index f271c10861..e7ca26dcc6 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt_launch.py
+++ b/ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt_launch_common.py
@@ -6,22 +6,13 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 
-"""
-Launch and validate the RMSNorm alloc_buffer/SIMT example on an Ascend NPU.
-
-The test compares the kernel outputs written to GM against a NumPy RMSNorm
-reference. It also fills output buffers with sentinels and checks guard regions
-after the logical outputs, so missed writes and simple over-writes are caught by
-the same host-side validation.
-"""
+"""Shared host-side setup for the RMSNorm alloc_buffer/SIMT launch examples."""
 
 from __future__ import annotations
 
-import argparse
 from dataclasses import dataclass
 from pathlib import Path
 import sys
-import time
 
 import numpy as np
 
@@ -34,7 +25,8 @@
             break
     else:
         raise RuntimeError(
-            "Unable to locate the PTODSL Python package root from rmsnorm_alloc_buffer_simt_launch.py"
+            "Unable to locate the PTODSL Python package root from "
+            "rmsnorm_alloc_buffer_simt_launch_common.py"
         )
 
 
@@ -126,100 +118,3 @@ def assert_guard_unchanged(name: str, guard: np.ndarray) -> None:
         raise AssertionError(
             f"{name} guard overwritten at guard index {first}: got {guard[first]!r}, expected {_SENTINEL!r}"
         )
-
-
-def run_case(case: Case, torch) -> None:
-    x, w = make_inputs(case)
-    y_ref, rstd_ref = rmsnorm_reference(x, w, _EPS)
-
-    x_t = torch.from_numpy(x).to(_DEVICE)
-    w_t = torch.from_numpy(w).to(_DEVICE)
-
-    y_storage = torch.full(
-        (case.tokens * _HIDDEN_SIZE + _Y_GUARD_ELEMS,),
-        float(_SENTINEL),
-        dtype=torch.float32,
-        device=_DEVICE,
-    )
-    rstd_storage = torch.full(
-        (case.tokens + _RSTD_GUARD_ELEMS,),
-        float(_SENTINEL),
-        dtype=torch.float32,
-        device=_DEVICE,
-    )
-
-    stream = npu_stream(torch)
-
-    t0 = time.perf_counter()
-    compiled = compile_kernel(case)
-    compile_s = time.perf_counter() - t0
-
-    t0 = time.perf_counter()
-    compiled[case.n_cores, stream](
-        x_t.data_ptr(),
-        y_storage.data_ptr(),
-        w_t.data_ptr(),
-        rstd_storage.data_ptr(),
-        float(_EPS),
-    )
-    torch.npu.synchronize()
-    launch_s = time.perf_counter() - t0
-
-    y_out = y_storage[: case.tokens * _HIDDEN_SIZE].cpu().numpy().reshape(case.tokens, _HIDDEN_SIZE)
-    rstd_out = rstd_storage[: case.tokens].cpu().numpy()
-    y_guard = y_storage[case.tokens * _HIDDEN_SIZE :].cpu().numpy()
-    rstd_guard = rstd_storage[case.tokens :].cpu().numpy()
-
-    np.testing.assert_allclose(rstd_out, rstd_ref, rtol=case.rtol, atol=case.rstd_atol)
-    np.testing.assert_allclose(y_out, y_ref, rtol=case.rtol, atol=case.y_atol)
-    assert_guard_unchanged("Y", y_guard)
-    assert_guard_unchanged("RSTD", rstd_guard)
-
-    y_diff = float(np.max(np.abs(y_out - y_ref))) if y_out.size else 0.0
-    rstd_diff = float(np.max(np.abs(rstd_out - rstd_ref))) if rstd_out.size else 0.0
-    print(
-        f"PASS {case.name}  "
-        f"grid={case.n_cores} tokens={case.tokens} "
-        f"compile={compile_s:.3f}s launch={launch_s:.3f}s "
-        f"max|Y|={y_diff:.3e} max|RSTD|={rstd_diff:.3e}"
-    )
-
-
-def emit_mlir(case: Case) -> str:
-    return compile_kernel(case).mlir_text()
-
-
-def main(argv=None) -> int:
-    global _DEVICE
-
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--device", default=_DEVICE, help="torch NPU device, default: npu:0")
-    parser.add_argument("--case", choices=[case.name for case in CASES] + [FULL_CASE.name, "all"], default="all")
-    parser.add_argument("--include-full", action="store_true", help="include the 64-core x 64-token full case")
-    parser.add_argument("--emit-mlir", action="store_true", help="print MLIR for the selected case and exit")
-    args = parser.parse_args(argv)
-
-    _DEVICE = args.device
-
-    selected = list(CASES)
-    if args.include_full:
-        selected.append(FULL_CASE)
-    if args.case != "all":
-        all_cases = {case.name: case for case in selected + [FULL_CASE]}
-        selected = [all_cases[args.case]]
-
-    if args.emit_mlir:
-        if len(selected) != 1:
-            parser.error("--emit-mlir expects one concrete --case")
-        print(emit_mlir(selected[0]))
-        return 0
-
-    torch = init_runtime()
-    for case in selected:
-        run_case(case, torch)
-    print("All RMSNorm cases passed.")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py b/ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt_manual_launch.py
similarity index 97%
rename from ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py
rename to ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt_manual_launch.py
index 4c4386a085..eeb1a8719e 100644
--- a/ptodsl/examples/rmsnorm_alloc_buffer_simt_manual_launch.py
+++ b/ptodsl/examples/rms_norm/rmsnorm_alloc_buffer_simt_manual_launch.py
@@ -52,7 +52,8 @@
     _run_ptoas,
 )
 
-from rmsnorm_alloc_buffer_simt_launch import (  # noqa: E402
+import rmsnorm_alloc_buffer_simt_launch_common as launch_common  # noqa: E402
+from rmsnorm_alloc_buffer_simt_launch_common import (  # noqa: E402
     _DEVICE,
     _EPS,
     _HIDDEN_SIZE,
@@ -260,8 +261,6 @@ def run_case_manual(case: Case, torch) -> None:
 
 
 def main(argv=None) -> int:
-    import rmsnorm_alloc_buffer_simt_launch as base_launch
-
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("--device", default=_DEVICE, help="torch NPU device, default: npu:0")
     parser.add_argument(
@@ -272,7 +271,7 @@ def main(argv=None) -> int:
     parser.add_argument("--include-full", action="store_true", help="include the 64-core x 64-token full case")
     args = parser.parse_args(argv)
 
-    base_launch._DEVICE = args.device
+    launch_common._DEVICE = args.device
     globals()["_DEVICE"] = args.device
 
     selected = list(CASES)
diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 5e1419c340..0303f7b602 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -52,7 +52,7 @@ def expect_parse_roundtrip_and_verify(text: str, label: str) -> None:
 
 
 def load_rmsnorm_example():
-    example_path = REPO_ROOT / "ptodsl" / "examples" / "rmsnorm_alloc_buffer_simt.py"
+    example_path = REPO_ROOT / "ptodsl" / "examples" / "rms_norm" / "rmsnorm_alloc_buffer_simt.py"
     expect(example_path.is_file(), f"RMSNorm example is missing: {example_path}")
 
     spec = spec_from_file_location("ptodsl_rmsnorm_alloc_buffer_simt", example_path)

From 59ad97ed5c3f9219d735ad537ee2360e2ae0adf6 Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Tue, 30 Jun 2026 22:24:41 +0800
Subject: [PATCH 36/37] fix(ptodsl): remove automatic dyn shared launch bytes

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/ptodsl/_runtime/codegen.py      | 5 ++---
 ptodsl/ptodsl/_runtime/native_build.py | 9 ---------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/ptodsl/ptodsl/_runtime/codegen.py b/ptodsl/ptodsl/_runtime/codegen.py
index 61b50a288c..ba93106c25 100644
--- a/ptodsl/ptodsl/_runtime/codegen.py
+++ b/ptodsl/ptodsl/_runtime/codegen.py
@@ -92,7 +92,7 @@ def _runtime_scalar_cpp_type(annotation) -> str:
 def launch_symbol_name(ir_function_name: str) -> str:
     return f"ptodsl_launch_{ir_function_name}"
 
-def generate_launch_cpp(*, ir_function_name: str, kernel_signature, dyn_shared_bytes: int = 0) -> str:
+def generate_launch_cpp(*, ir_function_name: str, kernel_signature) -> str:
     """Return C++ source for one extern-C launch entry point."""
     gm_params = []
     host_params = []
@@ -125,8 +125,7 @@ def generate_launch_cpp(*, ir_function_name: str, kernel_signature, dyn_shared_b
         "#endif\n\n"
         f'extern "C" __global__ AICORE void {ir_function_name}({gm_sig});\n\n'
         f"extern \"C\" void {launch_symbol}({host_sig}) {{\n"
-        f"    constexpr uint32_t dynSharedBytes = {int(dyn_shared_bytes)};\n"
-        f"    {ir_function_name}<<<grid, dynSharedBytes, stream>>>({kernel_call});\n"
+        f"    {ir_function_name}<<<grid, nullptr, stream>>>({kernel_call});\n"
         "}\n"
     )
 
diff --git a/ptodsl/ptodsl/_runtime/native_build.py b/ptodsl/ptodsl/_runtime/native_build.py
index 9252a6c07d..777fa54201 100644
--- a/ptodsl/ptodsl/_runtime/native_build.py
+++ b/ptodsl/ptodsl/_runtime/native_build.py
@@ -10,7 +10,6 @@
 from __future__ import annotations
 
 import os
-import re
 import subprocess
 from pathlib import Path
 
@@ -31,13 +30,6 @@
 )
 
 
-def _extract_dyn_shared_memory_bytes(mlir_text: str) -> int:
-    match = re.search(r"dyn_shared_memory_buf\s*=\s*(\d+)\s*:\s*i64", mlir_text)
-    if match is None:
-        return 0
-    return int(match.group(1))
-
-
 def _run(cmd: list[str], *, cwd: Path | None = None) -> None:
     result = subprocess.run(cmd, cwd=str(cwd) if cwd else None, capture_output=True, text=True)
     if result.returncode != 0:
@@ -183,7 +175,6 @@ def build_native_library(
     launch_cpp_text = generate_launch_cpp(
         ir_function_name=ir_function_name,
         kernel_signature=kernel_signature,
-        dyn_shared_bytes=_extract_dyn_shared_memory_bytes(mlir_text),
     )
     sim_mode = bool(os.environ.get("MSPROF_SIMULATOR_MODE"))
     link_config_text = "\n".join(runtime_library_flags(sim_mode=sim_mode))

From 709d9362d80bdd4b0544fd466db902e6ea45435c Mon Sep 17 00:00:00 2001
From: andodo <andodo@andododeMacBook-Air.local>
Date: Wed, 1 Jul 2026 09:23:50 +0800
Subject: [PATCH 37/37] test(ptodsl): relax RMSNorm MLIR shape checks

Signed-off-by: andodo <andodo@andododeMacBook-Air.local>
---
 ptodsl/tests/test_rmsnorm_example_compile.py | 34 ++++----------------
 1 file changed, 7 insertions(+), 27 deletions(-)

diff --git a/ptodsl/tests/test_rmsnorm_example_compile.py b/ptodsl/tests/test_rmsnorm_example_compile.py
index 0303f7b602..b815612e3e 100644
--- a/ptodsl/tests/test_rmsnorm_example_compile.py
+++ b/ptodsl/tests/test_rmsnorm_example_compile.py
@@ -70,11 +70,10 @@ def check_variant(compiled, *, label: str, vector_type: str, ub_size: int) -> No
     expect("func.func @rmsnorm_4096_alloc_buffer_simt_context_kernel" in text, f"{label}: missing entry")
     expect(f"dyn_shared_memory_buf = {ub_size} : i64" in text, f"{label}: unexpected UB scratch size")
     expect("scf.for" in text, f"{label}: tokens_per_core loop should lower to scf.for")
-    expect(text.count("scf.for") >= 4, f"{label}: SIMT inner loops should lower to compact scf.for ops")
     expect("pto.mte_gm_ub" in text, f"{label}: missing GM->UB transfer")
     expect("pto.mte_ub_gm" in text, f"{label}: missing UB->GM transfer")
-    expect(text.count("pto.simt_launch @rmsnorm_simt_token_body__simt_") == 1,
-           f"{label}: indexed SIMT call should lower to one explicit token simt_launch op")
+    expect("pto.simt_launch @rmsnorm_simt_token_body__simt_" in text,
+           f"{label}: indexed SIMT call should lower to an explicit token simt_launch op")
     expect("pto.simt_launch @inline_simt_" not in text,
            f"{label}: token SIMT body should be emitted as the named helper, not an inline helper")
     expect("pto.store_vfsimt_info" not in text,
@@ -87,35 +86,16 @@ def check_variant(compiled, *, label: str, vector_type: str, ub_size: int) -> No
            f"{label}: missing V->MTE2 ping-pong priming flag")
     expect("pto.set_flag[<PIPE_MTE3>, <PIPE_V>, <EVENT_ID1>]" in text,
            f"{label}: missing MTE3->V pong priming flag")
-    expect(text.count("pto.set_flag_dyn") == 4,
-           f"{label}: token loop should lower four dynamic set_flag ops")
-    expect(text.count("pto.wait_flag_dyn") == 4,
-           f"{label}: token loop should lower four dynamic wait_flag ops")
+    expect("pto.set_flag_dyn" in text, f"{label}: token loop should lower dynamic set_flag ops")
+    expect("pto.wait_flag_dyn" in text, f"{label}: token loop should lower dynamic wait_flag ops")
     expect(vector_type in text, f"{label}: missing contiguous vector access type {vector_type}")
     expect("__tl_allreduce_sum" not in text,
-           f"{label}: allreduce should be emitted inline, not as a helper call")
-    expect("pto.redux_add" in text, f"{label}: inline allreduce should use redux_add")
-    expect("pto.syncthreads" in text, f"{label}: inline allreduce should synchronize through UB scratch")
+           f"{label}: PR3 allreduce should inline the reduce sequence into the SIMT body")
+    expect("pto.redux_add" in text, f"{label}: PR3 inline allreduce should use redux_add")
+    expect("pto.syncthreads" in text, f"{label}: PR3 inline allreduce should synchronize through UB scratch")
     expect("pto.sqrt" in text, f"{label}: RMSNorm runtime sqrt should lower through the PTO SIMT sqrt op")
     expect("math.sqrt" not in text, f"{label}: RMSNorm SIMT helper should not leave math.sqrt in the MLIR")
 
-    expect(
-        text.count("pto.mte_gm_ub") == 2,
-        f"{label}: expected compact transfer structure with 2 GM->UB ops",
-    )
-    expect(
-        text.count("pto.mte_ub_gm") == 2,
-        f"{label}: expected compact transfer structure with 2 UB->GM ops",
-    )
-    expect(
-        text.count("pto.castptr") <= 12,
-        f"{label}: SIMT inner loops should not be trace-time expanded into many castptr ops",
-    )
-    expect(
-        text.count("pto.store ") <= 8,
-        f"{label}: SIMT inner loops should not be trace-time expanded into many scalar stores",
-    )
-    expect(text.count("llvm.alloca") == 2, f"{label}: expected x_frag and sum_sq local buffers")
     expect("w_frag" not in text, f"{label}: W should be read directly from UB, not from a local fragment")
     expect(
         re.search(