From 7c39285b8ff845c6a6b7e68a1984308737072eeb Mon Sep 17 00:00:00 2001
From: jimmychou <47636600+jimmychou0@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:56:23 +0800
Subject: [PATCH 1/3] feat(ptodsl): redesign tileop simt/simd/cube subkernel
 surface

---
 .../mix-kernel-mix-backend-compile-flow.md    |   5 +-
 ...sl-redesign-of-simd-simt-cube-subkernel.md | 193 ++++++
 include/PTO/IR/PTO.h                          |  27 +
 include/PTO/Transforms/Passes.h               |   3 +
 include/PTO/Transforms/Passes.td              |  85 ++-
 lib/PTO/IR/PTO.cpp                            |  16 +-
 lib/PTO/Transforms/CMakeLists.txt             |   3 +
 .../Transforms/InsertSync/InsertSyncDebug.cpp |  33 +-
 .../Transforms/InsertSync/PTOIRTranslator.cpp | 101 ++-
 .../Transforms/PTOInferTileOpSummaryPass.cpp  | 439 +++++++++++++
 .../PTOInstantiateAndInlineOpLib.cpp          |   2 +-
 .../Transforms/PTOMaterializeTileHandles.cpp  |   2 +-
 .../PTOMaterializeTileOpSectionsPass.cpp      | 308 +++++++++
 .../PTONormalizeUncoveredTileSections.cpp     |   8 +-
 lib/PTO/Transforms/PTOPlanMemory.cpp          |   4 +
 .../PTOVerifySubkernelPipeContractPass.cpp    |   8 +-
 .../PTOVerifyTileOpContractPass.cpp           | 590 ++++++++++++++++++
 lib/PTO/Transforms/PTOViewToMemref.cpp        |  30 +-
 lib/PTO/Transforms/Utils.cpp                  |   4 +
 ptodsl/README.md                              |   4 +-
 ptodsl/docs/user_guide/01-introduction.md     |  20 +-
 ptodsl/docs/user_guide/02-quick-start.md      |  10 +-
 .../03-kernel-entry-and-subkernels.md         | 165 ++---
 ptodsl/docs/user_guide/05-control-flow.md     |   4 +-
 .../docs/user_guide/07-data-movement-ops.md   |  10 +-
 .../docs/user_guide/08-compute-operations.md  |   6 +-
 ptodsl/docs/user_guide/10-sync-ops.md         |   2 +-
 .../11-flash-attention-walkthrough.md         |  10 +-
 .../docs/user_guide/12-additional-examples.md |  12 +-
 ptodsl/examples/dynamic_softmax_launch.py     |   2 +-
 ptodsl/examples/fast_inverse_dense_launch.py  |   2 +-
 ptodsl/examples/flash_attention/gu.py         |   6 +-
 ptodsl/examples/flash_attention/softmax.py    |   8 +-
 ptodsl/examples/flash_attention_sketch.py     |   8 +-
 .../flash_attention_softmax_launch.py         |   2 +-
 .../inverse_block_inversion_launch.py         |   2 +-
 .../examples/mixed_backend_kernel_module.py   |   2 +-
 ptodsl/examples/softmax_dsl.py                |   2 +-
 ptodsl/examples/tadd_dsl.py                   |   4 +-
 ptodsl/examples/tilelang_codegen.py           |   4 +-
 ptodsl/ptodsl/_diagnostics.py                 |  19 +-
 ptodsl/ptodsl/_subkernels.py                  | 106 +++-
 ptodsl/ptodsl/_tracing/runtime.py             |   2 +-
 ptodsl/ptodsl/_tracing/session.py             |  95 ++-
 ptodsl/ptodsl/pto.py                          |   2 +-
 .../tests/support/docs_fragment_fixtures.py   |  22 +-
 ptodsl/tests/test_ast_rewrite_example_ir.py   |   6 +-
 .../test_flash_attention_demo_compile.py      |   4 +-
 ptodsl/tests/test_jit_compile.py              | 207 +++++-
 ptodsl/tests/test_ptoas_frontend_verify.py    |   2 +-
 ptodsl/tests/test_subkernel_diagnostics.py    |  36 +-
 test/dsl-st/cube_matrix_pipeline.py           |   9 +-
 test/dsl-st/predicate_pack.py                 |   2 +-
 test/dsl-st/vmulscvt.py                       |   2 +-
 ..._memory_ptodsl_tileop_helper_vlds_vsts.pto |  54 ++
 .../pto/tileop_subkernel_call_autosync.pto    |  70 +++
 .../pto/tileop_subkernel_call_sync_model.pto  |  55 ++
 ...tload_tprefetch_low_precision_a5_valid.pto |   8 +-
 .../lit/pto/tstore_low_precision_a5_valid.pto |   6 +-
 ...tileop_helper_normalize_uncovered_skip.pto |  60 ++
 test/lit/vpto/tileop_materialize_sections.pto |  57 ++
 ...leop_materialize_sections_control_flow.pto |  48 ++
 ...aterialize_sections_control_flow_mixed.pto |  62 ++
 .../vpto/tileop_materialize_sections_cube.pto |  64 ++
 test/lit/vpto/tileop_summary_attrs.pto        |  55 ++
 .../lit/vpto/tileop_summary_attrs_subview.pto |  43 ++
 ...eop_verify_contract_alloc_tile_invalid.pto |  42 ++
 ...erify_contract_memref_boundary_invalid.pto |  27 +
 ...op_verify_contract_nested_call_invalid.pto |  60 ++
 ...eop_verify_contract_no_primary_invalid.pto |  41 ++
 ...p_verify_contract_ptr_boundary_invalid.pto |  27 +
 .../tileop_verify_contract_result_invalid.pto |  29 +
 ...leop_verify_contract_simt_only_invalid.pto |  38 ++
 tools/ptoas/ptoas.cpp                         |   5 +
 74 files changed, 3248 insertions(+), 263 deletions(-)
 create mode 100644 docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md
 create mode 100644 lib/PTO/Transforms/PTOInferTileOpSummaryPass.cpp
 create mode 100644 lib/PTO/Transforms/PTOMaterializeTileOpSectionsPass.cpp
 create mode 100644 lib/PTO/Transforms/PTOVerifyTileOpContractPass.cpp
 create mode 100644 test/lit/pto/plan_memory_ptodsl_tileop_helper_vlds_vsts.pto
 create mode 100644 test/lit/pto/tileop_subkernel_call_autosync.pto
 create mode 100644 test/lit/pto/tileop_subkernel_call_sync_model.pto
 create mode 100644 test/lit/vpto/tileop_helper_normalize_uncovered_skip.pto
 create mode 100644 test/lit/vpto/tileop_materialize_sections.pto
 create mode 100644 test/lit/vpto/tileop_materialize_sections_control_flow.pto
 create mode 100644 test/lit/vpto/tileop_materialize_sections_control_flow_mixed.pto
 create mode 100644 test/lit/vpto/tileop_materialize_sections_cube.pto
 create mode 100644 test/lit/vpto/tileop_summary_attrs.pto
 create mode 100644 test/lit/vpto/tileop_summary_attrs_subview.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_alloc_tile_invalid.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_memref_boundary_invalid.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_nested_call_invalid.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_no_primary_invalid.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_ptr_boundary_invalid.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_result_invalid.pto
 create mode 100644 test/lit/vpto/tileop_verify_contract_simt_only_invalid.pto

diff --git a/docs/designs/mix-kernel-mix-backend-compile-flow.md b/docs/designs/mix-kernel-mix-backend-compile-flow.md
index 27c0d5c9b0..fa729491e6 100644
--- a/docs/designs/mix-kernel-mix-backend-compile-flow.md
+++ b/docs/designs/mix-kernel-mix-backend-compile-flow.md
@@ -237,7 +237,7 @@ For `@pto.simd` / `@pto.cube` and inline `with pto.simd():` / `with pto.cube():`
 scopes, PTODSL:
 
 - outlines the subkernel body into a helper `func.func` when needed
-- marks the helper with `pto.ptodsl.subkernel_helper`
+- marks the helper with canonical `pto.tileop.helper`
 - emits a helper call from the caller body
 
 This is the PTODSL-side expression of a logical mixed kernel: the entry or
@@ -526,4 +526,5 @@ Use this order when debugging mixed compilation:
 | `pto.aicore` | `func.func` | Legacy entry marker accepted for compatibility. |
 | `pto.internal.non_entry` | `func.func` | Frontend/helper metadata; not used for current entry inference. |
 | `pto.ptodsl.logical_name` | `func.func` | Source-level logical name used when assembling wrappers and peer references. |
-| `pto.ptodsl.subkernel_helper` | `func.func` | Frontend helper classification: `simd`, `cube`, or `simt`. |
+| `pto.tileop.helper` | `func.func` | Canonical tileop-style helper marker emitted for `@pto.tileop` and retained `@pto.simd` / `@pto.cube`. |
+| `pto.ptodsl.subkernel_helper` | `func.func` | Legacy helper role marker retained for compatibility with older/manual IR. |
diff --git a/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md b/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md
new file mode 100644
index 0000000000..49efb25f26
--- /dev/null
+++ b/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md
@@ -0,0 +1,193 @@
+
+# 定稿设计方案（final）
+
+## 1. 目标与用户模型
+
+`pto.tileop` 统一 custom subkernel 标识（取代 `pto.cube`/`pto.simd` 作 subkernel 标识职责；`pto.simt` 专属 launched SIMT）。建模为 tile-level IR 上以 tile/tensorview/scalar 为 IO、带 phase 摘要的命名 helper + `func.call`，让 `PTOInsertSync`/`PTOPlanMemory` 当一等公民。用户零参数，**摘要全由后端 `PTOInferTileOpSummaryPass` 推导；canonical helper marker 统一收敛到 `pto.tileop.helper` unit attr。**
+
+```python
+@pto.tileop
+def softmax(src_view: pto.TensorView, out_tile: pto.Tile, scratch_tile: pto.Tile,
+            rows: pto.i32, cols: pto.i32):
+    # caller 传入 scratch/out tile；body 不新建 tile buffer
+    pto.tload(src_view, scratch_tile)        # MTE
+    m = pto.vmax(scratch_tile)               # PIPE_V
+    e = pto.vexp(pto.vsub(m, scratch_tile))  # PIPE_V
+    s = pto.vsum(e)                          # PIPE_V
+    r = pto.vdiv(e, s)                       # PIPE_V
+    pto.tstore(out_tile, r)                  # MTE
+
+@pto.jit
+def kernel(out, x, scratch, rows, cols):
+    softmax(x, out, scratch, rows, cols)
+    softmax(x, out, scratch, rows, cols)     # 复用
+```
+
+约束（编译期强制）：
+- IO 只允许 Tile/TensorView/PartitionTensorView/PTO scalar。**输出 tile/tensorview 全走 output operand + operand_effects=write/readwrite；func.call results MVP 只允许 scalar。**
+- **helper 内禁止 `alloc_tile`/`reserve_buffer`/`TAlloc`/任何需 PlanMemory 为 callee-local 规划的 op；内部 tile buffer 必须来自 caller operand。内部 vreg/mask/scalar 临时可存但不跨边界。**
+- body 允许 tload/tstore、vector ops、scalar(PIPE_S) ops、cube ops、`pipe_barrier` 同步。
+- 不允许 host tensor/TensorSpec/vreg/mask/pipe_handle 跨边界；不允许 SIMT-only op。
+- **tileop 不能在另一 tileop/section 内调用**（避免 inline 后嵌套 section，`VPTOSplitCVModule:113-118` 禁嵌套 section）。
+- **负例：tileop 只有 MTE/S/sync、无 vector/cube 主计算证据时报错。**
+- MVP 单主计算域（vector **或** cube）+ 多辅助 pipe；reject cube+vector 混算。多 phase 是 correctness 必需。
+
+## 2. 关键后端事实（已核对，含 pipeline 实测）
+
+- `PTOInsertSync`/`PTOPlanMemory` 均 `func.walk` 全递归进 region。
+- InsertSync 现有两条 PTODSL subkernel 路径：legacy `simd/cube` 兼容路径仍按 helper role→单 pipe、memory operand **保守建模为 read+write**；`tileop` 路径已读 `primary_domain/phases/operand_effects`，按 **non-empty boundary-effect phase** 拆多 `CompoundInstanceElement`。
+- **`CompoundInstanceElement`（`SyncCommon.h:334-341`）单 `kPipeValue` + 单组 def/use**；空 def/use 节点合法但不贡献跨边界依赖。
+- **`classifyTileOpByPipe`（`PTONormalizeUncoveredTileSections.cpp:252-254`）把 MTE1 归 Cube**；`inferSegmentKind` 对混合段报错。
+- **`normalizeFunction`（:740-764）**：对不带 kernel_kind 且不带 tileop helper marker 的 func 会 `collectUncoveredTopLevelSegments`→`inferSegmentKind`，混合段 `emitSegmentInferenceError` 失败。`hasKnownKernelKindContext` 现已把 canonical `pto.tileop.helper`（并兼容 legacy `pto.ptodsl.subkernel_helper = "tileop"`）视为已知上下文并直接跳过 NormalizeUncovered。
+- **`VPTOSplitCVModule`**：`hasSectionKind`（:58-83）检查 func 含 `SectionCubeOp`/`SectionVectorOp`；不带 section 的 split candidate 被 `eraseSectionSplitCandidatesWithoutSectionKind`（:170-175）擦除；`:135` 要求"must contain section"；`:113-118` 禁嵌套 section。
+- **实测 pipeline 顺序（`ptoas.cpp:1780-1900`）**：
+  ```
+  preBackendPM: NormalizeUncoveredTileSections (1786)
+  main pm: ... → PTOInferTileOpSummaryPass → PTOMaterializeTileOpSectionsPass
+           → PTOVerifyTileOpContractPass → ... → ViewToMemref
+           → PlanMemory → ResolveReservedBuffers
+           → VerifySubkernelPipeContract → InsertSync → ... →
+           MaterializeTileHandles → InlineBackendHelpers
+  ```
+- MLIR attribute 不能引用 SSA value；PTO 无 ValueAttr 机制；custom attr 需在 `PTOAttrs.td` 注册（`PTO_Attr` 基类 :36）。
+- `alloc_tile`（`PTOOps.td:318`）、`reserve_buffer`（:1792）、`TAllocOp`（:2240，PlanMemory:478 处理）真实存在。
+
+## 3. IR 载体：方案 B — named helper + `func.call` + 后端推导的 phase 摘要
+
+复用 `func.call` + 命名 helper + callee 解析。**不复用 `kernel_kind`**，用后端推导的 `pto.tileop.primary_domain`。**不预套 section**，改 verifier + NormalizeUncovered 接受 tileop 裸 body。**前端只标 `pto.tileop.helper`，摘要全后端生成。**
+
+```
+// 前端 trace 后：helper 只带统一 marker
+func.func @softmax(%src: !pto.tensorview<...>, %out: !pto.tile<...>, %scratch: !pto.tile<...>,
+                   %rows: i32, %cols: i32)
+    { pto.tileop.helper } {
+  pto.tload %src, %scratch
+  %m = pto.vmax %scratch
+  ...
+  pto.tstore %out, %r
+  return
+}
+
+// PTOInferTileOpSummaryPass 后：补全摘要（真 MLIR attr 结构）
+func.func @softmax(%src, %out, %scratch, %rows, %cols)
+    { pto.tileop.helper,
+      pto.tileop.primary_domain = #pto.kernel_kind<vector>,
+      pto.tileop.phases = #array<#dict<{
+        pipe = #pto.pipe<MTE1>, operand_uses = [0], operand_defs = [2], result_defs = []
+      }>, #dict<{
+        pipe = #pto.pipe<V>,    operand_uses = [2], operand_defs = [1], result_defs = []
+      }>, #dict<{
+        pipe = #pto.pipe<MTE1>, operand_uses = [1], operand_defs = [1], result_defs = []
+      }>>,
+      pto.tileop.operand_effects = ["read", "readwrite", "readwrite", "read", "read"]
+    } {
+  ...
+}
+
+func.func @kernel(%out, %x, %scratch, %rows, %cols) {
+  func.call @softmax(%x, %out, %scratch, %rows, %cols) : (...) -> ()
+  func.call @softmax(%x, %out, %scratch, %rows, %cols) : (...) -> ()   // 复用
+}
+```
+
+### phase attr schema（真 MLIR 结构，需在 PTOAttrs.td 注册）
+
+- `pto.tileop.phases`: `ArrayAttr<DictionaryAttr>`，每 phase dict：
+  - `pipe`: 复用现有 pipe 整数枚举（`PTOAttrs.td:213-227`）或新 `PipeAttr`（需注册），按 op `getPipe()` 推。
+  - `operand_uses`: 整数 `ArrayAttr`（operand index 指向函数所有 operands；**InsertSync 只消费 memory-like operand**，scalar 可在 summary 供验证或忽略、不参与建图；可空）。
+  - `operand_defs`: 整数 `ArrayAttr`（同上，可空）。
+  - `result_defs`: 整数 `ArrayAttr`（**MVP 固定空或仅 scalar result，不参与 memory sync**；复杂语义后置）。
+- **effects 可空**：纯内部 phase 可全空，保留在 phases 用于校验/主域推导，**InsertSync 跳过不建 `CompoundInstanceElement`**。有 boundary effect 的 phase 才建节点。是否标 use/def 是 **policy 非 IR 不变量**。
+- `pto.tileop.operand_effects`：从 phases 非空 effects 派生（union use→read、def→write），unknown→readwrite；scalar 标 read 但不建图。（注：当前实现会把无 boundary effect 的 operand 物化/校验为 `"read"`，与此处 `unknown→readwrite` 的目标写法尚未对齐，见 §7 第 2 条。）
+- `pto.tileop.primary_domain`：主计算域 vector/cube（借用枚举值，不挂 `kernel_kind` attr）。
+- **去掉 `pipe_footprint`**：body pipe set 由 `phases.pipe` 集合表达。
+
+### operand index 作用域（明确）
+
+- index 指向**函数所有 operands**（含 scalar）。
+- InsertSync 只消费 **memory-like operand**（tile/tensorview）的 use/def 建 `CompoundInstanceElement`。
+- scalar operand 可在 summary 供验证或忽略；**不参与 def/use 建图**。
+
+### 摘要属性职责划分
+
+| 职责 | 由谁承担 |
+|---|---|
+| body 出现过的 pipe 集合 | `phases.pipe` 集合（verifier 校验 body op getPipe() ∈ 此集） |
+| caller 跨边界 sync 建模 | 有 boundary effect 的 phase（memory-like operand use/def 非空），InsertSync 为其建 `CompoundInstanceElement` |
+| 主计算域 | `primary_domain` |
+| 每 operand 副作用 | `operand_effects`（从非空 phase effects 派生；scalar 标 read 但不建图） |
+
+> 不保留 `has_sync`：InsertSync 假设 helper 内部自管同步、caller 层只管跨边界。
+
+### 输出/results 边界（MVP 硬约束）
+
+- 输出 tile/tensorview 全走 output operand + operand_effects=write/readwrite。
+- **func.call results MVP 只允许 scalar**（alias handle 后置）。
+- **helper 内禁 `alloc_tile`/`reserve_buffer`/`TAlloc`**；内部 tile 必须来自 caller operand；内部 vreg/mask/scalar 临时不跨边界。
+
+## 4. MVP 边界
+
+- 单主计算域 + 多辅助 pipe；reject cube+vector 混算。
+- **多 phase correctness 必需**，pipe 按 `getPipe()` 推；effects 可空（非空才建 sync 节点），softmax V phase 标 use/def 是 policy。
+- MTE1/2/3/4、PIPE_S 归 phase、不参与 primary_domain 判定（tileop 专用规则，不改全局 `classifyTileOpByPipe`）。
+- 禁 helper-local tile allocation。
+- SIMT-only op 排除。**tileop 禁嵌套调用**。
+- **负例：tileop 只有 MTE/S/sync 无主计算证据报错。**
+
+## 5. 改动点
+
+### 前端（`_subkernels.py` + `_tracing/session.py`）
+
+1. `_create_subkernel_section_op`：tileop **不预套 section**（目标设计；inline `with pto.tileop()` 路径当前仍会先包 `SectionVectorOp`，见 §7 第 1 条）。
+2. helper 只附 `pto.tileop.helper`；**不写 primary_domain/phases/operand_effects**。
+3. helper 函数类型：输出全走 operand；results 只用于 scalar。
+4. 前端 public boundary 契约：保留 vreg/mask 不外逃；results 限 scalar；**禁 tileop 嵌套调用**。helper body 内 `alloc_tile/reserve_buffer/TAlloc` 等 helper-local 资源分配由后端 `PTOVerifyTileOpContractPass` 兜底拒绝。
+5. 装饰器无 `kind` 参数；`@pto.cube`/`@pto.simd` 保留为 `@pto.tileop` 的别名（IR 层统一为 "tileop" helper role）。
+
+### 后端
+
+1. **verifier 改造**：tile op verifier 把带 `pto.tileop.helper` 的 func 当合法上下文；results 限 scalar；**拒绝 alloc_tile/reserve_buffer/TAlloc**；内部 vreg/mask/scalar 临时不跨边界。
+2. **`PTONormalizeUncoveredTileSections` 跳过 tileop**（P0）：`normalizeFunction`/`hasKnownKernelKindContext` 增条件——带 `pto.tileop.helper` 的 func 跳过，避免 preBackendPM:1786 扫到裸 body 混合段报错。
+3. **新增 `PTOInferTileOpSummaryPass`**：扫 helper body 推导 primary_domain + phases（pipe 按 `getPipe()`，effects 可空，operand index 指向所有 operand，memory-like 才建图）+ operand_effects（从非空 effects 派生）。tileop 专用 MTE/S 规则，不改全局 `classifyTileOpByPipe`。
+4. **新增 materialize pass**：按 primary_domain+phases 物化 `SectionCubeOp`/`SectionVectorOp`（只包 cube/vector 主段），MTE/S/sync 保持 top-level。**lit case 覆盖两类**：MTE+section.vector+MTE、MTE+section.cube+MTE，验证 `VPTOSplitCVModule`/EmitC/VPTO 接受（注意 `hasSectionKind`:58-83 要求 func 含 section，`eraseSectionSplitCandidatesWithoutSectionKind`:170-175 会擦除无 section 的 candidate）。
+5. **`UpdatePTODSLSubkernelCallInfo` 改造**：读 primary_domain+phases；按**有 boundary effect 的 phase**拆多 `CompoundInstanceElement`（空 effect phase 跳过，scalar operand 不建图）；memory operand 副作用从保守全 R+W 改读 operand_effects；支持 callsite scalar results 进依赖图。
+6. **新增 `PTOVerifyTileOpContractPass`**：校验 body op `getPipe()` ∈ phases pipe 集合、主域 pipe 与 primary_domain 一致、operand_effects == 非空 phases 派生、SIMT-only op 排除、cube+vector 混算 reject、results 限 scalar、tileop 无嵌套调用、**拒绝 alloc_tile/reserve_buffer/TAlloc**、**负例（只有 MTE/S/sync 无主计算证据报错）**。旧 `PTOVerifySubkernelPipeContractPass` 保留兼容 cube/simd。
+7. 主 pipeline 不再依赖 `PTOWrapFunctionsInSectionsPass` 为 tileop helper 自动套单段；tileop section 形成以 `PTOMaterializeTileOpSectionsPass` 为准。
+8. **`PTOInlineBackendHelpers`**：保证不丢围绕 call 的 sync ops；tileop 禁嵌套调用从源头避免 inline 后嵌套 section。
+
+### pass 顺序（实测修正，P0）
+
+```
+前端 trace → verifier(tileop 裸 body)
+→ [preBackendPM] NormalizeUncoveredTileSections (跳过 `pto.tileop.helper`)   ← P0 必须跳过
+→ [main pm] PTOInferTileOpSummaryPass
+→ PTOMaterializeTileOpSectionsPass
+→ PTOVerifyTileOpContractPass
+→ ... → ViewToMemref → PlanMemory → ResolveReservedBuffers →
+   VerifySubkernelPipeContract → InsertSync → ... →
+   MaterializeTileHandles → InlineBackendHelpers
+```
+
+### 可选后置（非 MVP）
+
+- helper-local tile allocation 的 callsite clone/inline（放开 result 返回 tile / alloc_tile 等）。
+- alias handle result（放开 result 非 scalar / result_defs 复杂语义）。
+- phases def/use 细到 UB 子区域。
+- 内联 opt pass。
+
+## 6. 当前落地状态
+
+已落地并与本文主设计一致的部分：
+
+1. `NormalizeUncoveredTileSections` 已把 tileop helper marker 视为已知上下文并跳过预归一化。
+2. `PTOInferTileOpSummaryPass`、`PTOMaterializeTileOpSectionsPass`、`PTOVerifyTileOpContractPass` 已接入主 pipeline，且都位于 `PlanMemory` 之前。
+3. `UpdatePTODSLSubkernelCallInfo` 已能消费 tileop 摘要，按 phase 建模跨 helper 边界的 InsertSync 依赖；legacy `simd/cube` 兼容路径仍保留保守单-pipe 建模。
+4. tileop helper ABI 已收敛为 Tile/TensorView/PartitionTensorView/PTO scalar；`ptr` 仍为 SIMT-only。
+5. `@pto.tileop` / retained `@pto.simd` / `@pto.cube` 在 IR 层语义上已统一到 tileop helper role，并使用 canonical `pto.tileop.helper` marker；后端仍兼容 legacy `pto.ptodsl.subkernel_helper = "tileop"`。
+
+## 7. 仍待单独收敛的差异
+
+1. **inline `with pto.tileop()` 仍有前端预套 section 的实现残留。**
+   目标设计仍是 tileop helper 不预套 section、统一交后端 materialize；当前 inline 路径仍会先包 `SectionVectorOp`。
+2. **`pto.tileop.operand_effects` 的“无显式 boundary effect 时默认值”尚未与本文最终写法重新对齐。**
+   本文目标写法仍按 `unknown→readwrite` 记录；当前实现会把无 boundary effect 的 operand 物化/校验为 `"read"`。这一点需要单独决策后再统一设计与实现。
diff --git a/include/PTO/IR/PTO.h b/include/PTO/IR/PTO.h
index e858212835..a4d76a0fa9 100644
--- a/include/PTO/IR/PTO.h
+++ b/include/PTO/IR/PTO.h
@@ -190,9 +190,36 @@ inline constexpr llvm::StringLiteral kPTOSimtMaxRegistersAttrName =
 inline constexpr llvm::StringLiteral kPTOVisibilityAttrName = "pto.visibility";
 inline constexpr llvm::StringLiteral kPTOVisibilityInternalValue = "internal";
 inline constexpr llvm::StringLiteral kPTOVisibilityExternalValue = "external";
+inline constexpr llvm::StringLiteral kPTODSLSubkernelHelperAttrName =
+    "pto.ptodsl.subkernel_helper";
+inline constexpr llvm::StringLiteral kPTOTileOpHelperAttrName =
+    "pto.tileop.helper";
 inline constexpr llvm::StringLiteral kPTODSLLogicalNameAttrName =
     "pto.ptodsl.logical_name";
 
+/// Return the logical PTODSL helper role when present.
+///
+/// Canonical tileop helpers use the unit attr `pto.tileop.helper`. Legacy
+/// helper roles still use `pto.ptodsl.subkernel_helper = "<role>"`.
+inline StringRef getPTODSLSubkernelHelperRole(::mlir::func::FuncOp func) {
+  if (!func)
+    return {};
+  if (func->hasAttrOfType<UnitAttr>(kPTOTileOpHelperAttrName))
+    return "tileop";
+  if (auto attr =
+          func->getAttrOfType<StringAttr>(kPTODSLSubkernelHelperAttrName))
+    return attr.getValue();
+  return {};
+}
+
+inline bool hasPTODSLSubkernelHelperMarker(::mlir::func::FuncOp func) {
+  return !getPTODSLSubkernelHelperRole(func).empty();
+}
+
+inline bool isPTODSLTileOpHelper(::mlir::func::FuncOp func) {
+  return getPTODSLSubkernelHelperRole(func) == "tileop";
+}
+
 /// Return the PTODSL logical function name when present, otherwise fall back to
 /// the current symbol name. PTODSL uses this to mark ABI-specialized helper and
 /// kernel-module symbols without relying on symbol-name parsing.
diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h
index 85970756c5..8fb61c3080 100644
--- a/include/PTO/Transforms/Passes.h
+++ b/include/PTO/Transforms/Passes.h
@@ -40,6 +40,9 @@ std::unique_ptr<Pass> createPTOInferValidatePipeInitPass();
 std::unique_ptr<Pass> createPTOResolveReservedBuffersPass();
 std::unique_ptr<Pass> createPTOWrapFunctionsInSectionsPass();
 std::unique_ptr<Pass> createPTONormalizeUncoveredTileSectionsPass();
+std::unique_ptr<Pass> createPTOInferTileOpSummaryPass();
+std::unique_ptr<Pass> createPTOMaterializeTileOpSectionsPass();
+std::unique_ptr<Pass> createPTOVerifyTileOpContractPass();
 std::unique_ptr<Pass> createVPTOSplitCVModulePass();
 std::unique_ptr<Pass> createVPTONormalizeContainerPass();
 std::unique_ptr<Pass> createPTOVerifyTFreePass();
diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td
index bcc165674a..17c158fb76 100644
--- a/include/PTO/Transforms/Passes.td
+++ b/include/PTO/Transforms/Passes.td
@@ -312,6 +312,80 @@ def PTONormalizeUncoveredTileSections
   ];
 }
 
+def PTOInferTileOpSummary
+    : Pass<"pto-infer-tileop-summary", "func::FuncOp"> {
+  let summary = "Infer phase summary attributes for PTODSL tileop helpers";
+  let description = [{
+    Scans functions marked with canonical `pto.tileop.helper` (while still
+    accepting legacy `pto.ptodsl.subkernel_helper = "tileop"`) and
+    derives the backend-owned summary attributes:
+    - `pto.tileop.primary_domain`
+    - `pto.tileop.phases`
+    - `pto.tileop.operand_effects`
+
+    This pass is intentionally summary-only. It does not materialize sections,
+    inline helpers, or change InsertSync modeling.
+  }];
+
+  let constructor = "mlir::pto::createPTOInferTileOpSummaryPass()";
+
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+    "mlir::pto::PTODialect"
+  ];
+}
+
+def PTOMaterializeTileOpSections
+    : Pass<"pto-materialize-tileop-sections", "func::FuncOp"> {
+  let summary = "Materialize one primary PTO section for PTODSL tileop helpers";
+  let description = [{
+    Consumes backend-owned tileop summary attributes on functions marked with
+    canonical `pto.tileop.helper` (while still accepting legacy
+    `pto.ptodsl.subkernel_helper = "tileop"`) and wraps one contiguous
+    primary-domain compute span in `pto.section.vector` or `pto.section.cube`.
+
+    The MVP implementation expects one contiguous primary compute span in the
+    helper body. Leading and trailing MTE phases remain top-level so late
+    helper inlining can expose VPTO section sugar to `vpto-split-cv-module`.
+  }];
+
+  let constructor = "mlir::pto::createPTOMaterializeTileOpSectionsPass()";
+
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+    "mlir::pto::PTODialect"
+  ];
+}
+
+def PTOVerifyTileOpContract
+    : Pass<"pto-verify-tileop-contract", "func::FuncOp"> {
+  let summary = "Verify backend-owned PTODSL tileop helper contracts";
+  let description = [{
+    Verifies functions marked with canonical `pto.tileop.helper` (while still
+    accepting legacy `pto.ptodsl.subkernel_helper = "tileop"`) after summary
+    inference and section materialization.
+
+    Current MVP contract:
+    - results must be PTO scalar values only
+    - helper-local `pto.alloc_tile`, `pto.reserve_buffer`, and `pto.talloc`
+      are rejected
+    - nested tileop helper calls are rejected
+    - SIMT-only PTO ops are rejected
+    - at least one primary compute op must exist, and all primary compute ops
+      must belong to exactly one domain (`vector` or `cube`)
+    - scalar/MTE/sync phases may coexist, but `pto.tileop.primary_domain`,
+      `pto.tileop.phases`, and `pto.tileop.operand_effects` must remain
+      consistent with the helper body
+  }];
+
+  let constructor = "mlir::pto::createPTOVerifyTileOpContractPass()";
+
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+    "mlir::pto::PTODialect"
+  ];
+}
+
 def VPTOSplitCVModule : Pass<"vpto-split-cv-module", "ModuleOp"> {
   let summary = "Split a VPTO module with cube/vector sections into kernel modules";
   let description = [{
@@ -558,7 +632,8 @@ def PTOInlineBackendHelpers
   let description = [{
     Force-inlines backend helper functions that should not survive to backend-
     specific lowering:
-    - PTODSL subkernel helpers marked with `pto.ptodsl.subkernel_helper`
+    - PTODSL subkernel helpers marked with `pto.tileop.helper` or legacy
+      `pto.ptodsl.subkernel_helper`
 
     This pass runs on the shared mainline before backend-specific pipelines so
     both VPTO and EmitC consume the same helper-inlined IR.
@@ -594,10 +669,10 @@ def PTOVerifySubkernelPipeContract
     : Pass<"pto-verify-subkernel-pipe-contract", "func::FuncOp"> {
   let summary = "Verify PTODSL subkernel helpers stay within one InsertSync pipe contract";
   let description = [{
-    Verifies PTODSL subkernel helpers marked with `pto.ptodsl.subkernel_helper`
-    and their same-module local call closure stay within one role-consistent PTO
-    compute contract before InsertSync models the call boundary as one compound
-    node.
+    Verifies legacy PTODSL subkernel helpers marked with
+    `pto.ptodsl.subkernel_helper` and their same-module local call closure stay
+    within one role-consistent PTO compute contract before InsertSync models
+    the call boundary as one compound node.
 
     Current contract:
     - `simd` helpers may only contain PTO ops on `PIPE_V`
diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp
index aa3d20071f..8f28b00190 100644
--- a/lib/PTO/IR/PTO.cpp
+++ b/lib/PTO/IR/PTO.cpp
@@ -13631,9 +13631,15 @@ getEnclosingFunctionKernelKind(Operation *op) {
   return kernelKindAttr.getKernelKind();
 }
 
+static bool isInsideTileOpSubkernelHelper(Operation *op) {
+  auto funcOp = op->getParentOfType<func::FuncOp>();
+  return pto::isPTODSLTileOpHelper(funcOp);
+}
+
 static bool isInsideSectionOrAttributedKernel(Operation *op) {
   return isInsideSectionCube(op) || isInsideSectionVector(op) ||
-         getEnclosingFunctionKernelKind(op).has_value();
+         getEnclosingFunctionKernelKind(op).has_value() ||
+         isInsideTileOpSubkernelHelper(op);
 }
 
 static LogicalResult verifySplitAttr(Operation *op, int64_t split) {
@@ -14923,7 +14929,7 @@ LogicalResult InitializeL2LPipeOp::verify() {
 
 LogicalResult TPushOp::verify() {
   if (!isInsideSectionOrAttributedKernel(getOperation()))
-    return emitOpError("must be inside pto.section.cube/vector or a kernel_kind function");
+    return emitOpError("must be inside pto.section.cube/vector, a kernel_kind function, or a tileop subkernel helper");
   if (failed(verifyPipeHandleProducer(getOperation(), getPipeHandle())))
     return failure();
   if (failed(verifySplitAttr(getOperation(), getSplit())))
@@ -14939,7 +14945,7 @@ LogicalResult TPushOp::verify() {
 
 LogicalResult TAllocOp::verify() {
   if (!isInsideSectionOrAttributedKernel(getOperation()))
-    return emitOpError("must be inside pto.section.cube/vector or a kernel_kind function");
+    return emitOpError("must be inside pto.section.cube/vector, a kernel_kind function, or a tileop subkernel helper");
   if (failed(verifyPipeHandleProducer(getOperation(), getPipeHandle())))
     return failure();
   if (failed(verifyTensorEntryMatchesInternalPipeInit(
@@ -14950,7 +14956,7 @@ LogicalResult TAllocOp::verify() {
 
 LogicalResult TPopOp::verify() {
   if (!isInsideSectionOrAttributedKernel(getOperation()))
-    return emitOpError("must be inside pto.section.cube/vector or a kernel_kind function");
+    return emitOpError("must be inside pto.section.cube/vector, a kernel_kind function, or a tileop subkernel helper");
   if (failed(verifyPipeHandleProducer(getOperation(), getPipeHandle())))
     return failure();
   if (failed(verifySplitAttr(getOperation(), getSplit())))
@@ -14967,7 +14973,7 @@ LogicalResult TPopOp::verify() {
 
 LogicalResult TFreeOp::verify() {
   if (!isInsideSectionOrAttributedKernel(getOperation()))
-    return emitOpError("must be inside pto.section.cube/vector or a kernel_kind function");
+    return emitOpError("must be inside pto.section.cube/vector, a kernel_kind function, or a tileop subkernel helper");
   if (failed(verifyPipeHandleProducer(getOperation(), getPipeHandle())))
     return failure();
   if (getEntry() &&
diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt
index a7059674df..7065aeb074 100644
--- a/lib/PTO/Transforms/CMakeLists.txt
+++ b/lib/PTO/Transforms/CMakeLists.txt
@@ -66,6 +66,9 @@ add_mlir_dialect_library(PTOTransforms
   PTOResolveReservedBuffersPass.cpp
   PTOWrapFunctionsInSectionsPass.cpp
   PTONormalizeUncoveredTileSections.cpp
+  PTOInferTileOpSummaryPass.cpp
+  PTOMaterializeTileOpSectionsPass.cpp
+  PTOVerifyTileOpContractPass.cpp
   VPTONormalizeContainer.cpp
   VPTOSplitCVModule.cpp
   InsertSync/PTOIRTranslator.cpp
diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncDebug.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncDebug.cpp
index a01a50e1bc..72586626d5 100644
--- a/lib/PTO/Transforms/InsertSync/InsertSyncDebug.cpp
+++ b/lib/PTO/Transforms/InsertSync/InsertSyncDebug.cpp
@@ -14,6 +14,7 @@
 #include "mlir/IR/AsmState.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
+#include <mutex>
 
 using namespace mlir;
 using namespace mlir::pto;
@@ -28,6 +29,8 @@ llvm::cl::opt<unsigned> insertSyncDebugLevelOpt(
                    "0=off, 1=phase, 2=syncir, 3=trace"),
     llvm::cl::init(0));
 
+std::mutex insertSyncDebugDumpMutex;
+
 } // namespace
 
 unsigned mlir::pto::getInsertSyncDebugLevel() { return insertSyncDebugLevelOpt; }
@@ -321,16 +324,22 @@ void mlir::pto::dumpInsertSyncPhase(llvm::StringRef phase, const SyncIRs &syncIR
     }
   }
 
-  os << "\n// === [PTOInsertSync Debug] " << phase << " === //\n";
-  os << llvm::formatv("// nodes={0}, syncGroups={1}, activeOps={2} "
-                      "(set={3}, wait={4}, barrier={5}, blockSet={6}, "
-                      "blockWait={7}, blockAll={8})\n",
-                      syncIR.size(), syncOperations.size(), activeOps, setCnt,
-                      waitCnt, barrierCnt, blockSetCnt, blockWaitCnt,
-                      blockAllCnt);
+  std::string buffer;
+  llvm::raw_string_ostream bufferedOS(buffer);
+
+  bufferedOS << "\n// === [PTOInsertSync Debug] " << phase << " === //\n";
+  bufferedOS << llvm::formatv("// nodes={0}, syncGroups={1}, activeOps={2} "
+                              "(set={3}, wait={4}, barrier={5}, blockSet={6}, "
+                              "blockWait={7}, blockAll={8})\n",
+                              syncIR.size(), syncOperations.size(), activeOps,
+                              setCnt, waitCnt, barrierCnt, blockSetCnt,
+                              blockWaitCnt, blockAllCnt);
 
   if (level < static_cast<unsigned>(InsertSyncDebugLevel::SyncIR)) {
-    os << "// ========================================= //\n";
+    bufferedOS << "// ========================================= //\n";
+    bufferedOS.flush();
+    std::lock_guard<std::mutex> lock(insertSyncDebugDumpMutex);
+    os << buffer;
     return;
   }
 
@@ -340,6 +349,10 @@ void mlir::pto::dumpInsertSyncPhase(llvm::StringRef phase, const SyncIRs &syncIR
   options.showMemInfo = showMemInfo;
   options.showUselessSync = showMemInfo;
 
-  dumpSyncIR(os, syncIR, opForPrinting, options, showMemInfo);
-  os << "// ========================================= //\n";
+  dumpSyncIR(bufferedOS, syncIR, opForPrinting, options, showMemInfo);
+  bufferedOS << "// ========================================= //\n";
+  bufferedOS.flush();
+
+  std::lock_guard<std::mutex> lock(insertSyncDebugDumpMutex);
+  os << buffer;
 }
diff --git a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp
index d190885ec6..87540f9ab9 100644
--- a/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp
+++ b/lib/PTO/Transforms/InsertSync/PTOIRTranslator.cpp
@@ -35,6 +35,12 @@ using namespace mlir::pto;
 
 namespace {
 
+static constexpr llvm::StringLiteral kTileOpPrimaryDomainAttr =
+    "pto.tileop.primary_domain"; // Callee attr, read as FunctionKernelKindAttr.
+static constexpr llvm::StringLiteral kTileOpPhasesAttr = "pto.tileop.phases"; // Callee attr, read as an array of phase dictionaries.
+static constexpr llvm::StringLiteral kTileOpOperandEffectsAttr =
+    "pto.tileop.operand_effects"; // Callee attr, read as an array sized like the call operands.
+
 constexpr size_t kTileRank2D = 2;
 constexpr unsigned kStrideInlineCapacity = 4;
 constexpr unsigned kMemoryEffectInlineCapacity = 4;
@@ -209,30 +215,104 @@ static func::FuncOp lookupPTODSLSubkernelHelper(func::CallOp callOp) {
   auto callee = module.lookupSymbol<func::FuncOp>(callOp.getCallee());
   if (!callee)
     return {};
-  if (!callee->hasAttr("pto.ptodsl.subkernel_helper"))
+  if (!pto::hasPTODSLSubkernelHelperMarker(callee))
     return {};
   return callee;
 }
 
+static StringRef getResolvedPTODSLSubkernelHelperRole(func::FuncOp callee) {
+  return pto::getPTODSLSubkernelHelperRole(callee); // Forwards to the shared PTO role query; callers treat an empty result as "no role".
+}
+
 static std::optional<pto::PipelineType>
 getPTODSLSubkernelHelperPipe(func::FuncOp callee) {
-  auto roleAttr =
-      callee->getAttrOfType<mlir::StringAttr>("pto.ptodsl.subkernel_helper");
-  if (!roleAttr)
+  StringRef role = getResolvedPTODSLSubkernelHelperRole(callee);
+  if (role.empty())
     return std::nullopt;
 
   return llvm::StringSwitch<std::optional<pto::PipelineType>>(
-             roleAttr.getValue())
+             role)
       .Case("cube", pto::PipelineType::PIPE_M)
       .Case("simd", pto::PipelineType::PIPE_V)
       .Default(std::nullopt);
 }
 
+static bool isTileOpSubkernelHelper(func::FuncOp callee) {
+  return pto::isPTODSLTileOpHelper(callee); // Local alias for the shared PTO tileop-helper predicate.
+}
+
 static bool isPTODSLSubkernelMemoryOperand(Type type) {
   return isa<MemRefType, pto::PtrType, pto::TileBufType, pto::TensorViewType,
              pto::PartitionTensorViewType>(type);
 }
 
+// Resolves each integer index in `operandIndices` to the matching operand of
+// `callOp` and appends the memory-like ones to `values`. Returns false when an
+// entry is not an IntegerAttr or lies outside the call's operand list.
+static bool collectPTODSLTileOpCallOperands(func::CallOp callOp,
+                                            ArrayAttr operandIndices,
+                                            SmallVectorImpl<Value> &values) {
+  const int64_t numOperands = static_cast<int64_t>(callOp.getNumOperands());
+  for (Attribute entry : operandIndices) {
+    auto intAttr = dyn_cast<IntegerAttr>(entry);
+    if (!intAttr)
+      return false;
+    const int64_t position = intAttr.getInt();
+    if (position < 0 || position >= numOperands)
+      return false;
+    Value candidate = callOp.getOperand(static_cast<unsigned>(position));
+    if (isPTODSLSubkernelMemoryOperand(candidate.getType()))
+      values.push_back(candidate);
+  }
+  return true;
+}
+
+// Rebuilds the per-phase sync description of a tileop helper call from the
+// summary attributes on `callee`. Returns false when a summary attribute is
+// missing or malformed; phases without any collected operand are dropped.
+static bool getPTODSLTileOpCallPhases(func::CallOp callOp, func::FuncOp callee,
+                                      SmallVectorImpl<SyncMacroPhase> &phases) {
+  // All three summary attributes must be present for the summary to be usable.
+  if (!callee->getAttrOfType<FunctionKernelKindAttr>(kTileOpPrimaryDomainAttr))
+    return false;
+  auto phaseList = callee->getAttrOfType<ArrayAttr>(kTileOpPhasesAttr);
+  auto effectList = callee->getAttrOfType<ArrayAttr>(kTileOpOperandEffectsAttr);
+  if (!phaseList || !effectList)
+    return false;
+  // Require exactly one effect entry per call operand.
+  if (effectList.size() != callOp.getNumOperands())
+    return false;
+
+  unsigned nextPhaseId = 0;
+  for (Attribute entry : phaseList) {
+    auto dict = dyn_cast<DictionaryAttr>(entry);
+    if (!dict)
+      return false;
+    auto pipe = dyn_cast_or_null<PipeAttr>(dict.get("pipe"));
+    auto uses = dyn_cast_or_null<ArrayAttr>(dict.get("operand_uses"));
+    auto defs = dyn_cast_or_null<ArrayAttr>(dict.get("operand_defs"));
+    auto results = dyn_cast_or_null<ArrayAttr>(dict.get("result_defs"));
+    if (!pipe || !uses || !defs || !results)
+      return false;
+
+    SyncMacroPhase phase;
+    phase.pipe = static_cast<PipelineType>(pipe.getPipe());
+    if (!collectPTODSLTileOpCallOperands(callOp, uses, phase.useValues))
+      return false;
+    if (!collectPTODSLTileOpCallOperands(callOp, defs, phase.defValues))
+      return false;
+
+    // Skip phases for which no use or def value was collected.
+    if (phase.useValues.empty() && phase.defValues.empty())
+      continue;
+    // Ids are handed out only to kept phases, so they stay consecutive.
+    phase.phaseId = nextPhaseId++;
+    phases.push_back(std::move(phase));
+  }
+
+  return true;
+}
+
 static pto::TCoreType getPTODSLSubkernelHelperCoreType(
     pto::PipelineType pipe) {
   return pipe == pto::PipelineType::PIPE_M ? pto::TCoreType::CUBE
@@ -698,6 +778,17 @@ void PTOIRTranslator::UpdatePTODSLSubkernelCallInfo(func::CallOp callOp) {
   if (!callee)
     return;
 
+  if (isTileOpSubkernelHelper(callee)) {
+    SmallVector<SyncMacroPhase, 8> phases;
+    if (!getPTODSLTileOpCallPhases(callOp, callee, phases))
+      return;
+    for (const auto &phase : phases) {
+      MakeMacroCompound(callOp, phase.pipe, ValueRange(phase.defValues),
+                        ValueRange(phase.useValues), phase.phaseId);
+    }
+    return;
+  }
+
   std::optional<pto::PipelineType> pipe = getPTODSLSubkernelHelperPipe(callee);
   if (!pipe || *pipe == pto::PipelineType::PIPE_UNASSIGNED)
     return;
diff --git a/lib/PTO/Transforms/PTOInferTileOpSummaryPass.cpp b/lib/PTO/Transforms/PTOInferTileOpSummaryPass.cpp
new file mode 100644
index 0000000000..6c3bab2d57
--- /dev/null
+++ b/lib/PTO/Transforms/PTOInferTileOpSummaryPass.cpp
@@ -0,0 +1,439 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include "PTO/IR/PTO.h"
+#include "PTO/Transforms/Passes.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+
+namespace mlir {
+namespace pto {
+namespace func = ::mlir::func;
+#define GEN_PASS_DEF_PTOINFERTILEOPSUMMARY
+#include "PTO/Transforms/Passes.h.inc"
+} // namespace pto
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::pto;
+
+namespace {
+
+static constexpr llvm::StringLiteral kTileOpPrimaryDomainAttr =
+    "pto.tileop.primary_domain"; // Set by this pass as a FunctionKernelKindAttr.
+static constexpr llvm::StringLiteral kTileOpPhasesAttr = "pto.tileop.phases"; // Set by this pass as an array of per-phase dictionaries.
+static constexpr llvm::StringLiteral kTileOpOperandEffectsAttr =
+    "pto.tileop.operand_effects"; // Set by this pass as an array of effect strings, one per function argument.
+
+enum class BoundaryEffect : uint8_t { // Summarized memory effect of a helper on one of its arguments.
+  None,      // No effect recorded.
+  Read,      // Only read effects recorded.
+  Write,     // Only write-like effects recorded (Write, Allocate and Free are folded together).
+  ReadWrite, // Both read and write-like effects recorded.
+};
+
+struct TileOpPhaseSummary { // One run of consecutive body ops that share a pipe.
+  PIPE pipe = PIPE::PIPE_UNASSIGNED; // Pipe shared by the ops in this phase.
+  llvm::SmallSet<int64_t, 8> operandUses; // Argument indices read in this phase.
+  llvm::SmallSet<int64_t, 8> operandDefs; // Argument indices written (or allocated/freed) in this phase.
+};
+
+// Pre-order walk: calls `callback` on each non-terminator op, then recurses.
+template <typename CallbackT>
+static void walkTileOpBodyInSourceOrder(Block &block, CallbackT &&callback) {
+  for (Operation &current : block)
+    if (!current.hasTrait<OpTrait::IsTerminator>()) {
+      callback(&current);
+      for (Region &nestedRegion : current.getRegions())
+        for (Block &inner : nestedRegion)
+          walkTileOpBodyInSourceOrder(inner, callback);
+    }
+}
+
+static bool isTileOpSubkernelHelper(func::FuncOp funcOp) {
+  return pto::isPTODSLTileOpHelper(funcOp); // Local alias for the shared PTO tileop-helper predicate.
+}
+
+static bool isMemoryLikeBoundaryType(Type type) { // True for the argument types whose effects are summarized.
+  return isa<TileBufType, TensorViewType, PartitionTensorViewType, PtrType,
+             MemRefType>(type);
+}
+
+// True for non-null, non-return ops that belong to the PTO dialect.
+static bool isTileOpBodyOp(Operation *op) {
+  if (!op || isa<func::ReturnOp>(op))
+    return false;
+  StringRef dialect = op->getName().getDialectNamespace();
+  return dialect == PTODialect::getDialectNamespace();
+}
+
+static std::optional<PIPE> getTileOpBodyPipe(Operation *op) { // Returns the pipe attributed to `op`, or nullopt when none is known.
+  if (!isTileOpBodyOp(op))
+    return std::nullopt;
+
+  if (auto pipeOp = dyn_cast<OpPipeInterface>(op)) { // Prefer the interface-reported pipe when it is assigned.
+    PIPE pipe = pipeOp.getPipe();
+    if (pipe != PIPE::PIPE_UNASSIGNED)
+      return pipe;
+  }
+
+  if (isa<VecScopeOp, StrictVecScopeOp>(op)) // Vec-scope ops are never given a pipe by the name fallback below.
+    return std::nullopt;
+
+  StringRef name = op->getName().getStringRef(); // Fallback: classify by operation name.
+  if (name.starts_with("pto.v"))
+    return PIPE::PIPE_V;
+  if (name.starts_with("pto.mad"))
+    return PIPE::PIPE_M;
+  if (name == "pto.plt_b8" || name == "pto.plt_b16" ||
+      name == "pto.plt_b32" || name == "pto.pltm_b8" ||
+      name == "pto.pltm_b16" || name == "pto.pltm_b32" ||
+      name == "pto.load" || name == "pto.store" || name == "pto.ldg" ||
+      name == "pto.stg")
+    return PIPE::PIPE_S;
+  if (name == "pto.copy_gm_to_ubuf" || name == "pto.mte_gm_ub" ||
+      name == "pto.mte_gm_l1" || name == "pto.mte_gm_l1_frac")
+    return PIPE::PIPE_MTE2;
+  if (name == "pto.mte_ub_gm" || name == "pto.mte_l0c_gm")
+    return PIPE::PIPE_MTE3;
+  if (name == "pto.mte_l1_l0a" || name == "pto.mte_l1_l0b" ||
+      name == "pto.mte_l1_l0a_mx" || name == "pto.mte_l1_l0b_mx")
+    return PIPE::PIPE_MTE1;
+  return std::nullopt; // Unrecognized PTO op: no pipe.
+}
+
+static bool isMainVectorPipe(PIPE pipe) { // PIPE_V and PIPE_V2 count as vector compute pipes.
+  return pipe == PIPE::PIPE_V || pipe == PIPE::PIPE_V2;
+}
+
+static bool isMainCubePipe(PIPE pipe) { // Only PIPE_M counts as the cube compute pipe.
+  return pipe == PIPE::PIPE_M;
+}
+
+static FunctionKernelKind getDomainForPipe(PIPE pipe) { // Maps PIPE_M to Cube and every other pipe to Vector.
+  return isMainCubePipe(pipe) ? FunctionKernelKind::Cube
+                              : FunctionKernelKind::Vector;
+}
+
+// Join of two effects: None is the identity; differing effects give ReadWrite.
+static BoundaryEffect joinEffect(BoundaryEffect oldEffect,
+                                 BoundaryEffect newEffect) {
+  if (oldEffect == newEffect || newEffect == BoundaryEffect::None)
+    return oldEffect;
+  return oldEffect == BoundaryEffect::None ? newEffect
+                                           : BoundaryEffect::ReadWrite;
+}
+
+static StringRef stringifyBoundaryEffect(BoundaryEffect effect) { // Spelling stored in the operand-effects attribute.
+  switch (effect) {
+  case BoundaryEffect::None:
+    return "none";
+  case BoundaryEffect::Read:
+    return "read";
+  case BoundaryEffect::Write:
+    return "write";
+  case BoundaryEffect::ReadWrite:
+    return "readwrite";
+  }
+  llvm_unreachable("unexpected tileop boundary effect"); // Every enumerator is handled above.
+}
+
+// Maps each entry-block argument to its position; rejects multi-block bodies.
+static LogicalResult buildOperandIndexMap(
+    func::FuncOp funcOp, llvm::SmallDenseMap<Value, int64_t, 16> &operandIndex) {
+  if (funcOp.isDeclaration())
+    return success();
+  Region &body = funcOp.getBody();
+  if (!body.hasOneBlock())
+    return funcOp.emitOpError("tileop summary inference requires a single-block helper body");
+  for (BlockArgument arg : body.front().getArguments())
+    operandIndex.try_emplace(arg, static_cast<int64_t>(arg.getArgNumber()));
+  return success();
+}
+
+static void appendSortedI64Attrs(Builder &builder, // Appends `set` to `attrs` as i64 attrs; `set` is only read, so take it by const reference.
+    const llvm::SmallSet<int64_t, 8> &set, SmallVectorImpl<Attribute> &attrs) {
+  SmallVector<int64_t, 8> values(set.begin(), set.end());
+  llvm::sort(values); // Emit in ascending order regardless of the set's iteration order.
+  for (int64_t value : values)
+    attrs.push_back(builder.getI64IntegerAttr(value));
+}
+// Returns the members of `set` as an ArrayAttr of ascending i64 attributes.
+static ArrayAttr getSortedI64ArrayAttr(Builder &builder,
+                                       const llvm::SmallSet<int64_t, 8> &set) {
+  SmallVector<Attribute, 8> attrs;
+  appendSortedI64Attrs(builder, set, attrs);
+  return builder.getArrayAttr(attrs);
+}
+
+static Value traceBoundaryOperandToHelperArg(Value value) { // Follows view/cast/loop-carried chains from `value` back toward its source value.
+  int loopBound = 256; // Hard cap on the number of hops; the partially traced value is returned when it is hit.
+  while (value && loopBound-- > 0) {
+    if (auto arg = dyn_cast<BlockArgument>(value)) {
+      auto *parentOp = arg.getOwner()->getParentOp();
+      if (auto forOp = dyn_cast_or_null<scf::ForOp>(parentOp)) {
+        if (arg.getArgNumber() > 0 && // Block arg 0 is skipped (presumably the induction variable); arg N maps to init arg N-1.
+            forOp.getInitArgs().size() >= arg.getArgNumber()) {
+          value = forOp.getInitArgs()[arg.getArgNumber() - 1];
+          continue;
+        }
+      }
+      return value; // Any other block argument ends the trace.
+    }
+
+    Operation *def = value.getDefiningOp();
+    if (!def)
+      return value;
+
+    if (auto subview = dyn_cast<memref::SubViewOp>(def)) { // memref view/cast ops: continue from their source.
+      value = subview.getSource();
+      continue;
+    }
+    if (auto cast = dyn_cast<memref::CastOp>(def)) {
+      value = cast.getSource();
+      continue;
+    }
+    if (auto cast = dyn_cast<memref::MemorySpaceCastOp>(def)) {
+      value = cast.getSource();
+      continue;
+    }
+    if (auto cast = dyn_cast<memref::ReinterpretCastOp>(def)) {
+      value = cast.getSource();
+      continue;
+    }
+    if (auto collapse = dyn_cast<memref::CollapseShapeOp>(def)) {
+      value = collapse.getSrc();
+      continue;
+    }
+    if (auto expand = dyn_cast<memref::ExpandShapeOp>(def)) {
+      value = expand.getSrc();
+      continue;
+    }
+    if (auto reshape = dyn_cast<memref::ReshapeOp>(def)) {
+      value = reshape.getSource();
+      continue;
+    }
+    if (auto transpose = dyn_cast<memref::TransposeOp>(def)) {
+      value = transpose.getIn();
+      continue;
+    }
+    if (auto view = dyn_cast<memref::ViewOp>(def)) {
+      value = view.getViewSource();
+      continue;
+    }
+    if (auto tileBufAddr = dyn_cast<TileBufAddrOp>(def)) { // PTO address/view/cast ops: continue from their source.
+      value = tileBufAddr.getSrc();
+      continue;
+    }
+    if (auto tensorViewAddr = dyn_cast<TensorViewAddrOp>(def)) {
+      value = tensorViewAddr.getSrc();
+      continue;
+    }
+    if (auto bind = dyn_cast<BindTileOp>(def)) {
+      value = bind.getSource();
+      continue;
+    }
+    if (auto subview = dyn_cast<SubViewOp>(def)) {
+      value = subview.getSource();
+      continue;
+    }
+    if (auto bitcast = dyn_cast<BitcastOp>(def)) {
+      value = bitcast.getSrc();
+      continue;
+    }
+    if (auto reshape = dyn_cast<TReshapeOp>(def)) {
+      value = reshape.getSrc();
+      continue;
+    }
+    if (auto cast = dyn_cast<PointerCastOp>(def)) {
+      if (cast.getAddrs().empty())
+        return value;
+      value = cast.getAddrs().front(); // Only the first address operand is followed.
+      continue;
+    }
+    if (auto cast = dyn_cast<CastPtrOp>(def)) {
+      value = cast.getInput();
+      continue;
+    }
+    if (auto addPtr = dyn_cast<AddPtrOp>(def)) {
+      value = addPtr.getPtr(); // Follow the base pointer operand.
+      continue;
+    }
+    if (auto unrealized = dyn_cast<UnrealizedConversionCastOp>(def)) {
+      if (unrealized.getInputs().empty())
+        return value;
+      if (auto result = dyn_cast<OpResult>(value)) {
+        unsigned resultNumber = result.getResultNumber();
+        if (resultNumber < unrealized.getInputs().size()) { // Pair result i with input i when such an input exists.
+          value = unrealized.getInputs()[resultNumber];
+          continue;
+        }
+      }
+      if (unrealized.getInputs().size() == 1) { // Otherwise fall back to a sole input.
+        value = unrealized.getInputs().front();
+        continue;
+      }
+      return value;
+    }
+    if (auto forOp = dyn_cast<scf::ForOp>(def)) {
+      if (auto result = dyn_cast<OpResult>(value)) {
+        unsigned resultNumber = result.getResultNumber();
+        if (resultNumber < forOp.getInitArgs().size()) { // Loop result i continues from init arg i.
+          value = forOp.getInitArgs()[resultNumber];
+          continue;
+        }
+      }
+      return value;
+    }
+    return value; // Unrecognized defining op: stop here.
+  }
+  return value;
+}
+
+// Folds the memory effects `op` declares on helper arguments into `phase`
+// (use/def index sets) and into the per-argument `operandEffects` summary.
+static void recordBoundaryEffects(
+    Operation *op, const llvm::SmallDenseMap<Value, int64_t, 16> &operandIndex,
+    TileOpPhaseSummary &phase, SmallVectorImpl<BoundaryEffect> &operandEffects) {
+  auto memoryOp = dyn_cast<MemoryEffectOpInterface>(op);
+  if (!memoryOp)
+    return;
+  SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 8> declared;
+  memoryOp.getEffects(declared);
+  const int64_t numOperands = static_cast<int64_t>(operandEffects.size());
+  for (const auto &instance : declared) {
+    // Attribute the effect to the value its operand traces back to.
+    Value root = traceBoundaryOperandToHelperArg(instance.getValue());
+    if (!root)
+      continue;
+    auto found = operandIndex.find(root);
+    if (found == operandIndex.end())
+      continue;
+    const int64_t slot = found->second;
+    if (slot < 0 || slot >= numOperands ||
+        !isMemoryLikeBoundaryType(root.getType()))
+      continue;
+    MemoryEffects::Effect *kind = instance.getEffect();
+    BoundaryEffect observed = BoundaryEffect::None;
+    if (isa<MemoryEffects::Read>(kind)) {
+      phase.operandUses.insert(slot);
+      observed = BoundaryEffect::Read;
+    } else if (isa<MemoryEffects::Write, MemoryEffects::Allocate,
+                   MemoryEffects::Free>(kind)) {
+      // Allocate and Free are recorded the same way as Write.
+      phase.operandDefs.insert(slot);
+      observed = BoundaryEffect::Write;
+    }
+    operandEffects[slot] = joinEffect(operandEffects[slot], observed);
+  }
+}
+
+static Attribute buildPhaseAttr(Builder &builder, // Encodes one phase summary as a dictionary attribute.
+                                const TileOpPhaseSummary &phase) {
+  NamedAttrList attrs;
+  attrs.append("pipe", PipeAttr::get(builder.getContext(), phase.pipe));
+  attrs.append("operand_uses", getSortedI64ArrayAttr(builder, phase.operandUses));
+  attrs.append("operand_defs", getSortedI64ArrayAttr(builder, phase.operandDefs));
+  attrs.append("result_defs", builder.getArrayAttr({})); // Always emitted empty here.
+  return builder.getDictionaryAttr(attrs);
+}
+
+static LogicalResult inferTileOpSummary(func::FuncOp funcOp) { // Infers and attaches the tileop summary attributes for one helper function.
+  if (!isTileOpSubkernelHelper(funcOp)) // Functions that are not tileop helpers are left untouched.
+    return success();
+  if (funcOp.isDeclaration())
+    return success();
+
+  llvm::SmallDenseMap<Value, int64_t, 16> operandIndex;
+  if (failed(buildOperandIndexMap(funcOp, operandIndex)))
+    return failure();
+
+  Builder builder(funcOp.getContext());
+  SmallVector<BoundaryEffect, 8> operandEffects( // One slot per function argument, initially None.
+      funcOp.getNumArguments(), BoundaryEffect::None);
+  SmallVector<TileOpPhaseSummary, 8> phases;
+  std::optional<FunctionKernelKind> primaryDomain;
+
+  Block &entry = funcOp.getBody().front();
+  LogicalResult walkResult = success();
+  walkTileOpBodyInSourceOrder(entry, [&](Operation *op) {
+    if (failed(walkResult)) // The walk cannot be interrupted, so later ops are skipped after an error.
+      return;
+
+    std::optional<PIPE> maybePipe = getTileOpBodyPipe(op);
+    if (!maybePipe) // Ops without a known pipe do not contribute to any phase.
+      return;
+    PIPE pipe = *maybePipe;
+
+    if (isMainVectorPipe(pipe) || isMainCubePipe(pipe)) {
+      FunctionKernelKind domain = getDomainForPipe(pipe);
+      if (primaryDomain && *primaryDomain != domain) { // Vector and cube compute ops may not be mixed in one helper.
+        walkResult = op->emitError()
+                     << "tileop helper mixes vector and cube primary compute "
+                        "pipes; MVP supports exactly one primary domain";
+        return;
+      }
+      primaryDomain = domain;
+    }
+
+    if (phases.empty() || phases.back().pipe != pipe) { // Start a new phase whenever the pipe changes.
+      TileOpPhaseSummary phase;
+      phase.pipe = pipe;
+      phases.push_back(std::move(phase));
+    }
+    recordBoundaryEffects(op, operandIndex, phases.back(), operandEffects);
+  });
+  if (failed(walkResult))
+    return failure();
+
+  if (!primaryDomain) // No vector/cube compute op was seen: no attributes are written.
+    return success();
+
+  funcOp->setAttr(
+      kTileOpPrimaryDomainAttr,
+      FunctionKernelKindAttr::get(funcOp.getContext(), *primaryDomain));
+
+  SmallVector<Attribute, 8> phaseAttrs;
+  phaseAttrs.reserve(phases.size());
+  for (const TileOpPhaseSummary &phase : phases)
+    phaseAttrs.push_back(buildPhaseAttr(builder, phase));
+  funcOp->setAttr(kTileOpPhasesAttr, builder.getArrayAttr(phaseAttrs));
+
+  SmallVector<Attribute, 8> effectAttrs;
+  effectAttrs.reserve(operandEffects.size());
+  for (BoundaryEffect effect : operandEffects) {
+    if (effect == BoundaryEffect::None) // Arguments with no recorded effect are reported as "read".
+      effect = BoundaryEffect::Read;
+    effectAttrs.push_back(builder.getStringAttr(stringifyBoundaryEffect(effect)));
+  }
+  funcOp->setAttr(kTileOpOperandEffectsAttr,
+                  builder.getArrayAttr(effectAttrs));
+  return success();
+}
+
+struct PTOInferTileOpSummaryPass // Pass wrapper around inferTileOpSummary.
+    : public mlir::pto::impl::PTOInferTileOpSummaryBase<
+          PTOInferTileOpSummaryPass> {
+  void runOnOperation() override {
+    if (failed(inferTileOpSummary(getOperation()))) // Any inference error fails the pass.
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::pto::createPTOInferTileOpSummaryPass() { // Factory for the summary-inference pass.
+  return std::make_unique<PTOInferTileOpSummaryPass>();
+}
diff --git a/lib/PTO/Transforms/PTOInstantiateAndInlineOpLib.cpp b/lib/PTO/Transforms/PTOInstantiateAndInlineOpLib.cpp
index c278bd196a..54ecfe02d2 100644
--- a/lib/PTO/Transforms/PTOInstantiateAndInlineOpLib.cpp
+++ b/lib/PTO/Transforms/PTOInstantiateAndInlineOpLib.cpp
@@ -50,7 +50,7 @@ static bool isTilelangInlineProcFunc(func::FuncOp fn) {
 }
 
 static bool isPTODSLSubkernelHelperFunc(func::FuncOp fn) {
-  return fn->hasAttr("pto.ptodsl.subkernel_helper");
+  return pto::hasPTODSLSubkernelHelperMarker(fn);
 }
 
 static bool isTilelangTemplateFunc(func::FuncOp fn) {
diff --git a/lib/PTO/Transforms/PTOMaterializeTileHandles.cpp b/lib/PTO/Transforms/PTOMaterializeTileHandles.cpp
index 4d868a20f6..8a9ef6cdba 100644
--- a/lib/PTO/Transforms/PTOMaterializeTileHandles.cpp
+++ b/lib/PTO/Transforms/PTOMaterializeTileHandles.cpp
@@ -828,7 +828,7 @@ static bool isTileViewSemantics(StringAttr viewSemantics) {
 }
 
 static bool isPTODSLSubkernelHelper(func::FuncOp func) {
-  return func->hasAttr("pto.ptodsl.subkernel_helper");
+  return pto::hasPTODSLSubkernelHelperMarker(func);
 }
 
 static std::optional<SmallVector<Type>>
diff --git a/lib/PTO/Transforms/PTOMaterializeTileOpSectionsPass.cpp b/lib/PTO/Transforms/PTOMaterializeTileOpSectionsPass.cpp
new file mode 100644
index 0000000000..e95fb0273a
--- /dev/null
+++ b/lib/PTO/Transforms/PTOMaterializeTileOpSectionsPass.cpp
@@ -0,0 +1,308 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include "PTO/IR/PTO.h"
+#include "PTO/Transforms/Passes.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace pto {
+namespace func = ::mlir::func;
+#define GEN_PASS_DEF_PTOMATERIALIZETILEOPSECTIONS
+#include "PTO/Transforms/Passes.h.inc"
+} // namespace pto
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::pto;
+
+namespace {
+
+static constexpr llvm::StringLiteral kTileOpPrimaryDomainAttr =
+    "pto.tileop.primary_domain"; // Read here as a FunctionKernelKindAttr.
+static constexpr llvm::StringLiteral kTileOpPhasesAttr = "pto.tileop.phases"; // Read here as an array of dictionaries carrying a "pipe" entry.
+
+static bool isTileOpSubkernelHelper(func::FuncOp funcOp) {
+  return pto::isPTODSLTileOpHelper(funcOp); // Local alias for the shared PTO tileop-helper predicate.
+}
+
+// True for non-null, non-return ops that belong to the PTO dialect.
+static bool isTileOpBodyOp(Operation *op) {
+  if (!op || isa<func::ReturnOp>(op))
+    return false;
+  StringRef dialect = op->getName().getDialectNamespace();
+  return dialect == PTODialect::getDialectNamespace();
+}
+
+static std::optional<PIPE> getTileOpBodyPipe(Operation *op) { // Returns the pipe attributed to `op`, or nullopt when none is known.
+  if (!isTileOpBodyOp(op))
+    return std::nullopt;
+
+  if (auto pipeOp = dyn_cast<OpPipeInterface>(op)) { // Prefer the interface-reported pipe when it is assigned.
+    PIPE pipe = pipeOp.getPipe();
+    if (pipe != PIPE::PIPE_UNASSIGNED)
+      return pipe;
+  }
+
+  if (isa<VecScopeOp, StrictVecScopeOp>(op)) // Vec-scope ops are never given a pipe by the name fallback below.
+    return std::nullopt;
+
+  StringRef name = op->getName().getStringRef(); // Fallback: classify by operation name.
+  if (name.starts_with("pto.v"))
+    return PIPE::PIPE_V;
+  if (name.starts_with("pto.mad"))
+    return PIPE::PIPE_M;
+  if (name == "pto.plt_b8" || name == "pto.plt_b16" ||
+      name == "pto.plt_b32" || name == "pto.pltm_b8" ||
+      name == "pto.pltm_b16" || name == "pto.pltm_b32" ||
+      name == "pto.load" || name == "pto.store" || name == "pto.ldg" ||
+      name == "pto.stg")
+    return PIPE::PIPE_S;
+  if (name == "pto.copy_gm_to_ubuf" || name == "pto.mte_gm_ub" ||
+      name == "pto.mte_gm_l1" || name == "pto.mte_gm_l1_frac")
+    return PIPE::PIPE_MTE2;
+  if (name == "pto.mte_ub_gm" || name == "pto.mte_l0c_gm")
+    return PIPE::PIPE_MTE3;
+  if (name == "pto.mte_l1_l0a" || name == "pto.mte_l1_l0b" ||
+      name == "pto.mte_l1_l0a_mx" || name == "pto.mte_l1_l0b_mx")
+    return PIPE::PIPE_MTE1;
+  return std::nullopt; // Unrecognized PTO op: no pipe.
+}
+
+// Reports whether `funcOp` already contains a cube or vector section op
+// anywhere in its body.
+static bool hasExistingSection(func::FuncOp funcOp) {
+  // Stop at the first section op; the walk is interrupted only in that case.
+  WalkResult status = funcOp.walk([](Operation *op) {
+    return isa<SectionCubeOp, SectionVectorOp>(op) ? WalkResult::interrupt()
+                                                   : WalkResult::advance();
+  });
+
+  return status.wasInterrupted();
+}
+
+static bool isPrimaryVectorPipe(PIPE pipe) { // PIPE_V and PIPE_V2 count as vector compute pipes.
+  return pipe == PIPE::PIPE_V || pipe == PIPE::PIPE_V2;
+}
+
+static bool isPrimaryCubePipe(PIPE pipe) { // Only PIPE_M counts as the cube compute pipe.
+  return pipe == PIPE::PIPE_M;
+}
+
+static bool isPrimaryPipeForKind(PIPE pipe, FunctionKernelKind kind) { // True when `pipe` is the compute pipe of domain `kind`.
+  switch (kind) {
+  case FunctionKernelKind::Vector:
+    return isPrimaryVectorPipe(pipe);
+  case FunctionKernelKind::Cube:
+    return isPrimaryCubePipe(pipe);
+  }
+  llvm_unreachable("unexpected kernel kind"); // Both handled kinds return above.
+}
+
+// Reads the primary-domain attribute; emits an error when it is absent.
+static FailureOr<FunctionKernelKind> getPrimaryDomain(func::FuncOp funcOp) {
+  if (auto domainAttr = funcOp->getAttrOfType<FunctionKernelKindAttr>(
+          kTileOpPrimaryDomainAttr))
+    return domainAttr.getKernelKind();
+  return funcOp.emitOpError("requires ")
+         << kTileOpPrimaryDomainAttr << " before tileop section materialization";
+}
+
+// Index of the first phase whose pipe is primary for `kind`; errors if none.
+static FailureOr<unsigned> findFirstPrimaryPhase(func::FuncOp funcOp,
+                                                 ArrayAttr phases,
+                                                 FunctionKernelKind kind) {
+  for (unsigned position = 0; position < phases.size(); ++position) {
+    auto entry = dyn_cast<DictionaryAttr>(phases[position]);
+    auto pipe = entry ? dyn_cast_or_null<PipeAttr>(entry.get("pipe")) : PipeAttr();
+    if (!pipe)
+      return funcOp.emitOpError("expects ")
+             << kTileOpPhasesAttr << " entries to carry a pipe attr";
+    if (isPrimaryPipeForKind(pipe.getPipe(), kind))
+      return position;
+  }
+  return funcOp.emitOpError("requires at least one primary compute phase in ")
+         << kTileOpPhasesAttr;
+}
+
+// Index of the last phase whose pipe is primary for `kind`; errors if none.
+static FailureOr<unsigned> findLastPrimaryPhase(func::FuncOp funcOp,
+                                                ArrayAttr phases,
+                                                FunctionKernelKind kind) {
+  for (unsigned position = phases.size(); position-- > 0;) {
+    auto entry = dyn_cast<DictionaryAttr>(phases[position]);
+    auto pipe = entry ? dyn_cast_or_null<PipeAttr>(entry.get("pipe")) : PipeAttr();
+    if (!pipe)
+      return funcOp.emitOpError("expects ")
+             << kTileOpPhasesAttr << " entries to carry a pipe attr";
+    if (isPrimaryPipeForKind(pipe.getPipe(), kind))
+      return position;
+  }
+  return funcOp.emitOpError("requires at least one primary compute phase in ")
+         << kTileOpPhasesAttr;
+}
+
+// Gathers, in block order, the non-terminator ops of `block` for which
+// getTileOpBodyPipe yields a pipe.
+static SmallVector<Operation *, 8> collectTileOpBodyOps(Block &block) {
+  SmallVector<Operation *, 8> pipeOps;
+  for (Operation &candidate : block.without_terminator())
+    if (getTileOpBodyPipe(&candidate).has_value())
+      pipeOps.push_back(&candidate);
+  return pipeOps;
+}
+
+// Finds the first and last ops in `ops` whose pipe is primary for `kind` and
+// returns their indices into `ops`. Returns an empty optional when no such op
+// exists, and fails with a diagnostic when an op with a non-primary pipe lies
+// between the two.
+static FailureOr<std::optional<std::pair<unsigned, unsigned>>>
+findPrimaryOpRange(func::FuncOp funcOp, ArrayRef<Operation *> ops,
+                   FunctionKernelKind kind) {
+  using IndexRange = std::pair<unsigned, unsigned>;
+  auto isPrimary = [kind](Operation *op) {
+    std::optional<PIPE> pipe = getTileOpBodyPipe(op);
+    return pipe && isPrimaryPipeForKind(*pipe, kind);
+  };
+
+  // Pass 1: record the outermost primary ops.
+  std::optional<IndexRange> span;
+  for (unsigned position = 0; position < ops.size(); ++position) {
+    if (!isPrimary(ops[position]))
+      continue;
+    if (!span)
+      span = IndexRange(position, position);
+    span->second = position;
+  }
+  if (!span)
+    return std::optional<IndexRange>();
+
+  // Pass 2: every pipe-bearing op inside the span must itself be primary.
+  for (unsigned position = span->first; position <= span->second; ++position) {
+    std::optional<PIPE> pipe = getTileOpBodyPipe(ops[position]);
+    if (pipe && !isPrimaryPipeForKind(*pipe, kind))
+      return ops[position]->emitError()
+             << "tileop primary compute span is not contiguous; MVP materializer "
+                "supports one contiguous primary-domain span";
+  }
+
+  return span;
+}
+
+template <typename SectionOpT> // Moves the ops [firstOp, lastOp] of `block` into a new SectionOpT created before firstOp.
+static void wrapOperationRange(Block &block, Operation *firstOp,
+                               Operation *lastOp) {
+  OpBuilder builder(firstOp); // Insertion point: immediately before the first wrapped op.
+  auto sectionOp = builder.create<SectionOpT>(firstOp->getLoc());
+  sectionOp.getBody().push_back(new Block()); // NOTE(review): assumes the section op is created with an empty body region - confirm.
+  Block &sectionBlock = sectionOp.getBody().front();
+
+  auto firstIt = Block::iterator(firstOp);
+  auto afterLastIt = std::next(Block::iterator(lastOp)); // Half-open end so lastOp itself is moved.
+  sectionBlock.getOperations().splice(sectionBlock.end(),
+                                      block.getOperations(), firstIt,
+                                      afterLastIt);
+}
+
+// Wraps the contiguous primary-domain op span of `block` in one section op and
+// recurses into regions of ops left outside it. Sets `materializedAny` when a
+// section is created; fails if the primary span is not contiguous.
+static LogicalResult materializePrimarySectionsInBlock(
+    func::FuncOp funcOp, Block &block, FunctionKernelKind kind,
+    bool &materializedAny) {
+  SmallVector<Operation *, 16> topLevelOps;
+  for (Operation &op : block.without_terminator())
+    topLevelOps.push_back(&op);
+  SmallVector<Operation *, 8> bodyOps = collectTileOpBodyOps(block);
+  if (!bodyOps.empty()) {
+    auto primaryRange = findPrimaryOpRange(funcOp, bodyOps, kind);
+    if (failed(primaryRange))
+      return failure();
+    if (*primaryRange) {
+      Operation *firstOp = bodyOps[(*primaryRange)->first];
+      Operation *lastOp = bodyOps[(*primaryRange)->second];
+      switch (kind) {
+      case FunctionKernelKind::Vector:
+        wrapOperationRange<SectionVectorOp>(block, firstOp, lastOp);
+        break;
+      case FunctionKernelKind::Cube:
+        wrapOperationRange<SectionCubeOp>(block, firstOp, lastOp);
+        break;
+      }
+      materializedAny = true;
+    }
+  }
+
+  for (Operation *op : topLevelOps) {
+    // Ops no longer in `block` were moved into the section created above; their
+    // regions are already covered, so do not open a nested section in them.
+    if (!op || op->getBlock() != &block || getTileOpBodyPipe(op))
+      continue;
+    for (Region &region : op->getRegions())
+      for (Block &nestedBlock : region)
+        if (failed(materializePrimarySectionsInBlock(funcOp, nestedBlock, kind,
+                                                     materializedAny)))
+          return failure();
+  }
+  return success();
+}
+
+static LogicalResult materializeTileOpSection(func::FuncOp funcOp) { // Wraps the primary compute ops of one tileop helper in section ops.
+  if (!isTileOpSubkernelHelper(funcOp) || funcOp.isDeclaration())
+    return success();
+
+  if (hasExistingSection(funcOp)) // Helpers that already contain a section op are left as they are.
+    return success();
+
+  auto phases = funcOp->getAttrOfType<ArrayAttr>(kTileOpPhasesAttr);
+  if (!phases || phases.empty()) // No phase summary: nothing to materialize.
+    return success();
+
+  FailureOr<FunctionKernelKind> primaryDomain = getPrimaryDomain(funcOp);
+  if (failed(primaryDomain))
+    return failure();
+
+  FailureOr<unsigned> firstPhase =
+      findFirstPrimaryPhase(funcOp, phases, *primaryDomain);
+  if (failed(firstPhase))
+    return failure();
+  FailureOr<unsigned> lastPhase =
+      findLastPrimaryPhase(funcOp, phases, *primaryDomain);
+  if (failed(lastPhase))
+    return failure();
+  (void)firstPhase; // The two lookups only validate the phases attr; their indices are not used.
+  (void)lastPhase;
+
+  bool materializedAny = false;
+  if (failed(materializePrimarySectionsInBlock(funcOp, funcOp.getBody().front(),
+                                               *primaryDomain, materializedAny)))
+    return failure();
+  if (!materializedAny) { // The summary named a primary phase but no primary op was wrapped.
+    return funcOp.emitOpError(
+        "requires at least one primary compute op in helper body");
+  }
+  return success();
+}
+
+struct PTOMaterializeTileOpSectionsPass // Pass wrapper around materializeTileOpSection.
+    : public mlir::pto::impl::PTOMaterializeTileOpSectionsBase<
+          PTOMaterializeTileOpSectionsPass> {
+  void runOnOperation() override {
+    if (failed(materializeTileOpSection(getOperation()))) // Any materialization error fails the pass.
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::pto::createPTOMaterializeTileOpSectionsPass() { // Factory for the section-materialization pass.
+  return std::make_unique<PTOMaterializeTileOpSectionsPass>();
+}
diff --git a/lib/PTO/Transforms/PTONormalizeUncoveredTileSections.cpp b/lib/PTO/Transforms/PTONormalizeUncoveredTileSections.cpp
index 55b78f2863..8348b8aa8a 100644
--- a/lib/PTO/Transforms/PTONormalizeUncoveredTileSections.cpp
+++ b/lib/PTO/Transforms/PTONormalizeUncoveredTileSections.cpp
@@ -110,6 +110,10 @@ static bool hasExplicitFunctionKernelKind(func::FuncOp funcOp) {
          funcOp->hasAttrOfType<FunctionKernelKindAttr>(FunctionKernelKindAttr::name);
 }
 
+static bool isTileOpSubkernelHelper(func::FuncOp funcOp) {
+  return pto::isPTODSLTileOpHelper(funcOp); // local alias of the pto predicate
+}
+
 static bool isInsideKernelKindModule(func::FuncOp funcOp) {
   if (!funcOp)
     return false;
@@ -118,7 +122,9 @@ static bool isInsideKernelKindModule(func::FuncOp funcOp) {
 }
 
 static bool hasKnownKernelKindContext(func::FuncOp funcOp) {
-  return isInsideKernelKindModule(funcOp) || hasExplicitFunctionKernelKind(funcOp);
+  return isInsideKernelKindModule(funcOp) ||
+         hasExplicitFunctionKernelKind(funcOp) ||
+         isTileOpSubkernelHelper(funcOp); // tileop helpers also qualify
 }
 
 static std::optional<AddressSpace> getBufferAddressSpace(Type type) {
diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp
index aa3196a672..574a8f1f9a 100644
--- a/lib/PTO/Transforms/PTOPlanMemory.cpp
+++ b/lib/PTO/Transforms/PTOPlanMemory.cpp
@@ -446,6 +446,10 @@ void MemLivenessAnalysis::RecursionIR(Region *region, Liveness live) {
       OpKillHandle(curOpInfo, live, op->getBlock());
     } else if (auto storeOp = dyn_cast<memref::StoreOp>(op)) {
       UpdateStoreOpInfo(curOpInfo, storeOp.getMemRef(), live);
+    } else if (isa<pto::VldsOp, pto::Vldsx2Op, pto::VldasOp, pto::VstsOp,
+                   pto::Vstsx2Op>(op)) {
+      UpdateOpGenInfo(curOpInfo, llvm::to_vector(op->getOperands()));
+      OpKillHandle(curOpInfo, live, op->getBlock());
     } else if (auto ptoDpsOp = dyn_cast<pto::PTO_DpsInitOpInterface>(op)) {
       // PTO ops with destination (tile_buf, partition_view, etc.); no
       // tensor/memref-only verification.
diff --git a/lib/PTO/Transforms/PTOVerifySubkernelPipeContractPass.cpp b/lib/PTO/Transforms/PTOVerifySubkernelPipeContractPass.cpp
index 8635497f64..f728b94164 100644
--- a/lib/PTO/Transforms/PTOVerifySubkernelPipeContractPass.cpp
+++ b/lib/PTO/Transforms/PTOVerifySubkernelPipeContractPass.cpp
@@ -26,14 +26,8 @@ using namespace mlir::pto;
 
 namespace {
 
-static constexpr llvm::StringLiteral kPTODSLSubkernelHelperAttr =
-    "pto.ptodsl.subkernel_helper";
-
 static StringRef getSubkernelRole(func::FuncOp funcOp) {
-  if (auto roleAttr =
-          funcOp->getAttrOfType<StringAttr>(kPTODSLSubkernelHelperAttr))
-    return roleAttr.getValue();
-  return {};
+  return pto::getPTODSLSubkernelHelperRole(funcOp); // lookup delegated to pto::
 }
 
 static std::optional<PIPE> getExpectedPipeForRole(StringRef role) {
diff --git a/lib/PTO/Transforms/PTOVerifyTileOpContractPass.cpp b/lib/PTO/Transforms/PTOVerifyTileOpContractPass.cpp
new file mode 100644
index 0000000000..2a91f275e5
--- /dev/null
+++ b/lib/PTO/Transforms/PTOVerifyTileOpContractPass.cpp
@@ -0,0 +1,590 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include "PTO/IR/PTO.h"
+#include "PTO/Transforms/Passes.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+
+namespace mlir {
+namespace pto {
+namespace func = ::mlir::func;
+#define GEN_PASS_DEF_PTOVERIFYTILEOPCONTRACT
+#include "PTO/Transforms/Passes.h.inc"
+} // namespace pto
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::pto;
+
+namespace {
+
+static constexpr llvm::StringLiteral kTileOpPrimaryDomainAttr =
+    "pto.tileop.primary_domain"; // read below as a FunctionKernelKindAttr
+static constexpr llvm::StringLiteral kTileOpPhasesAttr = "pto.tileop.phases";
+static constexpr llvm::StringLiteral kTileOpOperandEffectsAttr =
+    "pto.tileop.operand_effects"; // read below as an array of strings
+
+enum class BoundaryEffect : uint8_t {
+  None, // no read or write recorded
+  Read, // only reads recorded
+  Write, // only writes (including allocate/free effects) recorded
+  ReadWrite, // both reads and writes recorded
+};
+
+struct TileOpPhaseSummary {
+  PIPE pipe = PIPE::PIPE_UNASSIGNED; // pipe of the ops grouped into this phase
+  llvm::SmallSet<int64_t, 8> operandUses; // helper-argument indices read
+  llvm::SmallSet<int64_t, 8> operandDefs; // helper-argument indices written
+};
+
+// Pre-order visit of all non-terminator ops in `block` and nested regions.
+template <typename CallbackT>
+static void walkTileOpBodyInSourceOrder(Block &block, CallbackT &&callback) {
+  for (Operation &current : block) {
+    if (current.hasTrait<OpTrait::IsTerminator>()) continue;
+    callback(&current);
+    for (Region &nested : current.getRegions())
+      for (Block &inner : nested)
+        walkTileOpBodyInSourceOrder(inner, callback);
+  }
+}
+
+static bool isTileOpSubkernelHelper(func::FuncOp funcOp) {
+  return pto::isPTODSLTileOpHelper(funcOp); // local alias of the pto predicate
+}
+
+static bool isMemoryLikeBoundaryType(Type type) { // types that carry memory
+  return isa<TileBufType>(type) || isa<TensorViewType>(type) ||
+         isa<PartitionTensorViewType>(type) || isa<PtrType, MemRefType>(type);
+}
+
+static bool isTileOpHelperBoundaryType(Type type) { // accepted non-scalar args
+  return isa<TileBufType, TensorViewType, PartitionTensorViewType>(type);
+}
+
+static bool isTileOpScalarType(Type type) { // integer, float, or index
+  return isa<IntegerType, FloatType, IndexType>(type);
+}
+
+static bool isMainVectorPipe(PIPE pipe) {
+  return pipe == PIPE::PIPE_V || pipe == PIPE::PIPE_V2; // V or V2 is vector
+}
+
+// PIPE_M is the only pipe classified as cube primary compute; see
+// getDomainForPipe, which maps it to FunctionKernelKind::Cube.
+static bool isMainCubePipe(PIPE pipe) { return PIPE::PIPE_M == pipe; }
+
+// True only for non-null PTO-dialect ops; func.return is rejected explicitly.
+static bool isTileOpBodyOp(Operation *op) {
+  if (!op || isa<func::ReturnOp>(op))
+    return false;
+  StringRef opDialect = op->getName().getDialectNamespace();
+  return opDialect == PTODialect::getDialectNamespace();
+}
+
+// Resolves the pipe of an op in a tileop helper body. Yields nullopt for
+// non-body ops, vec-scope wrappers, and ops whose name is not recognized.
+static std::optional<PIPE> getTileOpBodyPipe(Operation *op) {
+  if (!isTileOpBodyOp(op))
+    return std::nullopt;
+
+  if (auto pipeIface = dyn_cast<OpPipeInterface>(op))
+    if (PIPE assigned = pipeIface.getPipe(); assigned != PIPE::PIPE_UNASSIGNED)
+      return assigned;
+  if (isa<VecScopeOp, StrictVecScopeOp>(op))
+    return std::nullopt;
+
+  const StringRef opName = op->getName().getStringRef();
+  auto isAnyOf = [opName](std::initializer_list<StringRef> candidates) {
+    return llvm::is_contained(candidates, opName);
+  };
+  if (opName.starts_with("pto.v"))
+    return PIPE::PIPE_V;
+  if (opName.starts_with("pto.mad"))
+    return PIPE::PIPE_M;
+  if (isAnyOf({"pto.plt_b8", "pto.plt_b16", "pto.plt_b32", "pto.pltm_b8",
+               "pto.pltm_b16", "pto.pltm_b32", "pto.load", "pto.store",
+               "pto.ldg", "pto.stg"}))
+    return PIPE::PIPE_S;
+  if (isAnyOf({"pto.copy_gm_to_ubuf", "pto.mte_gm_ub", "pto.mte_gm_l1",
+               "pto.mte_gm_l1_frac"}))
+    return PIPE::PIPE_MTE2;
+  if (isAnyOf({"pto.mte_ub_gm", "pto.mte_l0c_gm"}))
+    return PIPE::PIPE_MTE3;
+  if (isAnyOf({"pto.mte_l1_l0a", "pto.mte_l1_l0b", "pto.mte_l1_l0a_mx",
+               "pto.mte_l1_l0b_mx"}))
+    return PIPE::PIPE_MTE1;
+  return std::nullopt;
+}
+
+static FunctionKernelKind getDomainForPipe(PIPE pipe) { // non-cube => vector
+  if (isMainCubePipe(pipe)) return FunctionKernelKind::Cube;
+  return FunctionKernelKind::Vector;
+}
+
+// Returns the lowercase spelling of `effect`. verifySummaryAttrs compares
+// these spellings against the string entries of the operand-effects
+// attribute, so they must stay in sync with whatever writes that attribute.
+static StringRef stringifyBoundaryEffect(BoundaryEffect effect) {
+  switch (effect) {
+  case BoundaryEffect::None: return "none";
+  case BoundaryEffect::Read: return "read";
+  case BoundaryEffect::Write: return "write";
+  case BoundaryEffect::ReadWrite: return "readwrite";
+  }
+  // Not reachable for valid enumerators: every case above returns.
+  llvm_unreachable("unexpected tileop boundary effect");
+}
+
+// None is the identity, equal inputs pass through, anything else is ReadWrite.
+static BoundaryEffect joinEffect(BoundaryEffect oldEffect,
+                                 BoundaryEffect newEffect) {
+  constexpr BoundaryEffect kNone = BoundaryEffect::None;
+  if (oldEffect == kNone || newEffect == kNone)
+    return oldEffect == kNone ? newEffect : oldEffect;
+  return oldEffect == newEffect ? oldEffect : BoundaryEffect::ReadWrite;
+}
+
+static bool isSIMTOnlyPTOOp(Operation *op) { // ops rejected in tileop helpers
+  return isa<StoreVfSimtInfoOp, SimtLaunchOp, GetTidXOp, GetTidYOp, GetTidZOp,
+             GetBlockDimXOp, GetBlockDimYOp, GetBlockDimZOp, GetGridDimXOp,
+             GetGridDimYOp, GetGridDimZOp, GetBlockIdxXOp, GetBlockIdxYOp,
+             GetBlockIdxZOp, GetVecCoreIdOp, GetLaneIdOp, GetClock32Op,
+             GetClock64Op, GetLaneMaskEqOp, GetLaneMaskLeOp, GetLaneMaskLtOp,
+             GetLaneMaskGeOp, GetLaneMaskGtOp, VoteAllOp, VoteAnyOp, VoteUniOp,
+             VoteBallotOp, ShuffleIdxOp, ShuffleUpOp, ShuffleDownOp,
+             ShuffleBflyOp, ReduxAddOp, ReduxMaxOp, ReduxMinOp, SyncthreadsOp,
+             ThreadfenceOp, ThreadfenceBlockOp, KeepOp, ResumeOp>(op);
+}
+
+// Maps each entry-block argument to its position; needs a single-block body.
+static LogicalResult buildOperandIndexMap(
+    func::FuncOp funcOp, llvm::SmallDenseMap<Value, int64_t, 16> &operandIndex) {
+  if (funcOp.isDeclaration())
+    return success();
+  if (!funcOp.getBody().hasOneBlock())
+    return funcOp.emitOpError(
+        "tileop contract verification requires a single-block helper body");
+
+  for (BlockArgument arg : funcOp.getBody().front().getArguments())
+    operandIndex.try_emplace(arg, static_cast<int64_t>(arg.getArgNumber()));
+  return success();
+}
+
+static Value traceBoundaryOperandToHelperArg(Value value) { // trace to root
+  int loopBound = 256; // upper bound on the number of tracing steps
+  while (value && loopBound-- > 0) { // each step peels one producer layer
+    if (auto arg = dyn_cast<BlockArgument>(value)) {
+      auto *parentOp = arg.getOwner()->getParentOp();
+      if (auto forOp = dyn_cast_or_null<scf::ForOp>(parentOp)) {
+        if (arg.getArgNumber() > 0 && // argument #0 is never mapped to an init
+            forOp.getInitArgs().size() >= arg.getArgNumber()) {
+          value = forOp.getInitArgs()[arg.getArgNumber() - 1];
+          continue;
+        }
+      }
+      return value; // any other block argument ends the trace
+    }
+
+    Operation *def = value.getDefiningOp();
+    if (!def)
+      return value;
+
+    if (auto subview = dyn_cast<memref::SubViewOp>(def)) {
+      value = subview.getSource();
+      continue;
+    }
+    if (auto cast = dyn_cast<memref::CastOp>(def)) {
+      value = cast.getSource();
+      continue;
+    }
+    if (auto cast = dyn_cast<memref::MemorySpaceCastOp>(def)) {
+      value = cast.getSource();
+      continue;
+    }
+    if (auto cast = dyn_cast<memref::ReinterpretCastOp>(def)) {
+      value = cast.getSource();
+      continue;
+    }
+    if (auto collapse = dyn_cast<memref::CollapseShapeOp>(def)) {
+      value = collapse.getSrc();
+      continue;
+    }
+    if (auto expand = dyn_cast<memref::ExpandShapeOp>(def)) {
+      value = expand.getSrc();
+      continue;
+    }
+    if (auto reshape = dyn_cast<memref::ReshapeOp>(def)) {
+      value = reshape.getSource();
+      continue;
+    }
+    if (auto transpose = dyn_cast<memref::TransposeOp>(def)) {
+      value = transpose.getIn();
+      continue;
+    }
+    if (auto view = dyn_cast<memref::ViewOp>(def)) {
+      value = view.getViewSource();
+      continue;
+    }
+    if (auto tileBufAddr = dyn_cast<TileBufAddrOp>(def)) {
+      value = tileBufAddr.getSrc();
+      continue;
+    }
+    if (auto tensorViewAddr = dyn_cast<TensorViewAddrOp>(def)) {
+      value = tensorViewAddr.getSrc();
+      continue;
+    }
+    if (auto bind = dyn_cast<BindTileOp>(def)) {
+      value = bind.getSource();
+      continue;
+    }
+    if (auto subview = dyn_cast<SubViewOp>(def)) {
+      value = subview.getSource();
+      continue;
+    }
+    if (auto bitcast = dyn_cast<BitcastOp>(def)) {
+      value = bitcast.getSrc();
+      continue;
+    }
+    if (auto reshape = dyn_cast<TReshapeOp>(def)) {
+      value = reshape.getSrc();
+      continue;
+    }
+    if (auto cast = dyn_cast<PointerCastOp>(def)) {
+      if (cast.getAddrs().empty())
+        return value;
+      value = cast.getAddrs().front(); // only the first address is followed
+      continue;
+    }
+    if (auto cast = dyn_cast<CastPtrOp>(def)) {
+      value = cast.getInput();
+      continue;
+    }
+    if (auto addPtr = dyn_cast<AddPtrOp>(def)) {
+      value = addPtr.getPtr();
+      continue;
+    }
+    if (auto unrealized = dyn_cast<UnrealizedConversionCastOp>(def)) {
+      if (unrealized.getInputs().empty())
+        return value;
+      if (auto result = dyn_cast<OpResult>(value)) {
+        unsigned resultNumber = result.getResultNumber();
+        if (resultNumber < unrealized.getInputs().size()) { // positional match
+          value = unrealized.getInputs()[resultNumber];
+          continue;
+        }
+      }
+      if (unrealized.getInputs().size() == 1) { // single input feeds any result
+        value = unrealized.getInputs().front();
+        continue;
+      }
+      return value;
+    }
+    if (auto forOp = dyn_cast<scf::ForOp>(def)) {
+      if (auto result = dyn_cast<OpResult>(value)) {
+        unsigned resultNumber = result.getResultNumber();
+        if (resultNumber < forOp.getInitArgs().size()) { // result N <- init N
+          value = forOp.getInitArgs()[resultNumber];
+          continue;
+        }
+      }
+      return value;
+    }
+    return value; // unrecognized producer: stop here
+  }
+  return value; // null value or step budget exhausted
+}
+
+// Traces every memory effect of `op` to its root value and, for memory-like
+// helper arguments, updates the phase use/def sets and `operandEffects`.
+static void recordBoundaryEffects(
+    Operation *op, const llvm::SmallDenseMap<Value, int64_t, 16> &operandIndex,
+    TileOpPhaseSummary &phase, SmallVectorImpl<BoundaryEffect> &operandEffects) {
+  auto memIface = dyn_cast<MemoryEffectOpInterface>(op);
+  if (!memIface)
+    return;
+
+  const auto operandCount = static_cast<int64_t>(operandEffects.size());
+  SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 8> instances;
+  memIface.getEffects(instances);
+  for (const auto &instance : instances) {
+    Value root = traceBoundaryOperandToHelperArg(instance.getValue());
+    if (!root)
+      continue;
+    auto found = operandIndex.find(root);
+    if (found == operandIndex.end())
+      continue;
+    const int64_t slot = found->second;
+    const bool inRange = slot >= 0 && slot < operandCount;
+    if (!inRange || !isMemoryLikeBoundaryType(root.getType()))
+      continue;
+
+    auto *kind = instance.getEffect();
+    BoundaryEffect observed = BoundaryEffect::None;
+    if (isa<MemoryEffects::Read>(kind)) {
+      phase.operandUses.insert(slot);
+      observed = BoundaryEffect::Read;
+    } else if (isa<MemoryEffects::Write, MemoryEffects::Allocate,
+                   MemoryEffects::Free>(kind)) {
+      phase.operandDefs.insert(slot);
+      observed = BoundaryEffect::Write;
+    }
+    operandEffects[slot] = joinEffect(operandEffects[slot], observed);
+  }
+}
+
+// Rejects the helper if any result type is not an integer, float, or index.
+static LogicalResult verifyScalarResults(func::FuncOp funcOp) {
+  for (Type resultType : funcOp.getResultTypes()) {
+    if (!isTileOpScalarType(resultType))
+      return funcOp.emitOpError()
+             << "tileop helper results are limited to PTO scalar values in the "
+                "MVP, but found result type "
+             << resultType;
+  }
+  return success();
+}
+
+// Rejects the helper if any argument is neither a boundary type nor a scalar.
+static LogicalResult verifyArgumentBoundaryTypes(func::FuncOp funcOp) {
+  for (Type argType : funcOp.getArgumentTypes()) {
+    if (!isTileOpHelperBoundaryType(argType) && !isTileOpScalarType(argType))
+      return funcOp.emitOpError()
+             << "tileop helper arguments must be Tile/TensorView/"
+                "PartitionTensorView or PTO scalar values, but found "
+             << argType;
+  }
+  return success();
+}
+
+// Checks the summary attributes stored on `funcOp` against the inferred ones.
+static LogicalResult verifySummaryAttrs(func::FuncOp funcOp,
+                                        std::optional<FunctionKernelKind> inferredDomain,
+                                        ArrayRef<TileOpPhaseSummary> inferredPhases,
+                                        ArrayRef<BoundaryEffect> inferredEffects) {
+  if (!inferredDomain)
+    return funcOp.emitOpError(
+        "requires at least one vector or cube primary compute op; helpers with "
+        "only MTE/scalar/sync phases are rejected");
+  // Primary domain: the attribute must exist and equal the inferred kind.
+  auto primaryAttr =
+      funcOp->getAttrOfType<FunctionKernelKindAttr>(kTileOpPrimaryDomainAttr);
+  if (!primaryAttr)
+    return funcOp.emitOpError("requires ")
+           << kTileOpPrimaryDomainAttr << " before tileop contract verification";
+  if (primaryAttr.getKernelKind() != *inferredDomain) {
+    return funcOp.emitOpError("has stale ")
+           << kTileOpPrimaryDomainAttr << ": inferred primary domain is #pto.kernel_kind<"
+           << (*inferredDomain == FunctionKernelKind::Cube ? "cube" : "vector")
+           << ">";
+  }
+  // Phases: the attribute must exist and hold one entry per inferred phase.
+  auto phasesAttr = funcOp->getAttrOfType<ArrayAttr>(kTileOpPhasesAttr);
+  if (!phasesAttr)
+    return funcOp.emitOpError("requires ") << kTileOpPhasesAttr;
+  if (phasesAttr.size() != inferredPhases.size()) {
+    return funcOp.emitOpError("has stale ")
+           << kTileOpPhasesAttr << ": inferred " << inferredPhases.size()
+           << " phase(s), but attribute stores " << phasesAttr.size();
+  }
+  // Operand effects: the attribute must hold one entry per inferred effect.
+  auto effectsAttr = funcOp->getAttrOfType<ArrayAttr>(kTileOpOperandEffectsAttr);
+  if (!effectsAttr)
+    return funcOp.emitOpError("requires ") << kTileOpOperandEffectsAttr;
+  if (effectsAttr.size() != inferredEffects.size()) {
+    return funcOp.emitOpError("has stale ")
+           << kTileOpOperandEffectsAttr << ": inferred "
+           << inferredEffects.size() << " operand effect(s), but attribute stores "
+           << effectsAttr.size();
+  }
+
+  for (unsigned index = 0; index < effectsAttr.size(); ++index) {
+    auto strAttr = dyn_cast<StringAttr>(effectsAttr[index]);
+    if (!strAttr)
+      return funcOp.emitOpError("expects ")
+             << kTileOpOperandEffectsAttr
+             << " entries to be string attributes";
+    // An operand with no inferred effect is expected to be stored as "read".
+    BoundaryEffect expected = inferredEffects[index];
+    if (expected == BoundaryEffect::None)
+      expected = BoundaryEffect::Read;
+    if (strAttr.getValue() != stringifyBoundaryEffect(expected)) {
+      return funcOp.emitOpError("has stale ")
+             << kTileOpOperandEffectsAttr << " at operand #" << index
+             << ": inferred \"" << stringifyBoundaryEffect(expected)
+             << "\", but attribute stores \"" << strAttr.getValue() << "\"";
+    }
+  }
+
+  for (unsigned index = 0; index < phasesAttr.size(); ++index) {
+    auto dict = dyn_cast<DictionaryAttr>(phasesAttr[index]);
+    if (!dict)
+      return funcOp.emitOpError("expects ")
+             << kTileOpPhasesAttr << " entries to be dictionary attributes";
+
+    auto pipeAttr = dyn_cast_or_null<PipeAttr>(dict.get("pipe"));
+    auto usesAttr = dyn_cast_or_null<ArrayAttr>(dict.get("operand_uses"));
+    auto defsAttr = dyn_cast_or_null<ArrayAttr>(dict.get("operand_defs"));
+    auto resultsAttr = dyn_cast_or_null<ArrayAttr>(dict.get("result_defs"));
+    if (!pipeAttr || !usesAttr || !defsAttr || !resultsAttr) {
+      return funcOp.emitOpError("expects ")
+             << kTileOpPhasesAttr
+             << " entries to carry pipe/operand_uses/operand_defs/result_defs";
+    }
+
+    const TileOpPhaseSummary &expected = inferredPhases[index];
+    if (pipeAttr.getPipe() != expected.pipe) {
+      return funcOp.emitOpError("has stale ")
+             << kTileOpPhasesAttr << " at phase #" << index
+             << ": inferred pipe " << stringifyPIPE(expected.pipe)
+             << ", but attribute stores " << stringifyPIPE(pipeAttr.getPipe());
+    }
+    // The inferred set is taken by const reference so it is not copied per call.
+    auto verifyIndexSet = [&](ArrayAttr values,
+                              const llvm::SmallSet<int64_t, 8> &set,
+                              StringRef fieldName) -> LogicalResult {
+      if (values.size() != set.size()) {
+        return funcOp.emitOpError("has stale ")
+               << kTileOpPhasesAttr << " at phase #" << index << " field '"
+               << fieldName << "'";
+      }
+      llvm::SmallSet<int64_t, 8> actual;
+      for (Attribute valueAttr : values) {
+        auto intAttr = dyn_cast<IntegerAttr>(valueAttr);
+        if (!intAttr)
+          return funcOp.emitOpError("expects ") << fieldName
+                                                << " indices to be integers";
+        actual.insert(intAttr.getInt());
+      }
+      if (actual != set) {
+        return funcOp.emitOpError("has stale ")
+               << kTileOpPhasesAttr << " at phase #" << index << " field '"
+               << fieldName << "'";
+      }
+      return success();
+    };
+    if (failed(verifyIndexSet(usesAttr, expected.operandUses, "operand_uses")) ||
+        failed(verifyIndexSet(defsAttr, expected.operandDefs, "operand_defs")))
+      return failure();
+    if (!resultsAttr.empty()) {
+      return funcOp.emitOpError("expects ")
+             << kTileOpPhasesAttr
+             << " result_defs to remain empty in the current MVP";
+    }
+  }
+
+  return success();
+}
+
+static LogicalResult verifyTileOpHelper(func::FuncOp funcOp) { // one helper
+  if (!isTileOpSubkernelHelper(funcOp) || funcOp.isDeclaration())
+    return success(); // only defined tileop helpers are checked
+
+  if (failed(verifyScalarResults(funcOp)) ||
+      failed(verifyArgumentBoundaryTypes(funcOp)))
+    return failure();
+
+  llvm::SmallDenseMap<Value, int64_t, 16> operandIndex;
+  if (failed(buildOperandIndexMap(funcOp, operandIndex)))
+    return failure();
+
+  SmallVector<BoundaryEffect, 8> operandEffects(funcOp.getNumArguments(),
+                                                BoundaryEffect::None);
+  SmallVector<TileOpPhaseSummary, 8> phases;
+  std::optional<FunctionKernelKind> primaryDomain;
+
+  Block &entry = funcOp.getBody().front();
+  LogicalResult walkResult = success();
+  walkTileOpBodyInSourceOrder(entry, [&](Operation *op) {
+    if (failed(walkResult))
+      return; // an earlier op already failed; remaining ops are ignored
+
+    if (isa<AllocTileOp, ReserveBufferOp, TAllocOp>(op)) {
+      walkResult = op->emitError("is not allowed inside a tileop helper; "
+                                 "tileop helpers must not allocate "
+                                 "helper-local tile or reserved-buffer state");
+      return;
+    }
+
+    if (isSIMTOnlyPTOOp(op)) {
+      walkResult =
+          op->emitError("is SIMT-only and cannot appear inside a tileop helper");
+      return;
+    }
+
+    if (auto callOp = dyn_cast<func::CallOp>(op)) {
+      auto module = funcOp->getParentOfType<ModuleOp>();
+      auto callee = module ? module.lookupSymbol<func::FuncOp>(callOp.getCallee())
+                           : func::FuncOp();
+      if (callee && isTileOpSubkernelHelper(callee)) {
+        InFlightDiagnostic diag =
+            callOp.emitOpError("cannot call tileop helper @");
+        diag << callee.getSymName()
+             << " from another tileop helper; nested tileop calls are "
+                "rejected";
+        walkResult = failure();
+        return;
+      }
+      return; // other calls are accepted and not analyzed further
+    }
+
+    if (op->getName().getDialectNamespace() != PTODialect::getDialectNamespace())
+      return;
+
+    std::optional<PIPE> maybePipe = getTileOpBodyPipe(op);
+    if (!maybePipe)
+      return; // ops without a resolved pipe do not contribute to phases
+    PIPE pipe = *maybePipe;
+
+    if (isMainVectorPipe(pipe) || isMainCubePipe(pipe)) {
+      FunctionKernelKind domain = getDomainForPipe(pipe);
+      if (primaryDomain && *primaryDomain != domain) {
+        walkResult = op->emitError(
+            "tileop helper mixes vector and cube primary compute pipes; MVP "
+            "supports exactly one primary domain");
+        return;
+      }
+      primaryDomain = domain;
+    }
+
+    if (phases.empty() || phases.back().pipe != pipe) {
+      TileOpPhaseSummary phase;
+      phase.pipe = pipe; // a change of pipe opens a new phase
+      phases.push_back(std::move(phase));
+    }
+    recordBoundaryEffects(op, operandIndex, phases.back(), operandEffects);
+  });
+
+  if (failed(walkResult))
+    return failure();
+
+  return verifySummaryAttrs(funcOp, primaryDomain, phases, operandEffects);
+}
+
+struct PTOVerifyTileOpContractPass
+    : public mlir::pto::impl::PTOVerifyTileOpContractBase<
+          PTOVerifyTileOpContractPass> {
+  void runOnOperation() override { // signal failure on any contract error
+    LogicalResult status = verifyTileOpHelper(getOperation());
+    if (failed(status)) signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::pto::createPTOVerifyTileOpContractPass() {
+  return std::make_unique<PTOVerifyTileOpContractPass>(); // new instance
+}
diff --git a/lib/PTO/Transforms/PTOViewToMemref.cpp b/lib/PTO/Transforms/PTOViewToMemref.cpp
index 45bdd9aaa5..265ed7e52d 100644
--- a/lib/PTO/Transforms/PTOViewToMemref.cpp
+++ b/lib/PTO/Transforms/PTOViewToMemref.cpp
@@ -682,6 +682,24 @@ static Type convertTileBufTypeToMemRef(mlir::pto::TileBufType tbTy) {
                          tbTy.getMemorySpace());
 }
 
+// Builds a GM-space memref type with a strided layout whose offset and strides
+// are all dynamic; the shape is made fully dynamic too if forceDynamicShape.
+static Type convertTensorViewLikeTypeToMemRef(ArrayRef<int64_t> shape,
+                                              Type elementType,
+                                              MLIRContext *ctx,
+                                              bool forceDynamicShape) {
+  const size_t rank = shape.size();
+  SmallVector<int64_t, 4> memrefShape(shape.begin(), shape.end());
+  if (forceDynamicShape)
+    memrefShape.assign(rank, ShapedType::kDynamic);
+
+  SmallVector<int64_t, 4> strides(rank, ShapedType::kDynamic);
+  return MemRefType::get(
+      memrefShape, elementType,
+      StridedLayoutAttr::get(ctx, ShapedType::kDynamic, strides),
+      AddressSpaceAttr::get(ctx, AddressSpace::GM));
+}
+
 static Type convertPTOTypeToMemRef(Type t) {
   // 1. 处理 !pto.ptr<T>
   if (auto pty = dyn_cast<mlir::pto::PtrType>(t)) {
@@ -693,11 +711,15 @@ static Type convertPTOTypeToMemRef(Type t) {
   if (auto tbTy = dyn_cast<mlir::pto::TileBufType>(t))
     return convertTileBufTypeToMemRef(tbTy);
   if (auto tvTy = dyn_cast<mlir::pto::TensorViewType>(t))
-    return MemRefType::get(tvTy.getShape(), tvTy.getElementType(),
-                           MemRefLayoutAttrInterface(), Attribute());
+    return convertTensorViewLikeTypeToMemRef(tvTy.getShape(),
+                                             tvTy.getElementType(),
+                                             tvTy.getContext(),
+                                             /*forceDynamicShape=*/true);
   if (auto partTy = dyn_cast<mlir::pto::PartitionTensorViewType>(t))
-    return MemRefType::get(partTy.getShape(), partTy.getElementType(),
-                           MemRefLayoutAttrInterface(), Attribute());
+    return convertTensorViewLikeTypeToMemRef(partTy.getShape(),
+                                             partTy.getElementType(),
+                                             partTy.getContext(),
+                                             /*forceDynamicShape=*/false);
   // 其他类型透传
   return t;
 }
diff --git a/lib/PTO/Transforms/Utils.cpp b/lib/PTO/Transforms/Utils.cpp
index 58e68c77e2..f4d8c4812f 100644
--- a/lib/PTO/Transforms/Utils.cpp
+++ b/lib/PTO/Transforms/Utils.cpp
@@ -112,6 +112,10 @@ std::optional<std::pair<Value, Value>> getOperationAliasInfo(Operation *op) {
     return std::make_pair(toTensorOp.getResult(), toTensorOp.getOperand());
   } else if (auto toMemrefOp = dyn_cast<bufferization::ToMemrefOp>(op)) {
     return std::make_pair(toMemrefOp.getResult(), toMemrefOp.getOperand());
+  } else if (auto tileBufAddrOp = dyn_cast<pto::TileBufAddrOp>(op)) {
+    return std::make_pair(tileBufAddrOp.getDst(), tileBufAddrOp.getSrc());
+  } else if (auto tensorViewAddrOp = dyn_cast<pto::TensorViewAddrOp>(op)) {
+    return std::make_pair(tensorViewAddrOp.getDst(), tensorViewAddrOp.getSrc());
   }
   return std::nullopt;
 }
diff --git a/ptodsl/README.md b/ptodsl/README.md
index c2e034ac5a..3c227e4a13 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -243,7 +243,9 @@ The user guide under `ptodsl/docs/user_guide/` is the canonical PTODSL API
 reference. This README keeps only a compact map of the public surface:
 
 - `@pto.jit`: the only host-visible kernel entry
-- `@pto.cube`, `@pto.simd`, `@pto.simt`: hardware-unit sub-kernels
+- `@pto.tileop`: custom tile-op helper surface for vector-style sub-kernels
+- `@pto.simt`: SIMT helper surface with launch dimensions
+- `@pto.cube`, `@pto.simd`: retained hardware-specific custom OP entry points
 - `pto.ptr(...)` + runtime PTO scalar annotations: public entry ABI
 - `pto.make_tensor_view(...)`, `pto.partition_view(...)`, `pto.alloc_tile(...)`:
   core data-model builders
diff --git a/ptodsl/docs/user_guide/01-introduction.md b/ptodsl/docs/user_guide/01-introduction.md
index 12eba5ce58..951b384762 100644
--- a/ptodsl/docs/user_guide/01-introduction.md
+++ b/ptodsl/docs/user_guide/01-introduction.md
@@ -64,8 +64,8 @@ Python Wrapper              L0  user-facing wrapper (NumPy, torch-npu, pure Pyth
   │    └─ backend="emitc"         EmitC backend, mode="auto" only
   ├─ Tile Ops                     tile.load, tile.store, tile.add, ...
   ├─ MTE Ops                      mte_load / mte_store / mte_gm_ub / ...
-  ├─ @pto.cube                    matrix products (mad, mte_l1_l0a, mte_l0c_ub, ...)
-  ├─ @pto.simd                    row-wise vector math (vlds, vadd, vexp, vsts, ...)
+  ├─ @pto.tileop                  row-wise vector custom OPs (vlds, vadd, vexp, vsts, ...)
+  ├─ @pto.cube / @pto.simd        retained hardware-specific custom OP entry points
   └─ @pto.simt                    scalar-like compute (lds, sts, pointwise blends, ...)
 ```
 
@@ -245,19 +245,19 @@ micro-instruction surface — MTE ops, explicit sync, and pointer-level
 control — so you can mix tile operations with hand-authored instructions in
 the same kernel.
 
-### Sub-kernels — `@pto.cube` / `@pto.simd` / `@pto.simt`
+### Sub-kernels — `@pto.tileop` / `@pto.simt`
 
 These are hardware-bound compute sub-kernels, each mapped to a specific NPU compute unit:
 
-- **`@pto.cube`** consumes UB tiles and explicit cube-local scratch (LEFT, RIGHT, ACC, BIAS). Typical operations: `mad`, `mte_l1_l0a`, `mte_l1_l0b`, `mte_l0c_ub`.
+- **`@pto.tileop`** is the recommended surface for custom tile-op bodies that operate on UB tiles and vector registers. Typical operations: `vlds`, `vadd`, `vexp`, `vcgmax`, `vsts`.
 
-- **`@pto.simd`** operates on vector registers (`vreg`). Typical operations: `vlds`, `vadd`, `vexp`, `vcgmax`, `vsts`. Vector registers never cross the simd function boundary — persistent state is written back to UB tiles.
+- **`@pto.cube` / `@pto.simd`** remain available for existing code that wants hardware-specific names. They preserve the authored surface name in PTODSL diagnostics and tracing, but decorated helpers canonicalize to the same backend helper contract as `@pto.tileop`.
 
 - **`@pto.simt`** is a scalar-programmable processor group that executes scalar instructions across many work-items in parallel. Typical operations: `lds`, `sts`, scalar arithmetic and comparison. Well-suited for per-element tile walks, boundary metadata, and pointwise blends.
 
-Each can be invoked as a named decorated function (`@pto.cube` /
-`@pto.simd` / `@pto.simt`) or inline as a context manager
-(`with pto.cube():`, `with pto.simd():`, `with pto.simt():`).
+Tile-op helpers can be invoked as named decorated functions (`@pto.tileop`) or
+inline as context managers (`with pto.tileop():`). SIMT helpers use `@pto.simt`
+and receive launch dimensions via `helper[x, y, z](...)` or `pto.simt_launch(...)`.
 
 The boundary contract is strict: vreg values do not escape a simd kernel, cube-local state does not leak into UB, and data crosses layer boundaries only through UB-backed tiles or typed UB pointers.
 
@@ -299,7 +299,7 @@ sequences four sub-kernel calls: `qk_matmul` (cube),
 **`@pto.cube`** performs `mte_l1_l0a` / `mte_l1_l0b` / `mad` /
 `mte_l0c_ub` for both QK^T and P@V products.
 
-**`@pto.simd`** implements the online softmax update: per-row max, exp, sum,
+**`@pto.tileop`** implements the online softmax update: per-row max, exp, sum,
 and alpha/beta computation using vector ops (`vlds`, `vcgmax`, `vexp`,
 `vcgadd`, `vsts`).
 
@@ -323,7 +323,7 @@ Chapter 11 walks through this example in full detail.
 |---------|-------|
 | 1 | Introduction (this chapter) |
 | 2 | Quick Start — a minimal working kernel |
-| 3 | Kernel entries, kernel modules, and sub-kernels: `@pto.jit(entry=True/False, backend=...)`, `@pto.cube`, `@pto.simd`, `@pto.simt` |
+| 3 | Kernel entries, kernel modules, and sub-kernels: `@pto.jit(entry=True/False, backend=...)`, `@pto.tileop`, `@pto.simt`, retained `@pto.cube` / `@pto.simd` |
 | 4 | Type system and buffer management: scalars, tiles, views, allocation |
 | 5 | Control flow: trace-time Python vs device-side `pto.for_` / `pto.if_` |
 | 6 | Scalar and pointer operations |
diff --git a/ptodsl/docs/user_guide/02-quick-start.md b/ptodsl/docs/user_guide/02-quick-start.md
index ba46bae7ba..534c1ebd10 100644
--- a/ptodsl/docs/user_guide/02-quick-start.md
+++ b/ptodsl/docs/user_guide/02-quick-start.md
@@ -197,8 +197,8 @@ switch that kernel to `mode="explicit"`:
 
 <!-- ptodsl-doc-test: {"mode":"compile","symbol":"vec_add_micro","compile":{"BLOCK":128}} -->
 ```python
-# SIMD sub-kernel — vector instructions on individual rows.
-@pto.simd
+# Tile-op helper — vector instructions on individual rows.
+@pto.tileop
 def add_rows(a_tile: pto.Tile, b_tile: pto.Tile, o_tile: pto.Tile,
              rows: pto.index, cols: pto.index):
     VEC = pto.elements_per_vreg(pto.f32)
@@ -211,7 +211,7 @@ def add_rows(a_tile: pto.Tile, b_tile: pto.Tile, o_tile: pto.Tile,
             o_vec = pto.vadd(a_vec, b_vec, mask)
             pto.vsts(o_vec, o_tile[r, c:], mask)
 
-# Single kernel entry in explicit mode — micro-instruction staging plus SIMD sub-kernel.
+# Single kernel entry in explicit mode — micro-instruction staging plus a tile-op helper.
 @pto.jit(target="a5", mode="explicit")
 def vec_add_micro(
     A_ptr: pto.ptr(pto.f32, "gm"),
@@ -252,11 +252,11 @@ def vec_add_micro(
   loops over blocks, and directly authors the micro-instruction schedule for
   each block.
 
-- **`@pto.simd` sub-kernel**: the top-level kernel calls a SIMD sub-kernel
+- **`@pto.tileop` sub-kernel**: the top-level kernel calls a tile-op helper
   for the row-wise vector work while keeping instruction staging in the
   explicit entry body.
 
-- **Inside `@pto.simd`**: the outer Python `for range(...)` iterates over rows,
+- **Inside `@pto.tileop`**: the outer Python `for range(...)` iterates over rows,
   and the inner Python `for range(...)` iterates over column chunks of the hardware vector width
   (`elements_per_vreg`). Each iteration loads a vector-width slice into a
   `vreg`, does the addition under a mask (for tail elements), and stores the
diff --git a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
index 7bb02647ca..3dbfdf9151 100644
--- a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
+++ b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
@@ -2,8 +2,8 @@
 
 PTODSL provides one kernel decorator (`@pto.jit`) with two roles
 (`entry=True` / `entry=False`), two compilation backends (`vpto` / `emitc`),
-and three compute-unit sub-kernel decorators (`@pto.cube`, `@pto.simd`,
-`@pto.simt`), plus matching context managers for inline use. This chapter covers
+and the custom OP sub-kernel decorators (`@pto.tileop`, retained `@pto.cube` /
+`@pto.simd`, and `@pto.simt`), plus matching context managers for inline use. This chapter covers
 the `@pto.jit` entry and module contracts, the two programming models, the two
 compilation backends, sub-kernel reference, parameter contracts, and boundary
 constraints.
@@ -21,8 +21,8 @@ Decorator overview:
   mode="auto"               tile-first authoring, compiler-managed staging (default)
   mode="explicit"           micro-instruction authoring, user-managed staging
 
-@pto.cube                   Cube-unit matrix sub-kernel
-@pto.simd                   SIMD-unit vector sub-kernel
+@pto.tileop                 Custom tile-op sub-kernel
+@pto.cube / @pto.simd       Retained hardware-specific custom OP entry points
 @pto.simt                   SIMT-unit scalar sub-kernel
 ```
 
@@ -54,8 +54,8 @@ The **`mode`** parameter selects the programming model within the kernel body
 it doesn't change how you compile or launch the kernel.
 
 `@pto.jit` owns compilation (tracing + lowering), caching, and — for
-`entry=True` — runtime launch binding. The compute-unit decorators
-(`@pto.cube`, `@pto.simd`, `@pto.simt`) define sub-kernels that are called from
+`entry=True` — runtime launch binding. The sub-kernel decorators
+(`@pto.tileop`, retained `@pto.cube` / `@pto.simd`, and `@pto.simt`) define helpers that are called from
 within `@pto.jit` bodies.
 
 
@@ -324,9 +324,9 @@ Module bodies follow the same AST rewrite rules as `@pto.jit(entry=True)`
 (see Chapter 5). In the default `mode="auto"`, Python `for` / `if` are
 rewritten to device-side control flow, and the compiler handles hardware
 section placement automatically — you can write `vlds` / `vadd` / `vsts`
-directly in the module body without an explicit `with pto.simd():`. In
+directly in the module body without an explicit `with pto.tileop():`. In
 `mode="explicit"`, you must manage hardware sections yourself with
-`with pto.simd():`, `with pto.cube():`, or `with pto.simt():`.
+`with pto.tileop():`, `with pto.cube():`, or `with pto.simt():`.
 
 ### Interface protocol
 
@@ -463,7 +463,7 @@ there are two kinds of helpers:
 - **Plain Python helpers** for code organization, repeated index math,
   partition construction, and orchestration that should stay in the caller's
   context.
-- **Sub-kernels** (`@pto.cube`, `@pto.simd`, `@pto.simt`) when the helper must
+- **Sub-kernels** (`@pto.tileop`, retained `@pto.cube` / `@pto.simd`, `@pto.simt`) when the helper must
   run on a specific hardware unit or use unit-local value categories such as
   `vreg` or cube-local scratch.
 
@@ -482,18 +482,18 @@ rewrite behavior when they are traced from a compiled specialization.
 Sub-kernels are the mechanism for custom compute in PTODSL — when Tile Ops
 cover your needs, you don't need one; when they don't, a sub-kernel gives you
 direct access to the hardware unit. In auto mode, a sub-kernel's parameters
-are restricted to `Tile` and PTO scalar types — the compiler owns staging and
-sync. In explicit mode, sub-kernels may also accept `PartitionTensorView` and
-`pto.ptr` parameters, matching the richer type surface available there.
-This richer pointer surface belongs to the **in-kernel orchestration and
-sub-kernel boundary**, not to the public `@pto.jit` host entry ABI.
+follow the decorator's role-specific ABI — the compiler still owns staging and
+sync for tileop-style helpers. `@pto.tileop` and retained `@pto.simd` helpers
+accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO scalar parameters.
+`@pto.simt` additionally accepts typed `pto.ptr(...)` parameters. These richer
+in-kernel boundary types do not change the public `@pto.jit` host entry ABI.
 Section 3.3 covers each sub-kernel decorator in detail.
 
 ### Module vs sub-kernel
 
 **Module or sub-kernel?** A simple rule:
-- Logic that **must run on a specific hardware unit** (Cube, SIMD, or SIMT)
-  and operates on tiles → use a sub-kernel (`@pto.cube`, `@pto.simd`, `@pto.simt`).
+- Logic that **must run as a custom tile op or SIMT helper**
+  and operates on tiles → use a sub-kernel (`@pto.tileop`, `@pto.simt`, or the retained `@pto.cube` / `@pto.simd` names).
 - General device-side code organisation — allocating tiles, partitioning GM
   views, calling sub-kernels, mixing backends → use a kernel module
   (`@pto.jit(entry=False)`).
@@ -502,11 +502,11 @@ Modules **can** call sub-kernels (they are callable from both entries and
 modules). Sub-kernels **cannot** call modules — data crosses the sub-kernel
 boundary only through UB tiles, not through nested function calls.
 
-| | `@pto.jit(entry=False)` module | `@pto.simd` / `@pto.simt` / `@pto.cube` |
+| | `@pto.jit(entry=False)` module | `@pto.tileop` / `@pto.simt` / retained `@pto.cube` / `@pto.simd` |
 |---|---|---|
 | Positioning | General device-side function | **Custom tile op** — hardware-bound compute primitive |
 | Scope | Orchestration, tile allocation, data movement, sub-kernel dispatch | Single-hardware-unit compute logic |
-| ABI | **C ABI: ptr + PTO scalars only**. Tile/TensorView/PartitionTensorView cannot cross the function boundary. Caller passes `tile.as_ptr()`; module constructs local tiles internally | **Tile + PTO scalars**. In/out via mutable Tile parameters. `@pto.simt` additionally accepts typed UB pointers |
+| ABI | **C ABI: ptr + PTO scalars only**. Tile/TensorView/PartitionTensorView cannot cross the function boundary. Caller passes `tile.as_ptr()`; module constructs local tiles internally | Role-specific PTODSL helper ABI. `@pto.tileop` / retained `@pto.simd`: `Tile`, `TensorView`, `PartitionTensorView`, PTO scalars. `@pto.simt`: `Tile`, typed `pto.ptr(...)`, PTO scalars |
 | Backend | VPTO or EmitC | Always VPTO |
 | Compilation | Compiled as a separate child module, linked automatically | Outlined as a helper function inside the owning caller/module |
 | Callable from | Entries and other modules | Entries and modules |
@@ -542,10 +542,11 @@ gain access to MTE ops, explicit synchronization, and pointer manipulation.
 When you need precise control over individual instructions and phase ordering,
 you can drop below the tile abstraction without leaving the `@pto.jit` entry.
 
-The richer type surface also applies to sub-kernels: in auto mode, a
-sub-kernel's parameters are restricted to `Tile` and PTO scalar types; in
-explicit mode they may also accept `PartitionTensorView` and `pto.ptr`,
-matching the types available in the enclosing orchestration code.
+Explicit mode broadens the orchestration code you can write inside `@pto.jit`
+and `@pto.jit(entry=False)` bodies. Sub-kernel ABIs themselves remain
+role-specific rather than mode-specific: `@pto.tileop` / retained `@pto.simd`
+accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO scalars, while
+`@pto.simt` accepts `Tile`, typed `pto.ptr(...)`, and PTO scalars.
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.explicit_signature","symbol":"kernel_entry_explicit_signature_probe","compile":{"BLOCK":16}} -->
 ```python
@@ -592,7 +593,7 @@ def process_block(q_tile, k_part, v_part, k_tile, v_tile,
                   nburst=(rows, ub_row_stride, gm_row_stride))
 ```
 
-Sub-kernel calls and inline sub-kernel scopes (`with pto.simd():`, etc.) work
+Sub-kernel calls and inline sub-kernel scopes (`with pto.tileop():`, etc.) work
 identically in both modes.
 
 ### Choosing between modes
@@ -714,11 +715,16 @@ built-in ops don't cover your needs.
 
 **Sub-kernels are custom tile ops.** Their contract is strict:
 
-- **Inputs**: `Tile` references and PTO scalars (`pto.i32`, `pto.f32`, ...).
-  Data arrives from UB via tile handles; the sub-kernel does not own GM
-  addressing or DMA orchestration.
-- **Outputs**: written back to UB tiles. Sub-kernels have no return values —
-  results are communicated by writing to mutable `Tile` parameters.
+- **Inputs**: role-specific PTODSL boundary values. `@pto.tileop` and retained
+  `@pto.simd` accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO
+  scalars (`pto.i32`, `pto.f32`, ...). `@pto.simt` accepts `Tile`, typed
+  `pto.ptr(...)`, and PTO scalars. Data still flows through device-side
+  boundary objects; sub-kernels do not define a host-visible ABI.
+- **Outputs**: written back to UB tiles. Mutable `Tile` parameters remain the
+  primary output path. In the current MVP, decorated tileop-style helpers may
+  additionally return PTO scalar values through `func.call` results, but
+  `Tile`, `TensorView`, `PartitionTensorView`, `vreg`, and mask values still do
+  not cross the helper boundary as results.
 - **No cross-boundary vreg**: vector registers (`vreg`) and cube-local state
   (LEFT, RIGHT, ACC) are private to the sub-kernel body and never escape.
 
@@ -729,14 +735,15 @@ When to use a sub-kernel vs a kernel module:
   views, or mix backends → use an `@pto.jit(entry=False)` kernel module
   instead. Modules can call sub-kernels, but sub-kernels cannot call modules.
 
-Sub-kernels are decorated with `@pto.cube`, `@pto.simd`, or `@pto.simt`.
+Sub-kernels are decorated with `@pto.tileop`, `@pto.simt`, or the retained
+hardware-specific `@pto.cube` / `@pto.simd` entry points.
 PTODSL lowers both surface forms to real helper `func.func` bodies instead of
 flattening them directly into the surrounding caller. They can be authored in
 two ways:
 
 1. **As decorated functions** — reusable, named sub-kernels called from
    `@pto.jit` entries and modules.
-2. **As context managers** (`with pto.cube():`, etc.) — inline blocks for
+2. **As context managers** (`with pto.tileop():`, etc.) — inline blocks for
    one-off snippets (see Section 3.8).
 
 Named sub-kernel decorators use the same default AST rewrite model as
@@ -798,27 +805,29 @@ explicitly.
 
 **Lowering model**: a decorated `@pto.cube` function becomes one reusable
 helper function inside the owning PTODSL child module. Each callsite lowers to
-`func.call` of that helper; the helper body itself contains the `pto.section.cube`
-region.
+`func.call` of that helper. Decorated compatibility wrappers lower through the
+same naked `tileop` helper contract as `@pto.tileop`; backend passes infer the
+primary domain and materialize `pto.section.cube` later in the PTOAS pipeline.
 
 **Invocation modes**: can be called from `@pto.jit` in either mode, or authored
 as an anonymous inline helper with `with pto.cube():` (Section 3.8).
 
-### 3.7.2 `@pto.simd` — SIMD unit (vector operations)
+### 3.7.2 `@pto.tileop` — custom tile op (vector operations)
 
-**Role**: `@pto.simd` is the custom tile op for row-wise vector compute on
-the SIMD unit. It operates on vector registers (`vreg`) loaded from UB tiles
-and stores results back to UB tiles. Parameters are `Tile` references and PTO
-scalars — the sub-kernel reads tile data, computes on vector hardware, and
-writes results back through mutable tile parameters. Vector registers are
-local to the function and never cross its boundary.
+**Role**: `@pto.tileop` is the recommended custom tile-op surface for
+row-wise vector compute. It operates on vector registers (`vreg`) loaded from UB tiles
+and stores results back to UB tiles. Parameters may be `Tile`,
+`TensorView`, `PartitionTensorView`, and PTO scalars — the sub-kernel reads
+device-side descriptors, computes on vector hardware, and writes results back
+through mutable tile parameters. Vector registers are local to the function
+and never cross its boundary.
 
-**Signature**: `@pto.simd(fn=None, *, name=None, target="a5")`
+**Signature**: `@pto.tileop(fn=None, *, name=None, target="a5")`
 
-<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.simd_signature","symbol":"kernel_entry_simd_signature_probe","compile":{"BLOCK":128}} -->
+<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.tileop_signature","symbol":"kernel_entry_simd_signature_probe","compile":{"BLOCK":128}} -->
 ```python
-@pto.simd
-def my_simd_kernel(
+@pto.tileop
+def my_tileop_kernel(
     input_tile: pto.Tile,            # UB tile
     output_tile: pto.Tile,           # UB tile
     rows: pto.i32,                   # PTO scalar
@@ -827,19 +836,19 @@ def my_simd_kernel(
     return
 ```
 
-Parameters are UB `Tile` references and PTO scalar values (`pto.i32`,
-`pto.f32`, etc.). Scalar parameters may come from `lds` reads or compile-time
-constants.
+Parameters are device-side `Tile` / `TensorView` / `PartitionTensorView`
+references and PTO scalar values (`pto.i32`, `pto.f32`, etc.). Scalar
+parameters may come from `lds` reads or compile-time constants.
 
-This interface contract is enforced unconditionally. A decorated `@pto.simd`
-function does not gain extra pointer-style ABI forms in explicit mode; if you
-need a broader boundary, use `@pto.jit(entry=False)` instead.
+This interface contract is enforced unconditionally. A decorated `@pto.tileop`
+function does not accept `pto.ptr(...)`; typed pointers remain SIMT-only. If
+you need a broader C-style ABI, use `@pto.jit(entry=False)` instead.
 
 **Typical body**:
 
-<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.simd_body","symbol":"kernel_entry_simd_body_probe","compile":{"BLOCK":128}} -->
+<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.tileop_body","symbol":"kernel_entry_simd_body_probe","compile":{"BLOCK":128}} -->
 ```python
-@pto.simd
+@pto.tileop
 def add_rows(a_tile: pto.Tile, b_tile: pto.Tile, o_tile: pto.Tile,
              rows: pto.i32, cols: pto.i32):
     VEC = pto.elements_per_vreg(pto.f32)
@@ -854,21 +863,28 @@ def add_rows(a_tile: pto.Tile, b_tile: pto.Tile, o_tile: pto.Tile,
 ```
 
 The boundary contract: `vreg` values (`a_vec`, `b_vec`, `o_vec`) are local to
-the function. The only way to persist data across a `@pto.simd` call is to
+the function. The only way to persist data across a `@pto.tileop` call is to
 write it back to a UB tile via `vsts` (or `psts`, etc.).
 
-**Lowering model**: a decorated `@pto.simd` function becomes one reusable
+**Lowering model**: a decorated `@pto.tileop` function becomes one reusable
 helper function inside the owning PTODSL child module. Each callsite lowers to
-`func.call` of that helper; the helper body itself contains the `pto.section.vector`
-region.
+`func.call` of that helper. The helper body is emitted as a naked `tileop`
+helper with `pto.tileop.helper`; backend passes infer the
+primary domain and materialize `pto.section.vector` later in the PTOAS
+pipeline.
 
 **Invocation modes**: can be called from `@pto.jit` in either mode, or authored
-as an anonymous inline helper with `with pto.simd():` (Section 3.8).
+as an anonymous inline helper with `with pto.tileop():` (Section 3.8).
+
+`@pto.simd` remains available for existing vector custom OP code. It follows
+the same parameter and lowering contract as `@pto.tileop` while preserving the
+`simd` surface name in trace diagnostics. At the IR layer it canonicalizes to
+the same backend helper contract as `@pto.tileop`.
 
 ### 3.7.3 `@pto.simt` — SIMT unit (scalar-parallel operations)
 
-**Role**: `@pto.simt` is the custom tile op for per-element scalar-parallel
-compute on the SIMT unit. SIMT (Single Instruction, Multiple Threads) is a
+**Role**: `@pto.simt` is the SIMT helper surface for per-element scalar-parallel
+compute. SIMT (Single Instruction, Multiple Threads) is a
 programming model where you write instructions in scalar syntax
 (`scalar.load`, `scalar.store`, `a + b`), and the hardware executes them in
 parallel across many threads — analogous to how a GPU SM runs a CUDA kernel.
@@ -997,8 +1013,9 @@ Specific SIMT micro-op APIs are documented in Chapter 13.
 
 ## 3.8 Inline context manager syntax
 
-In addition to the decorator form, each sub-kernel unit provides a context
-manager: `with pto.cube():`, `with pto.simd():`, and `with pto.simt():`. These
+In addition to the decorator form, custom tile-op helpers can be written with
+`with pto.tileop():`; retained `with pto.cube():` / `with pto.simd():` scopes
+and inline `with pto.simt():` scopes are also supported. These
 open one-off anonymous sub-kernel bodies without requiring a separate named
 Python function. Inline scopes are supported in top-level `@pto.jit` bodies.
 
@@ -1006,7 +1023,7 @@ Python function. Inline scopes are supported in top-level `@pto.jit` bodies.
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.inline_simd_scope","symbol":"kernel_entry_inline_simd_scope_probe","compile":{"BLOCK":128}} -->
 ```python
-with pto.simd():
+with pto.tileop():
     a_vec = pto.vlds(a_tile[r, c:])
     b_vec = pto.vlds(b_tile[r, c:])
     o_vec = pto.vadd(a_vec, b_vec, mask)
@@ -1037,10 +1054,14 @@ with pto.cube():
   unit.
 - On block exit, PTODSL outlines the block into one anonymous helper
   `func.func` and replaces the original region with a `func.call`.
-- `with pto.simd():` and `with pto.cube():` preserve their `pto.section.vector`
-  / `pto.section.cube` bodies inside the outlined helper.
+- Decorated sub-kernel helpers and inline sub-kernel scopes do not lower
+  identically today. Named/decorated `@pto.tileop`, retained `@pto.simd`, and
+  retained `@pto.cube` helpers lower as naked `tileop` helpers and rely on
+  backend materialization of `pto.section.vector` / `pto.section.cube`. Inline
+  `with pto.tileop():`, retained `with pto.simd():`, and `with pto.cube():`
+  scopes still preserve their explicit section-wrapped helper bodies.
 - `with pto.simt():` preserves its scalar body inside one outlined
-  `pto.simt_entry` helper, and the caller emits `pto.store_vfsimt_info`.
+  `pto.simt_entry` helper, and the caller emits `pto.simt_launch`.
 - Values defined inside the inline sub-kernel cannot escape the block directly.
   Use Tiles, typed pointers, or other mutable references to communicate results
   back to the caller.
@@ -1062,10 +1083,10 @@ The two forms can be freely mixed in the same `@pto.jit` body.
 
 ## 3.9 Boundary contracts
 
-**Sub-kernels are custom tile ops.** Their I/O contract is strict: data enters
-via `Tile` handles and PTO scalars; results exit by writing to mutable `Tile`
-parameters. `TensorView` and `PartitionTensorView` belong to the orchestration
-layer and are NOT accepted by sub-kernels.
+**Sub-kernels are custom tile ops.** Their I/O contract is strict: tileop-style
+helpers accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO scalars;
+SIMT helpers accept `Tile`, typed `pto.ptr(...)`, and PTO scalars. Results exit
+only by writing to mutable references or, for decorated tileop-style helpers, by returning PTO scalar values.
 
 **Modules use the C ABI.** Module boundaries (`entry=False`) are real function
 calls — only `pto.ptr` and PTO scalars can cross. `Tile`, `TensorView`, and
@@ -1075,13 +1096,13 @@ calls — only `pto.ptr` and PTO scalars can cross. `Tile`, `TensorView`, and
 |----------|---------|
 | Host → `@pto.jit(entry=True)` | explicit GM pointers + runtime scalars |
 | Entry / module → `@pto.jit(entry=False)` module | **`pto.ptr` + PTO scalars only** (C ABI). Caller passes `tile.as_ptr()`; module constructs local tiles internally |
-| Entry / module → sub-kernel (`auto` mode) | **`Tile` + PTO scalars only**. Compiler handles staging + sync |
-| Entry / module → sub-kernel (`explicit` mode) | `Tile`, `PartitionTensorView`, `pto.ptr`, PTO scalars |
-| `@pto.jit` → `with pto.{cube,simd,simt}:` | Captured `Tile` / ptr / scalar values from enclosing scope |
+| Entry / module → `@pto.tileop` / retained `@pto.simd` | `Tile`, `TensorView`, `PartitionTensorView`, PTO scalars |
+| Entry / module → `@pto.simt` | `Tile`, typed `pto.ptr(...)`, PTO scalars |
+| `@pto.jit` → `with pto.{tileop,cube,simd,simt}:` | Captured `Tile` / ptr / scalar values from enclosing scope |
 | Sub-kernel → sub-kernel | Not allowed (go through UB tiles via the caller) |
 | Sub-kernel → module | Not allowed (sub-kernels cannot call out) |
 | Inline sub-kernel → caller | No direct SSA return path; write through Tile / ptr / mutable references |
-| `@pto.simd` → caller | Only via `vsts`/`psts` to UB tiles; `vreg` cannot escape |
+| `@pto.tileop` / `@pto.simd` → caller | Via `vsts`/`psts` to UB tiles, plus PTO scalar return values from decorated helpers; `vreg` cannot escape |
 | Cube-local → UB | Only via `mte_l0c_ub`; LEFT/RIGHT/ACC/BIAS are private |
 | `entry=False` module → caller | No return values; data crosses only via mutable references |
 
diff --git a/ptodsl/docs/user_guide/05-control-flow.md b/ptodsl/docs/user_guide/05-control-flow.md
index 4ea132257b..c80a7602e9 100644
--- a/ptodsl/docs/user_guide/05-control-flow.md
+++ b/ptodsl/docs/user_guide/05-control-flow.md
@@ -4,7 +4,7 @@ PTODSL uses a **tracing** compilation model. When you call `kernel.compile(...)`
 
 This has one critical implication for how you write loops and branches:
 
-- **Python native `for`/`if`** is rewritten to device-side control flow by default in `@pto.jit` bodies and named `@pto.cube` / `@pto.simd` / `@pto.simt` sub-kernels. A `for i in range(rows)` loop records a device loop, and a runtime `if` records both branches.
+- **Python native `for`/`if`** is rewritten to device-side control flow by default in `@pto.jit` bodies and named `@pto.tileop` / `@pto.simt` sub-kernels, including the retained `@pto.cube` / `@pto.simd` names. A `for i in range(rows)` loop records a device loop, and a runtime `if` records both branches.
 - **`pto.const_expr` / `pto.static_range`** keep compile-time Python behavior when you want trace-time specialization or unrolling.
 - **`pto.for_` / `pto.if_`** produce device-side control flow. The loop bound or branch condition can be a runtime value, and the hardware will execute the loop or take the branch dynamically.
 
@@ -465,7 +465,7 @@ def debug_kernel(*, BLOCK: pto.const_expr = 4):
         pto.pipe_barrier(pto.Pipe.ALL)
 
 
-@pto.simd(ast_rewrite=False)
+@pto.tileop(ast_rewrite=False)
 def debug_simd_helper():
     if pto.const_expr(True):
         pto.pipe_barrier(pto.Pipe.ALL)
diff --git a/ptodsl/docs/user_guide/07-data-movement-ops.md b/ptodsl/docs/user_guide/07-data-movement-ops.md
index cabe44abdf..7894d44d88 100644
--- a/ptodsl/docs/user_guide/07-data-movement-ops.md
+++ b/ptodsl/docs/user_guide/07-data-movement-ops.md
@@ -307,7 +307,7 @@ def process_block(k_part, v_part, k_tile, v_tile, o_tile, o_part,
 
 ## 7.3 Vector loads (simd)
 
-Inside `@pto.simd`, data moves between UB tiles and vector registers (`vreg`). Vector loads read a contiguous chunk of a tile row into a `vreg`; the chunk size equals the hardware vector width for the element type (e.g., 64 elements for `f32`, 128 for `f16`).
+Inside `@pto.tileop`, data moves between UB tiles and vector registers (`vreg`). Vector loads read a contiguous chunk of a tile row into a `vreg`; the chunk size equals the hardware vector width for the element type (e.g., 64 elements for `f32`, 128 for `f16`).
 
 ### Tile-index syntax
 
@@ -1239,7 +1239,7 @@ Vector (consumer) side:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"pipe_communication.c2v_global_consumer","symbol":"pipe_communication_c2v_global_consumer_probe","compile":{}} -->
 ```python
-@pto.simd
+@pto.tileop
 def consumer(dst_tile: pto.Tile):
     c2v.init_simd()
     entry = c2v.pop(split=0)
@@ -1270,7 +1270,7 @@ Vector (producer) side:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"pipe_communication.v2c_global_producer","symbol":"pipe_communication_v2c_global_producer_probe","compile":{}} -->
 ```python
-@pto.simd
+@pto.tileop
 def producer(src_tile: pto.Tile):
     v2c.init_simd()
     entry = v2c.alloc(split=0)
@@ -1330,7 +1330,7 @@ Vector (consumer) transaction:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"pipe_communication.c2v_local_consumer","symbol":"pipe_communication_c2v_local_consumer_probe","compile":{}} -->
 ```python
-@pto.simd
+@pto.tileop
 def consumer(dst_tile: pto.Tile):
     c2v.init_simd()
     tile = c2v.pop(result_type=dst_tile, split=0)
@@ -1412,7 +1412,7 @@ def vector_consumer(
         pto.make_tensor_view(dst, shape=[16, 16], strides=[16, 1]),
         offsets=[0, 0], sizes=[16, 16])
 
-    with pto.simd():
+    with pto.tileop():
         c2v.init_simd()
         entry = c2v.pop(split=0)
         entry_part = pto.partition_view(entry, offsets=[0, 0], sizes=[16, 16])
diff --git a/ptodsl/docs/user_guide/08-compute-operations.md b/ptodsl/docs/user_guide/08-compute-operations.md
index ac4e32a3a9..039d9c92b5 100644
--- a/ptodsl/docs/user_guide/08-compute-operations.md
+++ b/ptodsl/docs/user_guide/08-compute-operations.md
@@ -1254,9 +1254,9 @@ pto.tile.gemv_mx_bias(lhs_l0a_mx, lhs_scale, rhs_l0b_mx, rhs_scale, bias_tile, a
 
 ---
 
-## 8.2 Vector compute (L3 — `@pto.simd`)
+## 8.2 Vector compute (L3 — `@pto.tileop`)
 
-Vector compute ops operate on `VRegType` values inside `@pto.simd` sub-kernels. Every vector op takes a `MaskType` predicate that gates which lanes participate; masked-off lanes produce an unspecified result (use the result only where the mask is true, or feed it to a masked store).
+Vector compute ops operate on `VRegType` values inside `@pto.tileop` sub-kernels. Every vector op takes a `MaskType` predicate that gates which lanes participate; masked-off lanes produce an unspecified result (use the result only where the mask is true, or feed it to a masked store).
 
 All vector ops in this section follow the pattern established in Section 7.3 for tile-index and pointer-form addressing. The signatures below use the vector-register form — tile-index forms load into `vreg` first, then compute.
 
@@ -1605,7 +1605,7 @@ exp_f16_odd  = pto.vmulscvt(exp_f32_odd, 1.0, mask, rnd=pto.VcvtRoundMode.A, par
 
 ### 8.2.7 Vector type conversion and packing
 
-These ops change the element type or layout of vector registers. They are distinct from the tile-level `tile.cvt` — they operate on `VRegType` values inside `@pto.simd` and are the explicit micro-op counterparts to higher-level conversion helpers.
+These ops change the element type or layout of vector registers. They are distinct from the tile-level `tile.cvt` — they operate on `VRegType` values inside `@pto.tileop` and are the explicit micro-op counterparts to higher-level conversion helpers.
 
 #### `pto.vcvt(src: VRegType, to_dtype: DType, mask: MaskType, *, rnd: VcvtRoundMode | None = None, sat: VcvtSatMode | None = None, part: VcvtPartMode | None = None) -> VRegType`
 
diff --git a/ptodsl/docs/user_guide/10-sync-ops.md b/ptodsl/docs/user_guide/10-sync-ops.md
index 42b3089400..01eba93e85 100644
--- a/ptodsl/docs/user_guide/10-sync-ops.md
+++ b/ptodsl/docs/user_guide/10-sync-ops.md
@@ -437,7 +437,7 @@ Where do sync operations belong in PTODSL's public entry model?
 |---------|---------------------|
 | `@pto.jit(mode="auto")` | Users can write sync explicitly when needed. PTOAS also provides an `--enable-insert-sync` option that auto-inserts `set_flag`/`wait_flag` pairs based on op-to-pipe mapping. |
 | `@pto.jit(mode="explicit")` | The compiler does not insert sync — the user is fully responsible. Place `set_flag`/`wait_flag` between MTE and compute, `mem_bar` between compute phases, `pipe_barrier` at phase boundaries. |
-| Shared `@pto.cube` / `@pto.simd` / `@pto.simt` helpers | Cross-pipeline ordering is provided by the surrounding `@pto.jit` schedule. Helpers may still use `mem_bar` for intra-pipeline ordering when UB addresses alias. |
+| Shared `@pto.tileop` / `@pto.simt` helpers, plus retained `@pto.cube` / `@pto.simd` entry points | Cross-pipeline ordering is provided by the surrounding `@pto.jit` schedule. Helpers may still use `mem_bar` for intra-pipeline ordering when UB addresses alias. |
 
 **Rule of thumb**: in `mode="auto"`, think in tiles and let the compiler handle
 orchestration. In `mode="explicit"`, think in micro-instructions and place the
diff --git a/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md b/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md
index ddd2fd42d9..6bce455263 100644
--- a/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md
+++ b/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md
@@ -12,7 +12,7 @@ flash_attention(...)           L0  user-facing wrapper
        ├─ Tile Ops                 tile.load / tile.store at the GM↔UB boundary
        ├─ explicit orchestration   mte_load / pipe_barrier / pointer sequencing
        ├─ @pto.cube               qk_matmul / pv_matmul
-       ├─ @pto.simd               online_softmax_rows
+       ├─ @pto.tileop             online_softmax_rows
        └─ @pto.simt               materialize_tile_bounds / blend_output_rows
 ```
 
@@ -497,10 +497,10 @@ def pv_matmul(
 
 Structurally identical to `qk_matmul`, but without transposition and with different input/output tiles. The scratch tiles `p_l0a`, `v_l0b`, and `pv_acc` are reused across KV blocks — the caller (top-level kernel) allocates them once.
 
-## 11.6 SIMD sub-kernel — online softmax
+## 11.6 Tile-op sub-kernel — online softmax
 
 ```python
-@pto.simd
+@pto.tileop
 def online_softmax_rows(
     s_tile: pto.Tile,
     p_tile: pto.Tile,
@@ -682,6 +682,6 @@ After all KV blocks: the top-level kernel issues `tile.store(o_final_tile, o_par
 
 **Tile-level boundary vs micro-instruction boundary**: `tile.load`/`tile.store` are the tile-atomic surface used in auto mode and at the top-level tile boundary of this sketch. `mte_load` appears in explicit orchestration, authored as individual pointer-based instructions. The abstraction split is auto mode as tile-centric authoring, explicit mode as user-ordered orchestration.
 
-**No vreg across sub-kernel boundaries**: vector registers are local to each `@pto.simd` kernel. Data crosses sub-kernel boundaries through UB tiles — the boundary contract is enforced by the type system.
+**No vreg across sub-kernel boundaries**: vector registers are local to each `@pto.tileop` kernel. Data crosses sub-kernel boundaries through UB tiles — the boundary contract is enforced by the type system.
 
-**Invocation flexibility**: This sketch uses the explicit `@pto.jit(entry=True, mode="explicit")` path for full micro-instruction control. The same named sub-kernels can also be reused from `@pto.jit(mode="auto")` when the body stays within the auto-mode contract, or written inline as context managers (`with pto.simd():`, etc.). The orchestration logic could be extracted into `@pto.jit(entry=False)` kernel modules for reuse across multiple entry kernels. See Chapter 3 for details.
+**Invocation flexibility**: This sketch uses the explicit `@pto.jit(entry=True, mode="explicit")` path for full micro-instruction control. The same named sub-kernels can also be reused from `@pto.jit(mode="auto")` when the body stays within the auto-mode contract, or written inline as context managers (`with pto.tileop():`, etc.). The orchestration logic could be extracted into `@pto.jit(entry=False)` kernel modules for reuse across multiple entry kernels. See Chapter 3 for details.
diff --git a/ptodsl/docs/user_guide/12-additional-examples.md b/ptodsl/docs/user_guide/12-additional-examples.md
index 6e870c9a72..79604fef83 100644
--- a/ptodsl/docs/user_guide/12-additional-examples.md
+++ b/ptodsl/docs/user_guide/12-additional-examples.md
@@ -76,11 +76,11 @@ When a data dimension is not evenly divisible by the tile size or the hardware v
 
 ### 12.2.1 Tail handling in a SIMD kernel
 
-Below is a self-contained `@pto.simd` kernel that adds two tiles row by row, handling column tails with `make_mask`:
+Below is a self-contained `@pto.tileop` kernel that adds two tiles row by row, handling column tails with `make_mask`:
 
-<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"tail.simd_helper","symbol":"tail_simd_helper_probe","compile":{"BLOCK":128}} -->
+<!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"tail.tileop_helper","symbol":"tail_simd_helper_probe","compile":{"BLOCK":128}} -->
 ```python
-@pto.simd
+@pto.tileop
 def add_rows_with_tail(a_tile: pto.Tile, b_tile: pto.Tile, o_tile: pto.Tile,
                        rows: pto.i32, cols: pto.i32):
     VEC = pto.elements_per_vreg(pto.f32)          # 64 for f32
@@ -380,10 +380,10 @@ def online_layernorm(
 |------|-----|
 | Whole-kernel orchestration, GM↔UB boundary | `@pto.jit` |
 | Tile-level data movement | `tile.load` / `tile.store` |
-| Custom row-wise vector math | `@pto.simd` |
+| Custom row-wise vector math | `@pto.tileop` |
 | Custom per-element logic | `@pto.simt` |
 | Matrix multiply | `@pto.cube` |
 | Micro-instruction-level control | `mode="explicit"` |
-| Inline compute for quick prototyping | `with pto.simd():` etc. |
+| Inline compute for quick prototyping | `with pto.tileop():` etc. |
 
-**Respect boundary contracts.** Vregs don't cross `@pto.simd` boundaries. Cube-local state doesn't leak into UB. Tile Ops and MTE Ops belong to different programming models — use Tile Ops in `mode="auto"`, and micro-instructions in `mode="explicit"`.
+**Respect boundary contracts.** Vregs don't cross `@pto.tileop` boundaries. Cube-local state doesn't leak into UB. Tile Ops and MTE Ops belong to different programming models — use Tile Ops in `mode="auto"`, and micro-instructions in `mode="explicit"`.
diff --git a/ptodsl/examples/dynamic_softmax_launch.py b/ptodsl/examples/dynamic_softmax_launch.py
index e99343f837..f8bf040e71 100644
--- a/ptodsl/examples/dynamic_softmax_launch.py
+++ b/ptodsl/examples/dynamic_softmax_launch.py
@@ -105,7 +105,7 @@ def dynamic_softmax(
     pto.set_flag("MTE2", "V", event_id=0)
     pto.wait_flag("MTE2", "V", event_id=0)
 
-    with pto.simd():
+    with pto.tileop():
         remaining_rows = runtime_rows
         for row_base in range(0, runtime_rows, lane_num):
             active_rows, remaining_after_pack = pto.make_mask(pto.f32, remaining_rows)
diff --git a/ptodsl/examples/fast_inverse_dense_launch.py b/ptodsl/examples/fast_inverse_dense_launch.py
index bba3835cba..d02d3c7e3f 100644
--- a/ptodsl/examples/fast_inverse_dense_launch.py
+++ b/ptodsl/examples/fast_inverse_dense_launch.py
@@ -107,7 +107,7 @@ def fast_inverse_dense_f32(
     pto.set_flag("MTE2", "V", event_id=0)
     pto.wait_flag("MTE2", "V", event_id=0)
 
-    with pto.simd():
+    with pto.tileop():
         active, _ = pto.make_mask(pto.f32, batch_i32)
         d00 = pto.vlds(in_tile[0, 0:])
         d01 = pto.vlds(in_tile[1, 0:])
diff --git a/ptodsl/examples/flash_attention/gu.py b/ptodsl/examples/flash_attention/gu.py
index 996965ce55..b12cb2f94d 100644
--- a/ptodsl/examples/flash_attention/gu.py
+++ b/ptodsl/examples/flash_attention/gu.py
@@ -13,7 +13,7 @@
 
 - ``fa_gu_init_vpto_kernel`` / ``fa_gu_update_vpto_kernel`` are ptr-ABI VPTO
   child modules.
-- ``fa_gu_init_vpto`` / ``fa_gu_update_vpto`` are Tile-ABI ``@pto.simd``
+- ``fa_gu_init_vpto`` / ``fa_gu_update_vpto`` are Tile-ABI ``@pto.tileop``
   adapters for callers such as ``flash_attention_vf_fusion.py``.
 - ``fa_gu_init_vpto_validate`` / ``fa_gu_update_vpto_validate`` are host-visible
   launch wrappers for standalone validation.
@@ -115,7 +115,7 @@ def fa_gu_update_vpto_kernel(
             pto.vsts(out_vec, o_ptr, row_base + col, mask, dist="NORM_B32")
 
 
-@pto.simd
+@pto.tileop
 def fa_gu_init_vpto(
     pv_tile: pto.Tile,
     o_tile: pto.Tile,
@@ -129,7 +129,7 @@ def fa_gu_init_vpto(
     )
 
 
-@pto.simd
+@pto.tileop
 def fa_gu_update_vpto(
     o_tile: pto.Tile,
     pv_tile: pto.Tile,
diff --git a/ptodsl/examples/flash_attention/softmax.py b/ptodsl/examples/flash_attention/softmax.py
index 434744d1c4..5cca232d25 100644
--- a/ptodsl/examples/flash_attention/softmax.py
+++ b/ptodsl/examples/flash_attention/softmax.py
@@ -14,13 +14,13 @@
 - ``fa_softmax_init_vpto_kernel`` / ``fa_softmax_update_vpto_kernel``:
   ptr-ABI VPTO child modules intended to become separate backend objects
 - ``fa_softmax_init_vpto`` / ``fa_softmax_update_vpto``:
-  Tile-ABI ``@pto.simd`` adapters that materialize ``as_ptr()`` internally
+  Tile-ABI ``@pto.tileop`` adapters that materialize ``as_ptr()`` internally
 - ``fa_softmax_vpto_probe``: minimal entry wrapper for compile-only inspection
 
 The intended structure is:
 
 - auto-mode callers only see Tile arguments
-- the ``@pto.simd`` adapter bridges Tile -> ptr
+- the ``@pto.tileop`` adapter bridges Tile -> ptr
 - the explicit VPTO kernel module owns the micro-instruction body
 """
 
@@ -237,7 +237,7 @@ def fa_softmax_update_vpto_kernel(
         pto.vsts(exp_scale, exp_scale_ptr, row, one32, dist="1PT_B32")
 
 
-@pto.simd
+@pto.tileop
 def fa_softmax_init_vpto(
     qk: pto.Tile,
     p_nz: pto.Tile,
@@ -257,7 +257,7 @@ def fa_softmax_init_vpto(
     )
 
 
-@pto.simd
+@pto.tileop
 def fa_softmax_update_vpto(
     qk: pto.Tile,
     p_nz: pto.Tile,
diff --git a/ptodsl/examples/flash_attention_sketch.py b/ptodsl/examples/flash_attention_sketch.py
index 04d037a97c..77ca38db33 100644
--- a/ptodsl/examples/flash_attention_sketch.py
+++ b/ptodsl/examples/flash_attention_sketch.py
@@ -17,7 +17,7 @@
            ├─ Tile Ops                 tile.load / tile.store at the GM↔UB boundary
            ├─ explicit orchestration   mte_load / pipe_barrier / pointer sequencing
            ├─ @pto.cube               matrix products (QK^T and P@V)
-           ├─ @pto.simd               row-wise online softmax
+           ├─ @pto.tileop              row-wise online softmax
            └─ @pto.simt               scalar metadata and output blending
 
 Design rules illustrated here:
@@ -375,7 +375,7 @@ def flash_attention_kernel(
 # Boundary contract:
 # - Tile arguments are UB-backed or cube-local buffers carrying addressable
 #   storage.
-# - No vector register escapes a simd function.
+# - No vector register escapes a tile-op helper.
 # - No implicit global-memory access happens inside these kernels.
 
 
@@ -431,7 +431,7 @@ def pv_matmul(
     pto.mte_l0c_ub(pv_acc.as_ptr(), pv_tile.as_ptr(), m, n, n, n, 0)
 
 
-@pto.simd
+@pto.tileop
 def online_softmax_rows(
     s_tile: pto.Tile,          # UB, [Br, Bc]
     p_tile: pto.Tile,          # UB, [Br, Bc], output
@@ -685,7 +685,7 @@ def kv_block_process(
 # │                                                                            │
 # │   Key idea: UB tiles are inputs/outputs; cube-local state is explicit.    │
 # ├──────────────────────────────────────────────────────────────────────────┤
-# │ @pto.simd           Row-wise vector math                                   │
+# │ @pto.tileop         Row-wise vector math                                   │
 # │                                                                            │
 # │   online_softmax_rows                                                      │
 # │   vreg stays local; persistent state is written back to UB tiles           │
diff --git a/ptodsl/examples/flash_attention_softmax_launch.py b/ptodsl/examples/flash_attention_softmax_launch.py
index fe8a7bed1a..b174cc0711 100644
--- a/ptodsl/examples/flash_attention_softmax_launch.py
+++ b/ptodsl/examples/flash_attention_softmax_launch.py
@@ -118,7 +118,7 @@ def kernel(
         pto.set_flag("MTE2", "V", event_id=0)
         pto.wait_flag("MTE2", "V", event_id=0)
 
-        with pto.simd():
+        with pto.tileop():
             remaining_rows = runtime_rows
             for row_base in range(0, runtime_rows, lane_num):
                 active_rows, remaining_after_pack = pto.make_mask(pto.f32, remaining_rows)
diff --git a/ptodsl/examples/inverse_block_inversion_launch.py b/ptodsl/examples/inverse_block_inversion_launch.py
index 80e207e481..9659dcea83 100644
--- a/ptodsl/examples/inverse_block_inversion_launch.py
+++ b/ptodsl/examples/inverse_block_inversion_launch.py
@@ -106,7 +106,7 @@ def inverse_block_inversion_f32(
     pto.set_flag("MTE2", "V", event_id=0)
     pto.wait_flag("MTE2", "V", event_id=0)
 
-    with pto.simd():
+    with pto.tileop():
         active, _ = pto.make_mask(pto.f32, batch_i32)
         d00 = pto.vlds(in_tile[0, 0:])
         d10 = pto.vlds(in_tile[2, 0:])
diff --git a/ptodsl/examples/mixed_backend_kernel_module.py b/ptodsl/examples/mixed_backend_kernel_module.py
index 9cfffea591..54025b68a4 100644
--- a/ptodsl/examples/mixed_backend_kernel_module.py
+++ b/ptodsl/examples/mixed_backend_kernel_module.py
@@ -58,7 +58,7 @@ def scale_row_kernel_module(
     base_gm: pto.ptr(pto.f32, "gm"),
     row: pto.i32,
 ):
-    with pto.simd():
+    with pto.tileop():
         c0_i64 = pto.const(0, dtype=pto.i64)
         row_offset = row * _ROW_ELEMS
         row_gm = pto.addptr(base_gm, row_offset)
diff --git a/ptodsl/examples/softmax_dsl.py b/ptodsl/examples/softmax_dsl.py
index 331e6a95d7..b57eced1d8 100644
--- a/ptodsl/examples/softmax_dsl.py
+++ b/ptodsl/examples/softmax_dsl.py
@@ -85,7 +85,7 @@ def kernel(
             pto.set_flag("MTE2", "V", event_id=0)
             pto.wait_flag("MTE2", "V", event_id=0)
 
-            with pto.simd():
+            with pto.tileop():
                 remaining_rows = runtime_rows
                 for row_base in range(0, runtime_rows, packed_rows):
                     active_rows, remaining_rows = pto.make_mask(pto.f32, remaining_rows)
diff --git a/ptodsl/examples/tadd_dsl.py b/ptodsl/examples/tadd_dsl.py
index 3248643a52..5c7deaf527 100644
--- a/ptodsl/examples/tadd_dsl.py
+++ b/ptodsl/examples/tadd_dsl.py
@@ -18,7 +18,7 @@
       %c0_i64    = arith.constant 0 : i64    # pto.const(0, dtype=pto.int64)
       %c16       = arith.constant 16 : index # pto.const(16, dtype=pto.index)
       …
-      pto.simd {                              # with pto.simd():
+      pto.tileop {                           # with pto.tileop():
         %0 = pto.castptr %c4096_i64 …        #   pto.castptr(c4096_i64, …)
         scf.for %arg0 = %c0 to %c16 … {      #   for i in range(c0, c16, c1):
           %mask, _ = pto.plt_b32 …           #     pto.plt_b32(c64_i32)
@@ -43,7 +43,7 @@ def TADD():
     c64_i32   = pto.const(64,   dtype=pto.int32)
     c64       = pto.const(64)
 
-    with pto.simd():
+    with pto.tileop():
         ptr_f32_ub   = pto.ptr(pto.float32, "ub")
         vf32         = pto.vreg_type(64, pto.float32)
         ptr_src      = pto.castptr(c4096_i64, ptr_f32_ub)
diff --git a/ptodsl/examples/tilelang_codegen.py b/ptodsl/examples/tilelang_codegen.py
index dca45f0ada..9b9cebfb35 100644
--- a/ptodsl/examples/tilelang_codegen.py
+++ b/ptodsl/examples/tilelang_codegen.py
@@ -76,7 +76,7 @@ def _tilelang_generated_body(
         pto.set_flag("MTE2", "V", event_id=iter % 2)
         pto.wait_flag("MTE2", "V", event_id=iter % 2)
         pto.wait_flag("MTE3", "V", event_id=iter % 2)
-        with pto.simd():
+        with pto.tileop():
             mask_cnt = 8192
             with pto.for_(0, 128, step=1) as i:
                 mask = pto.pset_b32("PAT_ALL")
@@ -170,7 +170,7 @@ def _tilelang_generated_body_small(A, B, C):
         pto.set_flag("MTE2", "V", event_id=iter % 2)
         pto.wait_flag("MTE2", "V", event_id=iter % 2)
         pto.wait_flag("MTE3", "V", event_id=iter % 2)
-        with pto.simd():
+        with pto.tileop():
             with pto.for_(0, 2, step=1) as i:
                 mask = pto.pset_b32("PAT_ALL")
                 r0 = pto.vlds(
diff --git a/ptodsl/ptodsl/_diagnostics.py b/ptodsl/ptodsl/_diagnostics.py
index de1f303d58..d7fdb1264b 100644
--- a/ptodsl/ptodsl/_diagnostics.py
+++ b/ptodsl/ptodsl/_diagnostics.py
@@ -281,14 +281,25 @@ def inline_subkernel_value_escape_error(role: str, type_text: str) -> RuntimeErr
     )
 
 
-def simd_value_escape_error(type_text: str) -> RuntimeError:
+def simd_value_escape_error(type_text: str, *, surface: str = "@pto.simd") -> RuntimeError:
     """Return one diagnostic for transient SIMD values escaping a simd subkernel boundary."""
     return RuntimeError(
-        f"@pto.simd cannot return transient SIMD values across the subkernel boundary "
+        f"{surface} cannot return transient SIMD values across the subkernel boundary "
         f"(got {type_text}). Write the value back to a Tile/UB buffer instead."
     )
 
 
+def subkernel_return_boundary_error(role: str, observed: object) -> TypeError:
+    """Return one diagnostic for unsupported PTODSL subkernel return values."""
+    # ``observed`` is rendered with repr(); the visible callers pass either a
+    # Python type name or the textual form of the offending value's type.
+    return TypeError(
+        f"@pto.{role} return values must be PTO scalar values or tuples/lists/dicts of PTO scalar values. "
+        f"Got {observed!r}. Return Tile/TensorView/PartitionTensorView/ptr data through explicit "
+        "subkernel operands instead."
+    )
+
+
 def tile_row_alignment_error(*, shape, dtype, row_bytes: int, required_alignment: int) -> TypeError:
     """Return one diagnostic for authored tile shapes violating row-byte alignment."""
     return TypeError(
@@ -360,14 +371,14 @@ def unsupported_public_surface_error(name: str) -> AttributeError:
     hints = {
         "ukernel": (
             'Use @pto.jit(mode="explicit") for explicit DMA orchestration, and call or inline '
-            "@pto.simd/@pto.simt/@pto.cube directly from that kernel."
+            "@pto.tileop helpers directly from that kernel."
         ),
         "tile_buf_type": (
             "Use pto.alloc_tile(shape=..., dtype=..., memory_space=..., valid_shape=..., addr=...) "
             "to author tiles, and keep explicit tile-type construction inside internal implementation code only."
         ),
         "vecscope": (
-            "Use @pto.simd for named SIMD helpers, or inline SIMD code with `with pto.simd():`."
+            "Use @pto.tileop for named custom OP helpers, or inline custom OP code with `with pto.tileop():`."
         ),
         "as_ptr": (
             "Use tile.as_ptr(), view.as_ptr(), or partition.as_ptr() on the authored object itself "
diff --git a/ptodsl/ptodsl/_subkernels.py b/ptodsl/ptodsl/_subkernels.py
index 1777177074..9d1ab64d2d 100644
--- a/ptodsl/ptodsl/_subkernels.py
+++ b/ptodsl/ptodsl/_subkernels.py
@@ -23,11 +23,12 @@
     subkernel_illegal_annotation_error,
     subkernel_illegal_parameter_kind_error,
     subkernel_missing_annotation_error,
+    subkernel_return_boundary_error,
     subkernel_signature_boundary_error,
 )
 from ._ast_rewrite import rewrite_jit_function
 from ._host_tensors import TensorSpec, looks_like_host_tensor
-from ._surface_types import Tile
+from ._surface_types import PartitionTensorView, TensorView, Tile
 from ._surface_values import unwrap_surface_value
 from ._tracing import current_runtime, current_session
 from ._types import (
@@ -59,9 +60,11 @@
     ui32,
     ui64,
 )
+from mlir.ir import FloatType, IndexType, IntegerType
 
 
 class KernelRole(str, Enum):
+    TILEOP = "tileop"
     CUBE = "cube"
     SIMD = "simd"
     SIMT = "simt"
@@ -188,11 +191,15 @@ def _validate_invocation(self, *args, **kwargs) -> None:
                 raise subkernel_host_tensor_boundary_error(self.spec.role.value, name)
 
     def _validate_result(self, result) -> None:
-        if self.spec.role != KernelRole.SIMD:
+        if self.spec.role in {KernelRole.TILEOP, KernelRole.SIMD}:
+            escaped_type = _find_transient_simd_escape(result)
+            if escaped_type is not None:
+                raise simd_value_escape_error(escaped_type, surface=f"@pto.{self.spec.role.value}")
+            _validate_subkernel_scalar_result(self.spec.role.value, result)
+            return
+        if self.spec.role == KernelRole.CUBE:
+            _validate_subkernel_scalar_result(self.spec.role.value, result)
             return
-        escaped_type = _find_transient_simd_escape(result)
-        if escaped_type is not None:
-            raise simd_value_escape_error(escaped_type)
 
 
 class _SimtLaunchTemplate:
@@ -233,6 +240,38 @@ def _find_transient_simd_escape(value):
     return None
 
 
+def _is_scalar_result_type(type_obj) -> bool:
+    """Return True when ``type_obj`` is an MLIR index, integer, or float type."""
+    scalar_kinds = (IndexType, IntegerType, FloatType)
+    return any(
+        kind.isinstance(type_obj) for kind in scalar_kinds
+    )
+
+
+def _validate_subkernel_scalar_result(role: str, value) -> None:
+    """Reject subkernel return values that are not PTO scalars.
+
+    ``None`` passes. Tuples, lists, and dict values are checked recursively;
+    any other value must unwrap to an object whose ``type`` is a scalar type.
+
+    Raises:
+        TypeError: built by ``subkernel_return_boundary_error`` otherwise.
+    """
+    if value is None:
+        return
+    if isinstance(value, dict):
+        value = tuple(value.values())
+    if isinstance(value, (tuple, list)):
+        for item in value:
+            _validate_subkernel_scalar_result(role, item)
+        return
+    type_obj = getattr(unwrap_surface_value(value), "type", None)
+    if type_obj is None:
+        raise subkernel_return_boundary_error(role, type(value).__name__)
+    if not _is_scalar_result_type(type_obj):
+        raise subkernel_return_boundary_error(role, str(type_obj))
+
+
 def _is_supported_runtime_scalar_annotation(annotation) -> bool:
     return (
         isinstance(annotation, _DType)
@@ -292,6 +331,10 @@ def _normalize_subkernel_annotation(annotation):
     text = annotation.strip()
     if text in {"Tile", "pto.Tile"}:
         return Tile
+    if text in {"TensorView", "pto.TensorView"}:
+        return TensorView
+    if text in {"PartitionTensorView", "pto.PartitionTensorView"}:
+        return PartitionTensorView
     if text in _POSTPONED_DTYPE_ANNOTATIONS:
         return _POSTPONED_DTYPE_ANNOTATIONS[text]
     if text.startswith("pto.ptr(") and text.endswith(")"):
@@ -301,11 +344,17 @@ def _normalize_subkernel_annotation(annotation):
     return annotation
 
 
+def _allows_view_annotations(role: KernelRole) -> bool:
+    return role in {KernelRole.TILEOP, KernelRole.SIMD}
+
+
 def _is_supported_subkernel_annotation(role: KernelRole, annotation) -> bool:
     if annotation is Tile:
         return True
     if role == KernelRole.CUBE:
         return False
+    if _allows_view_annotations(role) and annotation in {TensorView, PartitionTensorView}:
+        return True
     if _is_supported_runtime_scalar_annotation(annotation):
         return True
     if role == KernelRole.SIMT and isinstance(annotation, _PtrDescriptor):
@@ -316,8 +365,11 @@ def _is_supported_subkernel_annotation(role: KernelRole, annotation) -> bool:
 def _expected_subkernel_annotation_summary(role: KernelRole) -> str:
     if role == KernelRole.CUBE:
         return "pto.Tile parameters only"
-    if role == KernelRole.SIMD:
-        return "pto.Tile parameters plus PTO scalar annotations such as pto.i32/pto.f32"
+    if role in {KernelRole.TILEOP, KernelRole.SIMD}:
+        return (
+            "pto.Tile / pto.TensorView / pto.PartitionTensorView parameters plus PTO scalar "
+            "annotations such as pto.i32/pto.f32"
+        )
     return "pto.Tile parameters, typed pto.ptr(...) values, and PTO scalar annotations"
 
 
@@ -347,12 +399,47 @@ def _is_runtime_scalar_value(value) -> bool:
     )
 
 
+def _is_tensor_view_value(value) -> bool:
+    raw_value = unwrap_surface_value(value)
+    type_obj = getattr(raw_value, "type", None)
+    if type_obj is None:
+        return False
+    return str(type_obj).startswith("!pto.tensor_view<")
+
+
+def _is_partition_tensor_view_value(value) -> bool:
+    raw_value = unwrap_surface_value(value)
+    type_obj = getattr(raw_value, "type", None)
+    if type_obj is None:
+        return False
+    return str(type_obj).startswith("!pto.partition_tensor_view<")
+
+
 def _normalize_subkernel_argument(role: KernelRole, name: str, annotation, value):
     if annotation is Tile:
         if isinstance(value, Tile):
             return value
         raise subkernel_argument_type_error(role.value, name, "a pto.Tile value", type(value).__name__)
 
+    if annotation is TensorView:
+        if _allows_view_annotations(role) and isinstance(value, TensorView) and _is_tensor_view_value(value):
+            return value
+        raise subkernel_argument_type_error(role.value, name, "a pto.TensorView value", type(value).__name__)
+
+    if annotation is PartitionTensorView:
+        if (
+            _allows_view_annotations(role)
+            and isinstance(value, PartitionTensorView)
+            and _is_partition_tensor_view_value(value)
+        ):
+            return value
+        raise subkernel_argument_type_error(
+            role.value,
+            name,
+            "a pto.PartitionTensorView value",
+            type(value).__name__,
+        )
+
     if _is_supported_runtime_scalar_annotation(annotation):
         if isinstance(value, (bool, int, float)):
             from ._ops import const
@@ -505,6 +592,10 @@ def _decorate_subkernel(
     )
 
 
+def tileop(fn=None, *, name: str | None = None, target: str = "a5", ast_rewrite: bool = True):
+    return _decorate_subkernel(KernelRole.TILEOP, fn, name=name, target=target, ast_rewrite=ast_rewrite)
+
+
 def cube(fn=None, *, name: str | None = None, target: str = "a5", ast_rewrite: bool = True):
     return _decorate_subkernel(KernelRole.CUBE, fn, name=name, target=target, ast_rewrite=ast_rewrite)
 
@@ -551,6 +642,7 @@ def simt(
     "KernelRole",
     "SubkernelSpec",
     "SubkernelTemplate",
+    "tileop",
     "cube",
     "simd",
     "simt",
diff --git a/ptodsl/ptodsl/_tracing/runtime.py b/ptodsl/ptodsl/_tracing/runtime.py
index e11740c98a..bb41b0b684 100644
--- a/ptodsl/ptodsl/_tracing/runtime.py
+++ b/ptodsl/ptodsl/_tracing/runtime.py
@@ -63,7 +63,7 @@ def finalize_session(self, session):
     def dispatch_subkernel_call(self, subkernel, *args, **kwargs):
         """Dispatch a decorated PTODSL subkernel call in the active trace."""
         session = require_active_session(f"@pto.{subkernel.spec.role.value}")
-        if subkernel.spec.role.value in {"cube", "simd", "simt"}:
+        if subkernel.spec.role.value in {"tileop", "cube", "simd", "simt"}:
             return session.lower_helper_subkernel(subkernel, *args, **kwargs)
         return subkernel.emit_body(*args, **kwargs)
 
diff --git a/ptodsl/ptodsl/_tracing/session.py b/ptodsl/ptodsl/_tracing/session.py
index c2390f1d63..a8eafc5601 100644
--- a/ptodsl/ptodsl/_tracing/session.py
+++ b/ptodsl/ptodsl/_tracing/session.py
@@ -36,6 +36,7 @@
     IntegerType,
     Operation,
     StringAttr,
+    TypeAttr,
     UnitAttr,
 )
 
@@ -154,6 +155,7 @@ def __init__(self, module_spec, module, entry_function):
         self._carry_loop_stack = []
         self._inline_subkernel_counter = 0
         self._escaped_inline_values: dict[object, tuple[str, str]] = {}
+        self._helper_result_templates: dict[tuple[str, tuple], object] = {}
 
     @property
     def current_function(self):
@@ -232,14 +234,20 @@ def _next_inline_subkernel_symbol(self, base_symbol_name: str) -> str:
         return f"{base_symbol_name}_{suffix}"
 
     def _create_subkernel_section_op(self, role: str):
-        if role == "simd":
-            return _pto.SectionVectorOp()
-        if role == "cube":
-            return _pto.SectionCubeOp()
         return None
 
+    def _canonical_helper_role(self, role: str) -> str:
+        if role in {"tileop", "simd", "cube"}:
+            return "tileop"
+        return role
+
     def _create_inline_subkernel_wrapper(self, role: str):
-        wrapper_op = self._create_subkernel_section_op(role)
+        if role in {"tileop", "simd"}:
+            wrapper_op = _pto.SectionVectorOp()
+        elif role == "cube":
+            wrapper_op = _pto.SectionCubeOp()
+        else:
+            wrapper_op = self._create_subkernel_section_op(role)
         if wrapper_op is None:
             wrapper_op = _pto.VecScopeOp()
         body_block = wrapper_op.body.blocks.append()
@@ -247,19 +255,13 @@ def _create_inline_subkernel_wrapper(self, role: str):
 
     def _subkernel_helper_attributes(self, role: str) -> tuple[tuple[str, object], ...]:
         attrs: list[tuple[str, object]] = []
-        if role in {"simd", "cube"}:
-            attrs.append(("pto.ptodsl.subkernel_helper", StringAttr.get(role)))
+        helper_role = self._canonical_helper_role(role)
+        if helper_role == "tileop":
+            attrs.append(("pto.tileop.helper", UnitAttr.get()))
         if role == "simt":
             attrs.append(("pto.simt_entry", UnitAttr.get()))
         return tuple(attrs)
 
-    def _emit_simt_helper_launch_metadata(self) -> None:
-        i32 = IntegerType.get_signless(32)
-        dim_z = arith.ConstantOp(i32, 1).result
-        dim_y = arith.ConstantOp(i32, 1).result
-        dim_x = arith.ConstantOp(i32, 1).result
-        _pto.StoreVfSimtInfoOp(dim_z, dim_y, dim_x)
-
     def _erase_attached_op(self, op_view) -> None:
         parent = op_view.operation.parent
         if parent is not None:
@@ -381,6 +383,37 @@ def _note_escaped_inline_values(self, values, *, role: str) -> None:
         for value in values:
             self._escaped_inline_values[value] = (role, str(value.type))
 
+    def _flatten_helper_result_templates(self, value) -> tuple:
+        """Flatten a nested helper result into a tuple of leaf values.
+
+        ``None`` flattens to an empty tuple. Tuples and lists contribute their
+        items in order, dicts contribute their values in insertion order, and
+        nesting is flattened depth-first. Any other object is treated as one
+        leaf. The resulting order matches the order in which
+        ``_wrap_helper_call_results`` consumes call results.
+        """
+        if value is None:
+            return ()
+        if isinstance(value, dict):
+            value = tuple(value.values())
+        if not isinstance(value, (tuple, list)):
+            return (value,)
+        flattened = []
+        for item in value:
+            flattened.extend(self._flatten_helper_result_templates(item))
+        return tuple(flattened)
+
+    def _wrap_helper_call_results(self, template, results_iter):
+        if template is None:
+            return None
+        if isinstance(template, tuple):
+            return tuple(self._wrap_helper_call_results(item, results_iter) for item in template)
+        if isinstance(template, list):
+            return [self._wrap_helper_call_results(item, results_iter) for item in template]
+        if isinstance(template, dict):
+            return {name: self._wrap_helper_call_results(item, results_iter) for name, item in template.items()}
+        return wrap_like_surface_value(template, next(results_iter))
+
     def _remap_captured_operands(self, root_ops, capture_mapping) -> None:
         for op_view in self._walk_op_tree(root_ops):
             operands = op_view.operation.operands
@@ -391,7 +424,7 @@ def _remap_captured_operands(self, root_ops, capture_mapping) -> None:
 
     def _outline_inline_subkernel(self, outline_frame: InlineSubkernelOutlineFrame) -> None:
         role = outline_frame.trace_frame.role
-        if role in {"simd", "cube"}:
+        if role in {"tileop", "simd", "cube"}:
             root_ops = (outline_frame.wrapper_op,)
         else:
             root_ops = tuple(outline_frame.body_block.operations)
@@ -414,15 +447,16 @@ def _outline_inline_subkernel(self, outline_frame: InlineSubkernelOutlineFrame)
 
         with InsertionPoint(outline_frame.wrapper_op.operation):
             if role == "simt":
-                self._emit_simt_helper_launch_metadata()
-            func.CallOp(helper_fn, list(captures))
+                self._emit_simt_launch_call(helper_fn, captures, dims=(1, 1, 1))
+            else:
+                func.CallOp(helper_fn, list(captures))
 
         entry_block = helper_fn.add_entry_block()
         with InsertionPoint(entry_block):
             terminator = func.ReturnOp([])
         return_anchor = terminator.operation.opview
 
-        if role in {"simd", "cube"}:
+        if role in {"tileop", "simd", "cube"}:
             outline_frame.wrapper_op.move_before(return_anchor)
             outlined_roots = (outline_frame.wrapper_op,)
         else:
@@ -444,11 +478,13 @@ def lower_helper_subkernel(self, subkernel, *args, **kwargs):
         arg_templates = tuple(args)
         arg_types = tuple(unwrap_surface_value(arg).type for arg in arg_templates)
         owner_symbol_name = self.current_function_owner_symbol_name
+        result_template = None
         helper_spec = HelperFunctionSpec(
             symbol_name=subkernel.spec.symbol_name,
             arg_types=arg_types,
             attributes=self._subkernel_helper_attributes(subkernel.spec.role.value),
         )
+        helper_cache_key = (owner_symbol_name, helper_spec.cache_key())
         helper_fn, created = self.get_or_create_helper_function(
             helper_spec,
             owner_symbol_name=owner_symbol_name,
@@ -466,10 +502,21 @@ def lower_helper_subkernel(self, subkernel, *args, **kwargs):
                 InsertionPoint(entry_block),
             ):
                 with self.enter_subkernel(subkernel):
-                    subkernel.emit_body(*wrapped_args, **kwargs)
-                func.ReturnOp([])
+                    result_template = subkernel.emit_body(*wrapped_args, **kwargs)
+                flat_results = self._flatten_helper_result_templates(result_template)
+                result_types = [unwrap_surface_value(value).type for value in flat_results]
+                helper_fn.operation.attributes["function_type"] = TypeAttr.get(  # signature is patched after the body exists, once result types are known
+                    func.FunctionType.get(list(arg_types), result_types)
+                )
+                func.ReturnOp([unwrap_surface_value(value) for value in flat_results])  # replaces the former unconditional empty return
+            self._helper_result_templates[helper_cache_key] = result_template  # recorded so the lookup in the else-branch can reuse it
+        else:
+            result_template = self._helper_result_templates.get(helper_cache_key)  # None when nothing was recorded for this key
 
-        func.CallOp(helper_fn, [unwrap_surface_value(arg) for arg in arg_templates])
+        call_op = func.CallOp(helper_fn, [unwrap_surface_value(arg) for arg in arg_templates])
+        if result_template is None:  # no result template: the call is still emitted but nothing is handed back
+            return None
+        return self._wrap_helper_call_results(result_template, iter(call_op.results))
 
     def begin_carry_loop(self, start, stop, step, state_items):
         """Materialize one authored ``pto.for_(...).carry(...)`` loop body."""
@@ -497,12 +544,14 @@ def lower_simt_helper_subkernel(self, subkernel, *args, **kwargs):
         """Lower one ``@pto.simt`` call through a dedicated helper function."""
         helper_fn, arg_templates = self._get_or_create_simt_helper_function(subkernel, *args, **kwargs)
 
-        self._emit_simt_helper_launch_metadata()
-        func.CallOp(helper_fn, [unwrap_surface_value(arg) for arg in arg_templates])
+        self._emit_simt_launch_call(helper_fn, arg_templates, dims=(1, 1, 1))  # this path has no caller-supplied dims, so (1, 1, 1) is passed
 
     def lower_simt_launch_subkernel(self, subkernel, *args, dims, **kwargs):
         """Lower one explicit ``pto.simt_launch`` call through a SIMT helper."""
         helper_fn, arg_templates = self._get_or_create_simt_helper_function(subkernel, *args, **kwargs)
+        self._emit_simt_launch_call(helper_fn, arg_templates, dims=dims)
+
+    def _emit_simt_launch_call(self, helper_fn, arg_templates, *, dims) -> None:
         dim_x, dim_y, dim_z = _coerce_simt_launch_dims(dims)
         Operation.create(
             "pto.simt_launch",
diff --git a/ptodsl/ptodsl/pto.py b/ptodsl/ptodsl/pto.py
index 9b6caffa4a..2050323532 100644
--- a/ptodsl/ptodsl/pto.py
+++ b/ptodsl/ptodsl/pto.py
@@ -145,7 +145,7 @@
 
 # ── Decorator ─────────────────────────────────────────────────────────────────
 from ._jit import jit, KernelHandle, merge_jit_modules      # noqa: F401
-from ._subkernels import cube, simd, simt     # noqa: F401
+from ._subkernels import tileop, cube, simd, simt     # noqa: F401
 from ._pipe_namespace import pipe  # noqa: F401
 
 # ── Shorthand dtype aliases ───────────────────────────────────────────────────
diff --git a/ptodsl/tests/support/docs_fragment_fixtures.py b/ptodsl/tests/support/docs_fragment_fixtures.py
index 788cf31786..20a040dac3 100644
--- a/ptodsl/tests/support/docs_fragment_fixtures.py
+++ b/ptodsl/tests/support/docs_fragment_fixtures.py
@@ -537,7 +537,7 @@ def tail_vector_pattern_probe(*, BLOCK: pto.const_expr = 128):
             {SNIPPET_PLACEHOLDER}
         """
     ),
-    "tail.simd_helper": _fixture(
+    "tail.tileop_helper": _fixture(
         f"""
         {SNIPPET_PLACEHOLDER}
 
@@ -591,7 +591,7 @@ def qk_matmul(q_tile: pto.Tile, k_tile: pto.Tile, s_tile: pto.Tile):
             return
 
 
-        @pto.simd
+        @pto.tileop
         def online_softmax(s_tile: pto.Tile, o_tile: pto.Tile, rows: pto.i32, cols: pto.i32):
             return
 
@@ -664,7 +664,7 @@ def kernel_entry_cube_signature_probe(
             my_cube_kernel(input_tile, output_tile, left_scratch, right_scratch, acc_scratch)
         """
     ),
-    "kernel_entry.simd_signature": _fixture(
+    "kernel_entry.tileop_signature": _fixture(
         f"""
         {SNIPPET_PLACEHOLDER}
 
@@ -673,10 +673,10 @@ def kernel_entry_cube_signature_probe(
         def kernel_entry_simd_signature_probe(*, BLOCK: pto.const_expr = 128):
             input_tile = pto.alloc_tile(shape=[1, BLOCK], dtype=pto.f32)
             output_tile = pto.alloc_tile(shape=[1, BLOCK], dtype=pto.f32)
-            my_simd_kernel(input_tile, output_tile, pto.const(1, dtype=pto.i32), pto.const(BLOCK, dtype=pto.i32))
+            my_tileop_kernel(input_tile, output_tile, pto.const(1, dtype=pto.i32), pto.const(BLOCK, dtype=pto.i32))
         """
     ),
-    "kernel_entry.simd_body": _fixture(
+    "kernel_entry.tileop_body": _fixture(
         f"""
         {SNIPPET_PLACEHOLDER}
 
@@ -1005,7 +1005,7 @@ def qk_matmul(q_tile: pto.Tile, k_tile: pto.Tile, s_tile: pto.Tile):
             return
 
 
-        @pto.simd
+        @pto.tileop
         def online_softmax(s_tile: pto.Tile, p_tile: pto.Tile, rows: pto.i32, cols: pto.i32):
             return
 
@@ -1143,7 +1143,7 @@ def data_movement_cube_helper_probe(
     ),
     "compute_ops.vector_compute": _fixture(
         f"""
-        @pto.simd
+        @pto.tileop
         def compute_ops_vector_helper(inp_tile: pto.Tile, out_tile: pto.Tile, row: pto.index):
             col_mask = pto.make_mask(pto.f32, pto.const(16, dtype=pto.i32))
             s_row = pto.vlds(inp_tile[row, 0:])
@@ -1504,7 +1504,7 @@ def qk_matmul(
             return
 
 
-        @pto.simd
+        @pto.tileop
         def online_softmax_rows(
             s_tile: pto.Tile,
             p_tile: pto.Tile,
@@ -1701,7 +1701,7 @@ def flash_attention_inline_simt_scope_probe(*, BLOCK_Q: pto.const_expr = 16, BLO
     ),
     "flash_attention.online_softmax_loop": _fixture(
         f"""
-        @pto.simd
+        @pto.tileop
         def flash_attention_online_softmax_loop_helper(
             s_tile: pto.Tile,
             p_tile: pto.Tile,
@@ -1740,7 +1740,7 @@ def flash_attention_online_softmax_loop_probe(*, BLOCK: pto.const_expr = 16):
     ),
     "flash_attention.online_softmax_compute": _fixture(
         f"""
-        @pto.simd
+        @pto.tileop
         def flash_attention_online_softmax_compute_helper(
             s_tile: pto.Tile,
             p_tile: pto.Tile,
@@ -1784,7 +1784,7 @@ def flash_attention_online_softmax_compute_probe(*, BLOCK: pto.const_expr = 16):
     ),
     "flash_attention.online_softmax_store": _fixture(
         f"""
-        @pto.simd
+        @pto.tileop
         def flash_attention_online_softmax_store_helper(
             s_tile: pto.Tile,
             p_tile: pto.Tile,
diff --git a/ptodsl/tests/test_ast_rewrite_example_ir.py b/ptodsl/tests/test_ast_rewrite_example_ir.py
index c15ec2e3f6..1211f97d6b 100644
--- a/ptodsl/tests/test_ast_rewrite_example_ir.py
+++ b/ptodsl/tests/test_ast_rewrite_example_ir.py
@@ -91,7 +91,7 @@ def kernel():
         c64_i32 = pto.const(64, dtype=pto.int32)
         c64 = pto.const(64)
 
-        with pto.simd():
+        with pto.tileop():
             ptr_f32_ub = pto.ptr(pto.float32, "ub")
             vf32 = pto.vreg_type(64, pto.float32)
             ptr_src = pto.castptr(c4096_i64, ptr_f32_ub)
@@ -158,7 +158,7 @@ def kernel(
                 pto.set_flag("MTE2", "V", event_id=0)
                 pto.wait_flag("MTE2", "V", event_id=0)
 
-                with pto.simd():
+                with pto.tileop():
                     row_loop = pto.for_(0, runtime_rows, step=packed_rows).carry(remained=runtime_rows)
                     with row_loop:
                         row_base = row_loop.iv
@@ -266,7 +266,7 @@ def kernel(
         pto.set_flag("MTE2", "V", event_id=0)
         pto.wait_flag("MTE2", "V", event_id=0)
 
-        with pto.simd():
+        with pto.tileop():
             row_loop = pto.for_(0, runtime_rows, step=lane_num).carry(remained=runtime_rows)
             with row_loop:
                 row_base = row_loop.iv
diff --git a/ptodsl/tests/test_flash_attention_demo_compile.py b/ptodsl/tests/test_flash_attention_demo_compile.py
index b2e99cfe81..63936d522b 100644
--- a/ptodsl/tests/test_flash_attention_demo_compile.py
+++ b/ptodsl/tests/test_flash_attention_demo_compile.py
@@ -72,7 +72,7 @@ def main() -> None:
         "flash attention wrapper compile should encode the VPTO backend directly on the child module",
     )
     expect("func.func @materialize_tile_bounds" in wrapper_text, "wrapper compile should emit the SIMT helper function")
-    expect("pto.store_vfsimt_info" in wrapper_text, "wrapper compile should materialize SIMT caller metadata setup")
+    expect("pto.simt_launch" in wrapper_text, "wrapper compile should materialize SIMT launch ops")
     expect("pto.barrier <PIPE_ALL>" in wrapper_text, "demo phase boundaries should lower to pipe_barrier(Pipe.ALL)")
 
     compiled = demo.flash_attention_kernel.compile(
@@ -105,7 +105,7 @@ def main() -> None:
         "direct compile should encode the VPTO backend directly on the child module",
     )
     expect("!pto.tile_buf<mat, 64x128xf32" in specialized_text, "BLOCK_Q=64 specialization should change the physical Q tile shape")
-    expect("func.call @materialize_tile_bounds" in specialized_text, "direct compile should still route SIMT helpers through func.call")
+    expect("pto.simt_launch @materialize_tile_bounds" in specialized_text, "direct compile should route SIMT helpers through launch ops")
 
     cached = demo.flash_attention_kernel.cached_specializations()
     expect(len(cached) >= 2, "wrapper compile plus explicit compile should populate at least two cached specializations")
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index 2157acdb1a..8d9c56d3c6 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -366,6 +366,9 @@ def emitc_vpto_kernel_module_callsite_simd_helper(
     dst_tile: pto.Tile,
     cols: pto.i32,
 ):
+    mask, _ = pto.make_mask(pto.f32, cols)  # NOTE(review): vector_scalar_return_probe binds make_mask's result to a single name; confirm both forms are valid
+    vec = pto.vlds(src_tile[0, 0:])
+    pto.vsts(vec, dst_tile[0, 0:], mask)
     explicit_vpto_kernel_module(src_tile, dst_tile, cols)
 
 
@@ -647,8 +650,8 @@ def tile_surface_window_matmul_probe():
 INLINE_SUBKERNEL_SCOPE_OBSERVATIONS = []
 
 
-@pto.simd
-def nested_simd_probe():
+@pto.tileop
+def nested_tileop_probe():
     session = current_session()
     frame = session.current_subkernel
     SUBKERNEL_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
@@ -661,6 +664,13 @@ def top_level_cube_probe():
     SUBKERNEL_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
 
 
+@pto.tileop
+def top_level_tileop_probe():  # appends (role, symbol_name, stack depth) for the SUBKERNEL_OBSERVATIONS check in main()
+    session = current_session()
+    frame = session.current_subkernel
+    SUBKERNEL_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
+
+
 @pto.simd
 def top_level_simd_probe():
     session = current_session()
@@ -668,11 +678,83 @@ def top_level_simd_probe():
     SUBKERNEL_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
 
 
+@pto.tileop
+def vector_scalar_return_probe(inp_tile: pto.Tile, out_tile: pto.Tile, cols: pto.i32):
+    col_mask = pto.make_mask(pto.f32, cols)
+    row = pto.const(0)
+    s_row = pto.vlds(inp_tile[row, 0:])
+    pto.vsts(s_row, out_tile[row, 0:], col_mask)
+    return cols  # scalar result; main() expects the lowered helper signature to end in "-> i32"
+
+
+@pto.tileop
+def tileop_view_boundary_probe(
+    src_view: pto.TensorView,
+    src_part: pto.PartitionTensorView,
+    out_tile: pto.Tile,
+    rows: pto.i32,
+):
+    _ = src_view  # intentionally unused; main() only checks that the lowered signature carries a tensor_view formal
+    pto.tile.load(src_part, out_tile)
+    return rows
+
+
+@pto.simd
+def simd_view_boundary_probe(
+    src_view: pto.TensorView,
+    src_part: pto.PartitionTensorView,
+    out_tile: pto.Tile,
+    rows: pto.i32,
+):
+    _ = src_view  # intentionally unused; main() only checks that the lowered signature carries a tensor_view formal
+    pto.tile.load(src_part, out_tile)
+    return rows
+
+
+@pto.jit(target="a5")
+def scalar_return_subkernel_lowering_probe():
+    inp_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[1, 16])
+    out_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[1, 16])
+    stats_tile = pto.alloc_tile(shape=[1, 8], dtype=pto.i32, valid_shape=[1, 2])  # NOTE(review): [1, 2] here but [1, 1] in the sibling entry probes, though one value is stored; confirm intent
+    cols = pto.const(16, dtype=pto.i32)
+    returned_cols = vector_scalar_return_probe(inp_tile, out_tile, cols)
+    scalar.store(returned_cols, stats_tile.as_ptr(), 0)  # consumes the helper's scalar result; main() checks that a pto.store is emitted
+
+
+@pto.jit(target="a5")
+def tileop_view_boundary_entry_probe(
+    A_ptr: pto.ptr(pto.f32, "gm"),
+    rows: pto.i32,
+    cols: pto.i32,
+):
+    src_view = pto.make_tensor_view(A_ptr, shape=[rows, cols], strides=[cols, 1])
+    src_part = pto.partition_view(src_view, offsets=[0, 0], sizes=[rows, cols])
+    out_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[rows, cols])
+    stats_tile = pto.alloc_tile(shape=[1, 8], dtype=pto.i32, valid_shape=[1, 1])
+    returned_rows = tileop_view_boundary_probe(src_view, src_part, out_tile, rows)  # passes both view kinds across the helper call; main() matches them in the call signature
+    scalar.store(returned_rows, stats_tile.as_ptr(), 0)
+
+
+@pto.jit(target="a5")
+def simd_view_boundary_entry_probe(
+    A_ptr: pto.ptr(pto.f32, "gm"),
+    rows: pto.i32,
+    cols: pto.i32,
+):
+    src_view = pto.make_tensor_view(A_ptr, shape=[rows, cols], strides=[cols, 1])
+    src_part = pto.partition_view(src_view, offsets=[0, 0], sizes=[rows, cols])
+    out_tile = pto.alloc_tile(shape=[1, 16], dtype=pto.f32, valid_shape=[rows, cols])
+    stats_tile = pto.alloc_tile(shape=[1, 8], dtype=pto.i32, valid_shape=[1, 1])
+    returned_rows = simd_view_boundary_probe(src_view, src_part, out_tile, rows)  # passes both view kinds across the helper call; main() matches them in the call signature
+    scalar.store(returned_rows, stats_tile.as_ptr(), 0)
+
+
 @pto.jit(target="a5")
 def shared_subkernel_lowering_probe(*, TRACE_TOKEN: pto.const_expr = 0):
     top_level_cube_probe()
+    top_level_tileop_probe()  # call order matters: main() compares SUBKERNEL_OBSERVATIONS against this exact sequence
     top_level_simd_probe()
-    nested_simd_probe()
+    nested_tileop_probe()
 
 
 @pto.jit(target="a5", mode="explicit")
@@ -684,6 +766,10 @@ def inline_subkernel_scope_probe(*, TRACE_TOKEN: pto.const_expr = 0):
         frame = session.current_subkernel
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
         scalar.store(0, meta_tile.as_ptr() + 0)
+    with pto.tileop():  # main() expects this observation after the simt one and before the simd one
+        frame = session.current_subkernel
+        INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
+        pto.pipe_barrier(pto.Pipe.ALL)
     with pto.simd():
         frame = session.current_subkernel
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
@@ -3438,6 +3524,10 @@ def main() -> None:
         decorated_mixed_backend_text,
         "emitc entry calling vpto kernel-module through @pto.simd specialization",
     )
+    decorated_helper_body = decorated_mixed_backend_text.split(
+        "func.func @emitc_vpto_kernel_module_callsite_simd_helper__ptodsl_",
+        1,
+    )[1].split("func.func private @explicit_vpto_kernel_module__ptodsl_", 1)[0]  # text between the two markers; [1] raises IndexError if the first marker is absent
     expect(
         re.search(
             r"call @emitc_vpto_kernel_module_callsite_simd_helper__ptodsl_[0-9a-f]+"
@@ -3447,8 +3537,8 @@ def main() -> None:
         "@pto.simd helper callsites should lower to helper function calls in the caller body",
     )
     expect(
-        "pto.section.vector {" in decorated_mixed_backend_text,
-        "the outlined @pto.simd helper body should still materialize one vector section",
+        "pto.section.vector {" not in decorated_helper_body,
+        "decorated @pto.simd helper bodies should now stay naked in PTODSL IR and rely on later PTOAS section materialization",
     )
     multi_abi_compiled = entry_calls_kernel_module_multiple_abi_probe.compile()
     multi_abi_text = multi_abi_compiled.mlir_text()
@@ -3842,20 +3932,85 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     expect(
         SUBKERNEL_OBSERVATIONS == [
             ("cube", "top_level_cube_probe", 1),
+            ("tileop", "top_level_tileop_probe", 1),
             ("simd", "top_level_simd_probe", 1),
-            ("simd", "nested_simd_probe", 1),
+            ("tileop", "nested_tileop_probe", 1),
         ],
         f"unexpected shared subkernel lowering observations: {SUBKERNEL_OBSERVATIONS!r}",
     )
     expect(
         re.search(r"call @top_level_cube_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None
+        and re.search(r"call @top_level_tileop_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None
         and re.search(r"call @top_level_simd_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None
-        and re.search(r"call @nested_simd_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None,
-        "@pto.cube/@pto.simd decorated subkernels should lower to helper calls in the caller body",
+        and re.search(r"call @nested_tileop_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None,
+        "@pto.tileop/@pto.cube/@pto.simd decorated subkernels should lower to helper calls in the caller body",
+    )
+    expect(
+        shared_subkernel_text.count("pto.tileop.helper") == 4
+        and 'pto.ptodsl.subkernel_helper = "tileop"' not in shared_subkernel_text
+        and 'pto.ptodsl.subkernel_helper = "simd"' not in shared_subkernel_text
+        and 'pto.ptodsl.subkernel_helper = "cube"' not in shared_subkernel_text,
+        "decorated @pto.tileop/@pto.simd/@pto.cube helpers should canonicalize to the tileop backend helper role",
     )
     expect(
-        shared_subkernel_text.count("pto.section.vector {") == 2 and "pto.section.cube {" in shared_subkernel_text,
-        "outlined decorated helper bodies should still preserve their PTO unit sections",
+        "pto.section.vector {" not in shared_subkernel_text
+        and "pto.section.cube {" not in shared_subkernel_text,
+        "decorated @pto.tileop/@pto.simd/@pto.cube helpers should now lower as naked tileop helpers without pre-materialized sections",
+    )
+
+    scalar_return_subkernel_text = scalar_return_subkernel_lowering_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(scalar_return_subkernel_text, "scalar return subkernel lowering specialization")
+    expect(
+        re.search(r"func\.func @vector_scalar_return_probe__ptodsl_[0-9a-f]+\([^)]*\) -> i32", scalar_return_subkernel_text)
+        is not None,
+        "decorated tileop helpers that return PTO scalar values should materialize scalar helper result types",
+    )
+    expect(
+        "pto.vlds" in scalar_return_subkernel_text
+        and "pto.vsts" in scalar_return_subkernel_text
+        and re.search(r"%[a-zA-Z0-9_]+ = call @vector_scalar_return_probe__ptodsl_[0-9a-f]+\([^)]*\) : \([^)]*\) -> i32", scalar_return_subkernel_text)
+        is not None,
+        "decorated tileop helper callsites should consume scalar func.call results while preserving primary vector compute in the helper body",
+    )
+    expect(
+        "pto.store " in scalar_return_subkernel_text,
+        "scalar helper return values should remain usable by later PTODSL scalar stores",
+    )
+
+    tileop_view_boundary_text = tileop_view_boundary_entry_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(tileop_view_boundary_text, "tileop view boundary specialization")
+    expect(
+        re.search(
+            r"func\.func @tileop_view_boundary_probe__ptodsl_[0-9a-f]+\([^)]*!pto\.tensor_view<[^)]*!pto\.partition_tensor_view<[^)]*!pto\.tile_buf<",
+            tileop_view_boundary_text,
+        ) is not None,
+        "decorated tileop helpers should accept TensorView and PartitionTensorView formals in their lowered helper signature",
+    )
+    expect(
+        "pto.tload" in tileop_view_boundary_text
+        and re.search(
+            r"%[a-zA-Z0-9_]+ = call @tileop_view_boundary_probe__ptodsl_[0-9a-f]+\([^)]*\) : \([^)]*!pto\.tensor_view<[^)]*!pto\.partition_tensor_view<[^)]*\) -> i32",
+            tileop_view_boundary_text,
+        ) is not None,
+        "tileop callsites should pass TensorView/PartitionTensorView operands through the helper ABI and preserve scalar returns",
+    )
+
+    simd_view_boundary_text = simd_view_boundary_entry_probe.compile().mlir_text()
+    expect_parse_roundtrip_and_verify(simd_view_boundary_text, "simd view boundary specialization")
+    expect(
+        re.search(
+            r"func\.func @simd_view_boundary_probe__ptodsl_[0-9a-f]+\([^)]*!pto\.tensor_view<[^)]*!pto\.partition_tensor_view<[^)]*!pto\.tile_buf<",
+            simd_view_boundary_text,
+        ) is not None,
+        "retained simd helpers should accept TensorView and PartitionTensorView formals in the same lowered helper ABI as tileop",
+    )
+    expect(
+        "pto.tload" in simd_view_boundary_text
+        and re.search(
+            r"%[a-zA-Z0-9_]+ = call @simd_view_boundary_probe__ptodsl_[0-9a-f]+\([^)]*\) : \([^)]*!pto\.tensor_view<[^)]*!pto\.partition_tensor_view<[^)]*\) -> i32",
+            simd_view_boundary_text,
+        ) is not None,
+        "retained simd callsites should pass TensorView/PartitionTensorView operands through the same helper ABI and preserve scalar returns",
     )
 
     INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.clear()
@@ -3864,20 +4019,28 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     expect(
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS == [
             ("simt", "inline_simt", 1),
+            ("tileop", "inline_tileop", 1),
             ("simd", "inline_simd", 1),
             ("cube", "inline_cube", 1),
         ],
         f"unexpected inline subkernel scope observations: {INLINE_SUBKERNEL_SCOPE_OBSERVATIONS!r}",
     )
     expect(
-        inline_subkernel_scope_text.count("pto.store_vfsimt_info") == 1,
-        "inline pto.simt() should materialize one caller-side store_vfsimt_info before the helper call",
+        len(re.findall(r"pto\.simt_launch @inline_simt_[0-9]+__ptodsl_[0-9a-f]+<<<", inline_subkernel_scope_text)) == 1,
+        "inline pto.simt() should materialize one caller-side pto.simt_launch",  # exact count, as the removed assertion had, so duplicate launches are caught
     )
     expect(
-        re.search(r"call @inline_simt_[0-9]+__ptodsl_[0-9a-f]+\([^\\n]*\)", inline_subkernel_scope_text) is not None
+        re.search(r"call @inline_tileop_[0-9]+__ptodsl_[0-9a-f]+\([^\n]*\)", inline_subkernel_scope_text) is not None  # [^\n]: any char except newline
         and re.search(r"call @inline_simd_[0-9]+__ptodsl_[0-9a-f]+\([^\\n]*\)", inline_subkernel_scope_text) is not None
         and re.search(r"call @inline_cube_[0-9]+__ptodsl_[0-9a-f]+\([^\\n]*\)", inline_subkernel_scope_text) is not None,
-        "inline pto.simt()/pto.simd()/pto.cube() scopes should each lower to one helper call",
+        "inline pto.tileop()/pto.simd()/pto.cube() scopes should each lower to one helper call",
+    )
+    expect(
+        inline_subkernel_scope_text.count("pto.tileop.helper") == 3
+        and 'pto.ptodsl.subkernel_helper = "tileop"' not in inline_subkernel_scope_text
+        and 'pto.ptodsl.subkernel_helper = "simd"' not in inline_subkernel_scope_text
+        and 'pto.ptodsl.subkernel_helper = "cube"' not in inline_subkernel_scope_text,
+        "outlined inline tileop/simd/cube helpers should canonicalize to the tileop backend helper role",
     )
     expect(
         inline_subkernel_scope_text.count("pto.barrier <PIPE_ALL>") >= 2
@@ -3890,16 +4053,8 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     simt_text = simt_helper_lowering_probe.compile(TRACE_TOKEN=1).mlir_text()
     expect_parse_roundtrip_and_verify(simt_text, "simt helper lowering specialization")
     expect(
-        simt_text.count("pto.store_vfsimt_info") == 2,
-        "each @pto.simt callsite should materialize a caller-side store_vfsimt_info",
-    )
-    expect(
-        re.search(r"call @simt_tid_probe__simt_\d+\(\)", simt_text) is not None,
-        "each @pto.simt callsite should lower to a func.call of the helper symbol",
-    )
-    expect(
-        len(re.findall(r"call @simt_tid_probe__simt_\d+\(\)", simt_text)) == 2,
-        "both @pto.simt callsites should call the same helper specialization",
+        len(re.findall(r"pto\.simt_launch @simt_tid_probe__simt_\d+<<<", simt_text)) == 2,
+        "each @pto.simt callsite should materialize a caller-side pto.simt_launch",
     )
     expect(
         len(
@@ -4618,8 +4773,8 @@ def _enter_inline_simt_with_resource_attr():
     simt_pointer_offset_text = simt_pointer_offset_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(simt_pointer_offset_text, "simt pointer offset specialization")
     expect(
-        re.search(r"call @simt_pointer_offset_helper__simt_\d+", simt_pointer_offset_text) is not None,
-        "@pto.simt pointer helper should lower to a helper func.call",
+        re.search(r"pto\.simt_launch @simt_pointer_offset_helper__simt_\d+<<<", simt_pointer_offset_text) is not None,
+        "@pto.simt pointer helper should lower to a caller-side pto.simt_launch",
     )
     expect(
         re.search(r"pto\.store %c9_i32, %(?:arg0|\d+)\[%c1(?:_\d+)?\]", simt_pointer_offset_text) is not None,
diff --git a/ptodsl/tests/test_ptoas_frontend_verify.py b/ptodsl/tests/test_ptoas_frontend_verify.py
index 608bb9a0e2..2b45d53b88 100644
--- a/ptodsl/tests/test_ptoas_frontend_verify.py
+++ b/ptodsl/tests/test_ptoas_frontend_verify.py
@@ -245,7 +245,7 @@ def process_row_ptr_kernel_module(
     dst_gm: pto.ptr(pto.f32, "gm"),
     row: pto.i32,
 ):
-    with pto.simd():
+    with pto.tileop():
         c0_i64 = pto.const(0, dtype=pto.i64)
         row_offset = row * 16
         src_row = pto.addptr(src_gm, row_offset)
diff --git a/ptodsl/tests/test_subkernel_diagnostics.py b/ptodsl/tests/test_subkernel_diagnostics.py
index 98e1486651..94fbd10d96 100644
--- a/ptodsl/tests/test_subkernel_diagnostics.py
+++ b/ptodsl/tests/test_subkernel_diagnostics.py
@@ -49,6 +49,14 @@ def bad_ptr_formal(meta_ptr: pto.ptr(pto.i32, pto.MemorySpace.UB)):
     return bad_ptr_formal
 
 
+def define_illegal_tileop_ptr_signature_probe():
+    @pto.tileop  # main() expects calling this factory to raise TypeError about the ptr-annotated formal
+    def bad_ptr_formal(meta_ptr: pto.ptr(pto.i32, pto.MemorySpace.UB)):
+        pto.pipe_barrier(pto.Pipe.ALL)
+
+    return bad_ptr_formal
+
+
 def define_illegal_cube_scalar_signature_probe():
     @pto.cube
     def bad_cube_formal(tile: pto.Tile, cols: pto.i32):
@@ -131,11 +139,22 @@ def tile_only_probe(inp_tile: pto.Tile):
     pto.pipe_barrier(pto.Pipe.ALL)
 
 
+@pto.tileop
+def invalid_tileop_return_probe(inp_tile: pto.Tile):
+    return inp_tile  # Tile return; main() expects invalid_tileop_return_entry.compile to raise TypeError
+
+
 @pto.jit(target="a5")
 def illegal_subkernel_callsite_entry(A_ptr: pto.ptr(pto.f32, "gm")):
     tile_only_probe(A_ptr)
 
 
+@pto.jit(target="a5")
+def invalid_tileop_return_entry():
+    meta_tile = pto.alloc_tile(shape=[1, 8], dtype=pto.i32, valid_shape=[1, 1])
+    invalid_tileop_return_probe(meta_tile)  # result deliberately ignored; main() expects compile() to raise TypeError for the Tile return
+
+
 @pto.jit(target="a5", mode="explicit")
 def inline_simt_value_escape_entry():
     meta_tile = pto.alloc_tile(shape=[1, 8], dtype=pto.i32, valid_shape=[1, 1])
@@ -150,7 +169,7 @@ def main() -> None:
         AttributeError,
         "pto.ukernel is not a supported PTODSL public interface",
         '@pto.jit(mode="explicit")',
-        "@pto.simd/@pto.simt/@pto.cube",
+        "@pto.tileop",
     )
     expect_raises(
         define_removed_tensor_spec_surface_probe,
@@ -183,7 +202,14 @@ def main() -> None:
         define_illegal_simd_ptr_signature_probe,
         TypeError,
         "@pto.simd parameter 'meta_ptr' uses unsupported subkernel annotation",
-        "pto.Tile parameters plus PTO scalar annotations",
+        "pto.Tile / pto.TensorView / pto.PartitionTensorView parameters plus PTO scalar annotations",
+        "@pto.jit(entry=False)",
+    )
+    expect_raises(
+        define_illegal_tileop_ptr_signature_probe,
+        TypeError,
+        "@pto.tileop parameter 'meta_ptr' uses unsupported subkernel annotation",
+        "pto.Tile / pto.TensorView / pto.PartitionTensorView parameters plus PTO scalar annotations",
         "@pto.jit(entry=False)",
     )
     expect_raises(
@@ -226,6 +252,12 @@ def main() -> None:
         "Expected a pto.Tile value",
         "either pass a legal PTODSL boundary value or remove the subkernel decorator",
     )
+    expect_raises(
+        invalid_tileop_return_entry.compile,
+        TypeError,
+        "@pto.tileop return values must be PTO scalar values",
+        "Return Tile/TensorView/PartitionTensorView/ptr data through explicit subkernel operands instead",
+    )
     expect_raises(
         inline_simt_value_escape_entry.compile,
         RuntimeError,
diff --git a/test/dsl-st/cube_matrix_pipeline.py b/test/dsl-st/cube_matrix_pipeline.py
index 420d235e75..ba4d6b5249 100644
--- a/test/dsl-st/cube_matrix_pipeline.py
+++ b/test/dsl-st/cube_matrix_pipeline.py
@@ -32,7 +32,14 @@
 
 
 @pto.cube
-def cube_gemm_tile(a_mat, b_mat, o_tile, a_l0a, b_l0b, o_acc):
+def cube_gemm_tile(
+    a_mat: pto.Tile,
+    b_mat: pto.Tile,
+    o_tile: pto.Tile,
+    a_l0a: pto.Tile,
+    b_l0b: pto.Tile,
+    o_acc: pto.Tile,
+):
     m = a_mat.valid_shape[0]
     k = a_mat.valid_shape[1]
     n = b_mat.valid_shape[1]
diff --git a/test/dsl-st/predicate_pack.py b/test/dsl-st/predicate_pack.py
index 01876ce133..1a333ddb1a 100644
--- a/test/dsl-st/predicate_pack.py
+++ b/test/dsl-st/predicate_pack.py
@@ -88,7 +88,7 @@ def predicate_pack_part_kernel(
     pto.set_flag("MTE2", "V", event_id=0)
     pto.wait_flag("MTE2", "V", event_id=0)
 
-    with pto.simd():
+    with pto.tileop():
         seed = pto.pset_b8(pto.MaskPattern.ALL)
         src = pto.vlds(src_tile[0, 0:])
         active_b8 = pto.vcmp(src, src, seed, pto.CmpMode.EQ)
diff --git a/test/dsl-st/vmulscvt.py b/test/dsl-st/vmulscvt.py
index 4eb6a66449..dcfc48191c 100644
--- a/test/dsl-st/vmulscvt.py
+++ b/test/dsl-st/vmulscvt.py
@@ -85,7 +85,7 @@ def vmulscvt_pack_kernel(
     pto.set_flag("MTE2", "V", event_id=0)
     pto.wait_flag("MTE2", "V", event_id=0)
 
-    with pto.simd():
+    with pto.tileop():
         mask32 = pto.pset_b32(pto.MaskPattern.ALL)
         mask16 = pto.pset_b16(pto.MaskPattern.ALL)
 
diff --git a/test/lit/pto/plan_memory_ptodsl_tileop_helper_vlds_vsts.pto b/test/lit/pto/plan_memory_ptodsl_tileop_helper_vlds_vsts.pto
new file mode 100644
index 0000000000..550ced4fa8
--- /dev/null
+++ b/test/lit/pto/plan_memory_ptodsl_tileop_helper_vlds_vsts.pto
@@ -0,0 +1,54 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Regression test: PlanMemory must handle PTODSL-style tileop helpers that bridge
+// tile operands through pto.tile_buf_addr into VPTO vector load/store ops.
+// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @plan_memory_ptodsl_tileop_helper_vlds_vsts() attributes {pto.entry} {
+    %src = pto.alloc_tile : !pto.tile_buf<vec, 1x16xf32>
+    %dst = pto.alloc_tile : !pto.tile_buf<vec, 1x16xf32>
+    %c16_i32 = arith.constant 16 : i32
+    func.call @tile_vec_copy(%src, %dst, %c16_i32) :
+      (!pto.tile_buf<vec, 1x16xf32>, !pto.tile_buf<vec, 1x16xf32>, i32) -> ()
+    return
+  }
+
+  func.func private @tile_vec_copy(
+      %arg0: !pto.tile_buf<vec, 1x16xf32>,
+      %arg1: !pto.tile_buf<vec, 1x16xf32>,
+      %arg2: i32)
+      attributes {pto.tileop.helper} {
+    %mask, %tail = pto.plt_b32 %arg2 : i32 -> !pto.mask<b32>, i32
+    %0 = pto.tile_buf_addr %arg0 : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+    %subview = memref.subview %0[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>
+    %c0 = arith.constant 0 : index
+    %vec = pto.vlds %subview[%c0] : memref<16xf32, strided<[1]>, #pto.address_space<vec>> -> !pto.vreg<64xf32>
+    %1 = pto.tile_buf_addr %arg1 : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+    %subview_0 = memref.subview %1[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>
+    %c0_0 = arith.constant 0 : index
+    pto.vsts %vec, %subview_0[%c0_0], %mask : !pto.vreg<64xf32>, memref<16xf32, strided<[1]>, #pto.address_space<vec>>, !pto.mask<b32>
+    return
+  }
+}
+
+// CHECK: IR Dump After PlanMemory
+// CHECK-LABEL: func.func @plan_memory_ptodsl_tileop_helper_vlds_vsts()
+// CHECK: pto.pointer_cast(%{{.*}}) {config = #pto.tile_buf_config
+// CHECK: pto.bind_tile %{{.*}}, %c1, %c16
+// CHECK: pto.pointer_cast(%{{.*}}) {config = #pto.tile_buf_config
+// CHECK: pto.bind_tile %{{.*}}, %c1, %c16
+// CHECK: call @tile_vec_copy
+// CHECK-LABEL: func.func private @tile_vec_copy(
+// CHECK-SAME: pto.tileop.helper
+// CHECK: pto.tile_buf_addr %{{.*}} : memref<1x16xf32, strided<[16, 1], offset: ?>, #pto.address_space<vec>> -> memref<1x16xf32, #pto.address_space<vec>>
+// CHECK: pto.section.vector {
+// CHECK: pto.vlds
+// CHECK: pto.tile_buf_addr %{{.*}} : memref<1x16xf32, strided<[16, 1], offset: ?>, #pto.address_space<vec>> -> memref<1x16xf32, #pto.address_space<vec>>
+// CHECK: pto.vsts
diff --git a/test/lit/pto/tileop_subkernel_call_autosync.pto b/test/lit/pto/tileop_subkernel_call_autosync.pto
new file mode 100644
index 0000000000..fca59f52d7
--- /dev/null
+++ b/test/lit/pto/tileop_subkernel_call_autosync.pto
@@ -0,0 +1,70 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ptoas --pto-arch=a3 --enable-insert-sync %s -o - | FileCheck %s
+
+module {
+  func.func @tileop_subkernel_call_autosync(%src_ptr: !pto.ptr<f32>,
+                                            %dst_ptr: !pto.ptr<f32>) {
+    pto.section.vector {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c16 = arith.constant 16 : index
+      %src_view = pto.make_tensor_view %src_ptr, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<16x16xf32>
+      %dst_view = pto.make_tensor_view %dst_ptr, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<16x16xf32>
+      %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32>
+      %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32>
+      %seed = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %scratch = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %out = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %check = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %bias = arith.constant 1.0 : f32
+      pto.tstore ins(%seed : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+                 outs(%src_part : !pto.partition_tensor_view<16x16xf32>)
+      func.call @scale_store(%src_part, %dst_part, %scratch, %out, %bias) :
+        (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+         !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+         !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+         f32) -> ()
+      pto.tload ins(%dst_part : !pto.partition_tensor_view<16x16xf32>)
+                outs(%check : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    }
+    return
+  }
+
+  func.func private @scale_store(
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %scratch: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %out: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %bias: f32)
+      attributes {pto.internal.non_entry = true, pto.tileop.helper} {
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%scratch : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tadds ins(%scratch, %bias : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void tileop_subkernel_call_autosync(
+// CHECK:      TSTORE(
+// CHECK-NEXT: set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+// CHECK-NEXT: wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+// CHECK:      TLOAD(
+// CHECK:      set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+// CHECK:      wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+// CHECK:      TADDS(
+// CHECK:      set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+// CHECK:      wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+// CHECK:      TSTORE(
+// CHECK:      set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
+// CHECK-NEXT: wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
+// CHECK:      TLOAD(
diff --git a/test/lit/pto/tileop_subkernel_call_sync_model.pto b/test/lit/pto/tileop_subkernel_call_sync_model.pto
new file mode 100644
index 0000000000..92c474dc48
--- /dev/null
+++ b/test/lit/pto/tileop_subkernel_call_sync_model.pto
@@ -0,0 +1,55 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Tileop helper calls carry a phase summary. InsertSync must model the call as
+// multiple synthetic compounds, one per memory-visible phase, instead of the
+// old single conservative V/CUBE compound used for legacy simd/cube helpers.
+// RUN: ptoas --pto-arch=a3 --enable-insert-sync --pto-insert-sync-debug=2 %s -o /dev/null 2>&1 | FileCheck %s
+
+module {
+  func.func @tileop_subkernel_call_sync_model(%arg0: !pto.ptr<f32>, %arg1: !pto.ptr<f32>) {
+    pto.section.vector {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c16 = arith.constant 16 : index
+      %src = pto.make_tensor_view %arg0, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<16x16xf32>
+      %dst = pto.make_tensor_view %arg1, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<16x16xf32>
+      %src_part = pto.partition_view %src, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32>
+      %dst_part = pto.partition_view %dst, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32>
+      %scratch = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %out = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      %bias = arith.constant 1.0 : f32
+      func.call @scale_store(%src_part, %dst_part, %scratch, %out, %bias) :
+        (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+         !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+         !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+         f32) -> ()
+    }
+    return
+  }
+
+  func.func private @scale_store(
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %scratch: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %out: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %bias: f32)
+      attributes {pto.internal.non_entry = true, pto.tileop.helper} {
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%scratch : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tadds ins(%scratch, %bias : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}
+
+// CHECK: COMPOUND func.call [PIPE_MTE2]
+// CHECK: COMPOUND func.call [PIPE_V]
+// CHECK: COMPOUND func.call [PIPE_MTE3]
diff --git a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
index affb448722..f350d05aa0 100644
--- a/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
+++ b/test/lit/pto/tload_tprefetch_low_precision_a5_valid.pto
@@ -23,7 +23,7 @@ module {
   }
 }
 
-// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>)
-// CHECK: pto.declare_tile_memref -> memref<16x16x!pto.hif8
-// CHECK: pto.tload ins(%arg0 : memref<16x16xf8E4M3FN>) outs(
-// CHECK: pto.tprefetch ins(%arg1 : memref<16x16x!pto.hif8>) outs(
+// CHECK: func.func @tload_tprefetch_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN, strided<[?, ?], offset: ?>, #pto.address_space<gm>>, %arg1: memref<16x16x!pto.hif8, strided<[?, ?], offset: ?>, #pto.address_space<gm>>)
+// CHECK: pto.declare_tile_memref -> memref<16x16x!pto.hif8, strided<[16, 1], offset: ?>, #pto.address_space<vec>>
+// CHECK: pto.tload ins(%arg0 : memref<16x16xf8E4M3FN, strided<[?, ?], offset: ?>, #pto.address_space<gm>>) outs(
+// CHECK: pto.tprefetch ins(%arg1 : memref<16x16x!pto.hif8, strided<[?, ?], offset: ?>, #pto.address_space<gm>>) outs(
diff --git a/test/lit/pto/tstore_low_precision_a5_valid.pto b/test/lit/pto/tstore_low_precision_a5_valid.pto
index b139645a04..55f96186d6 100644
--- a/test/lit/pto/tstore_low_precision_a5_valid.pto
+++ b/test/lit/pto/tstore_low_precision_a5_valid.pto
@@ -24,13 +24,13 @@ module {
   }
 }
 
-// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN>, %arg1: memref<16x16x!pto.hif8>, %arg2: i64)
+// CHECK: func.func @tstore_low_precision_a5_valid(%arg0: memref<16x16xf8E4M3FN, strided<[?, ?], offset: ?>, #pto.address_space<gm>>, %arg1: memref<16x16x!pto.hif8, strided<[?, ?], offset: ?>, #pto.address_space<gm>>, %arg2: i64)
 // CHECK: pto.bind_tile
 // CHECK: pto.section.vector {
 // CHECK: pto.tstore ins(
-// CHECK: outs(%arg0 : memref<16x16xf8E4M3FN>)
+// CHECK: outs(%arg0 : memref<16x16xf8E4M3FN, strided<[?, ?], offset: ?>, #pto.address_space<gm>>)
 // CHECK: }
 // CHECK: pto.section.cube {
 // CHECK: pto.tstore ins(
-// CHECK: outs(%arg1 : memref<16x16x!pto.hif8>)
+// CHECK: outs(%arg1 : memref<16x16x!pto.hif8, strided<[?, ?], offset: ?>, #pto.address_space<gm>>)
 // CHECK: }
diff --git a/test/lit/vpto/tileop_helper_normalize_uncovered_skip.pto b/test/lit/vpto/tileop_helper_normalize_uncovered_skip.pto
new file mode 100644
index 0000000000..de61e93ae8
--- /dev/null
+++ b/test/lit/vpto/tileop_helper_normalize_uncovered_skip.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Regression guard for PR1 of the tileop subkernel redesign: a helper marked
+// only with pto.tileop.helper may contain a bare mixed MTE+V body (no section).
+// NormalizeUncovered must leave it untouched for the later tileop
+// summary/materialize passes instead of inferring one ordinary cube/vector section.
+// RUN: ( ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-before=pto-infer-tileop-summary %s -o %t 2>&1 || true ) | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(
+      %src: !pto.ptr<f32, gm>,
+      %dst: !pto.ptr<f32, gm>)
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %scratch = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %out = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %scale = arith.constant 1.0 : f32
+    func.call @scale_store(%src_part, %dst_part, %scratch, %out, %scale) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       f32) -> ()
+    return
+  }
+
+  func.func private @scale_store(
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %scratch: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %out: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %scale: f32)
+      attributes {pto.tileop.helper} {
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%scratch : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tadds ins(%scratch, %scale : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}
+
+// CHECK: func.func private @scale_store
+// CHECK-SAME: pto.tileop.helper
+// CHECK-NOT: pto.section.vector
+// CHECK: pto.tload
+// CHECK: pto.tadds
+// CHECK: pto.tstore
diff --git a/test/lit/vpto/tileop_materialize_sections.pto b/test/lit/vpto/tileop_materialize_sections.pto
new file mode 100644
index 0000000000..acc3a985bd
--- /dev/null
+++ b/test/lit/vpto/tileop_materialize_sections.pto
@@ -0,0 +1,57 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ( ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-materialize-tileop-sections %s -o %t 2>&1 || true ) | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(
+      %src: !pto.ptr<f32, gm>,
+      %dst: !pto.ptr<f32, gm>)
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %scratch = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %out = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %scale = arith.constant 1.0 : f32
+    func.call @scale_store(%src_part, %dst_part, %scratch, %out, %scale) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       f32) -> ()
+    return
+  }
+
+  func.func private @scale_store(
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %scratch: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %out: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %scale: f32)
+      attributes {pto.tileop.helper} {
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%scratch : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tadds ins(%scratch, %scale : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)
+    return
+  }
+}
+
+// CHECK: func.func private @scale_store
+// CHECK-SAME: pto.tileop.helper
+// CHECK: pto.tload
+// CHECK: pto.section.vector {
+// CHECK: pto.tadds
+// CHECK: }
+// CHECK: pto.tstore
diff --git a/test/lit/vpto/tileop_materialize_sections_control_flow.pto b/test/lit/vpto/tileop_materialize_sections_control_flow.pto
new file mode 100644
index 0000000000..1a8b870b78
--- /dev/null
+++ b/test/lit/vpto/tileop_materialize_sections_control_flow.pto
@@ -0,0 +1,48 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-materialize-tileop-sections %s 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry() attributes {pto.kernel} {
+    %src = pto.alloc_tile : !pto.tile_buf<vec, 1x16xf32>
+    %dst = pto.alloc_tile : !pto.tile_buf<vec, 1x16xf32>
+    %c16_i32 = arith.constant 16 : i32
+    func.call @tile_vec_copy_loop(%src, %dst, %c16_i32) :
+      (!pto.tile_buf<vec, 1x16xf32>, !pto.tile_buf<vec, 1x16xf32>, i32) -> ()
+    return
+  }
+
+  func.func private @tile_vec_copy_loop(
+      %arg0: !pto.tile_buf<vec, 1x16xf32>,
+      %arg1: !pto.tile_buf<vec, 1x16xf32>,
+      %arg2: i32)
+      attributes {pto.tileop.helper} {
+    %mask, %tail = pto.plt_b32 %arg2 : i32 -> !pto.mask<b32>, i32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    scf.for %i = %c0 to %c1 step %c1 {
+      %0 = pto.tile_buf_addr %arg0 : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+      %subview = memref.subview %0[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>
+      %vec = pto.vlds %subview[%c0] : memref<16xf32, strided<[1]>, #pto.address_space<vec>> -> !pto.vreg<64xf32>
+      %1 = pto.tile_buf_addr %arg1 : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+      %subview_0 = memref.subview %1[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>
+      pto.vsts %vec, %subview_0[%c0], %mask : !pto.vreg<64xf32>, memref<16xf32, strided<[1]>, #pto.address_space<vec>>, !pto.mask<b32>
+    }
+    return
+  }
+}
+
+// CHECK: func.func private @tile_vec_copy_loop
+// CHECK-SAME: pto.tileop.helper
+// CHECK: pto.plt_b32
+// CHECK: scf.for
+// CHECK: pto.section.vector {
+// CHECK: pto.vlds
+// CHECK: pto.vsts
+// CHECK: }
diff --git a/test/lit/vpto/tileop_materialize_sections_control_flow_mixed.pto b/test/lit/vpto/tileop_materialize_sections_control_flow_mixed.pto
new file mode 100644
index 0000000000..b29f029886
--- /dev/null
+++ b/test/lit/vpto/tileop_materialize_sections_control_flow_mixed.pto
@@ -0,0 +1,62 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-materialize-tileop-sections %s 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(
+      %src: !pto.ptr<f32, gm>,
+      %dst: !pto.ptr<f32, gm>)
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %scratch = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %out = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %scale = arith.constant 1.0 : f32
+    func.call @loop_scale_store(%src_part, %dst_part, %scratch, %out, %scale) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       f32) -> ()
+    return
+  }
+
+  func.func private @loop_scale_store(
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %scratch: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %out: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %scale: f32)
+      attributes {pto.tileop.helper} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    scf.for %i = %c0 to %c1 step %c1 {
+      pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+               outs(%scratch : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tadds ins(%scratch, %scale : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+                outs(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      pto.tstore ins(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+                 outs(%dst : !pto.partition_tensor_view<16x16xf32>)
+    }
+    return
+  }
+}
+
+// CHECK: func.func private @loop_scale_store
+// CHECK-SAME: pto.tileop.helper
+// CHECK: scf.for
+// CHECK: pto.tload
+// CHECK: pto.section.vector {
+// CHECK: pto.tadds
+// CHECK: }
+// CHECK: pto.tstore
diff --git a/test/lit/vpto/tileop_materialize_sections_cube.pto b/test/lit/vpto/tileop_materialize_sections_cube.pto
new file mode 100644
index 0000000000..99915d3375
--- /dev/null
+++ b/test/lit/vpto/tileop_materialize_sections_cube.pto
@@ -0,0 +1,64 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ( ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-materialize-tileop-sections %s -o %t 2>&1 || true ) | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(  // kernel entry: builds the views and tiles, then makes the single call to @cube_store
+      %lhs_src: !pto.ptr<f16, gm>,
+      %rhs_src: !pto.ptr<f16, gm>,
+      %dst_out: !pto.ptr<f32, gm>)
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %lhs_view = pto.make_tensor_view %lhs_src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf16>  // 16x16 view with strides [16, 1]
+    %rhs_view = pto.make_tensor_view %rhs_src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf16>
+    %dst_view = pto.make_tensor_view %dst_out, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %lhs_part = pto.partition_view %lhs_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf16> -> !pto.partition_tensor_view<16x16xf16>  // partition at offset [0, 0] covering the full 16x16 view
+    %rhs_part = pto.partition_view %rhs_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf16> -> !pto.partition_tensor_view<16x16xf16>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %lhs_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>  // tiles are allocated here in the caller and passed into the helper
+    %rhs_tile = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    func.call @cube_store(%lhs_part, %rhs_part, %dst_part, %lhs_tile, %rhs_tile, %acc_tile) :
+      (!pto.partition_tensor_view<16x16xf16>, !pto.partition_tensor_view<16x16xf16>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=left, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>,
+       !pto.tile_buf<loc=right, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>,
+       !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) -> ()
+    return
+  }
+
+  func.func private @cube_store(  // tileop helper: two tloads, one tmatmul, one tstore
+      %lhs: !pto.partition_tensor_view<16x16xf16>,
+      %rhs: !pto.partition_tensor_view<16x16xf16>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %lhs_tile: !pto.tile_buf<loc=left, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>,
+      %rhs_tile: !pto.tile_buf<loc=right, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>,
+      %acc_tile: !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      attributes {pto.tileop.helper} {
+    pto.tload ins(%lhs : !pto.partition_tensor_view<16x16xf16>)
+             outs(%lhs_tile : !pto.tile_buf<loc=left, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>)  // %lhs -> %lhs_tile
+    pto.tload ins(%rhs : !pto.partition_tensor_view<16x16xf16>)
+             outs(%rhs_tile : !pto.tile_buf<loc=right, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)  // %rhs -> %rhs_tile
+    pto.tmatmul ins(%lhs_tile, %rhs_tile : !pto.tile_buf<loc=left, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f16, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+               outs(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)  // reads both input tiles, writes %acc_tile; the only op the expected output wraps in pto.section.cube
+    pto.tstore ins(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)  // %acc_tile -> %dst
+    return
+  }
+}
+
+// CHECK: func.func private @cube_store
+// CHECK-SAME: pto.tileop.helper
+// CHECK: pto.tload
+// CHECK: pto.tload
+// CHECK: pto.section.cube {
+// CHECK: pto.tmatmul
+// CHECK: }
+// CHECK: pto.tstore
diff --git a/test/lit/vpto/tileop_summary_attrs.pto b/test/lit/vpto/tileop_summary_attrs.pto
new file mode 100644
index 0000000000..f49b5654a6
--- /dev/null
+++ b/test/lit/vpto/tileop_summary_attrs.pto
@@ -0,0 +1,55 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ( ptoas --pto-arch=a5 --pto-backend=vpto --mlir-print-ir-after=pto-infer-tileop-summary %s -o %t 2>&1 || true ) | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(  // kernel entry: builds views, tiles and a scalar, then makes the single call to @scale_store
+      %src: !pto.ptr<f32, gm>,
+      %dst: !pto.ptr<f32, gm>)
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>  // 16x16 view with strides [16, 1]
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>  // partition at offset [0, 0] covering the full view
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %scratch = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // both tiles are allocated by the caller
+    %out = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %scale = arith.constant 1.0 : f32
+    func.call @scale_store(%src_part, %dst_part, %scratch, %out, %scale) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       f32) -> ()
+    return
+  }
+
+  func.func private @scale_store(  // tileop helper; argument indices 0..4 are what the expected operand_effects / operand_defs / operand_uses lists refer to
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %scratch: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %out: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %scale: f32)
+      attributes {pto.tileop.helper} {
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%scratch : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // uses arg 0, defines arg 2; expected phase pipe is PIPE_MTE2
+    pto.tadds ins(%scratch, %scale : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // uses arg 2, defines arg 3; expected phase pipe is PIPE_V
+    pto.tstore ins(%out : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)  // uses arg 3, defines arg 1; expected phase pipe is PIPE_MTE3
+    return
+  }
+}
+
+// CHECK: func.func private @scale_store
+// CHECK-SAME: pto.tileop.helper
+// CHECK-SAME: pto.tileop.operand_effects = ["read", "write", "readwrite", "readwrite", "read"]
+// CHECK-SAME: pto.tileop.phases = [{{.*}}operand_defs = [2]{{.*}}operand_uses = [0]{{.*}}pipe = #pto.pipe<PIPE_MTE2>{{.*}}operand_defs = [3]{{.*}}operand_uses = [2]{{.*}}pipe = #pto.pipe<PIPE_V>{{.*}}operand_defs = [1]{{.*}}operand_uses = [3]{{.*}}pipe = #pto.pipe<PIPE_MTE3>{{.*}}]
+// CHECK-SAME: pto.tileop.primary_domain = #pto.kernel_kind<vector>
diff --git a/test/lit/vpto/tileop_summary_attrs_subview.pto b/test/lit/vpto/tileop_summary_attrs_subview.pto
new file mode 100644
index 0000000000..b7d5db30e5
--- /dev/null
+++ b/test/lit/vpto/tileop_summary_attrs_subview.pto
@@ -0,0 +1,43 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ptoas --pto-arch=a5 --emit-pto-ir --mlir-print-ir-after=pto-infer-tileop-summary %s 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry() attributes {pto.kernel} {  // kernel entry: allocates two 1x16 vec tiles and calls @tile_vec_copy
+    %src = pto.alloc_tile : !pto.tile_buf<vec, 1x16xf32>
+    %dst = pto.alloc_tile : !pto.tile_buf<vec, 1x16xf32>
+    %c16_i32 = arith.constant 16 : i32  // passed as the helper's i32 third argument
+    func.call @tile_vec_copy(%src, %dst, %c16_i32) :
+      (!pto.tile_buf<vec, 1x16xf32>, !pto.tile_buf<vec, 1x16xf32>, i32) -> ()
+    return
+  }
+
+  func.func private @tile_vec_copy(  // tileop helper that reaches tile memory through tile_buf_addr + memref.subview instead of tile-level ops
+      %arg0: !pto.tile_buf<vec, 1x16xf32>,
+      %arg1: !pto.tile_buf<vec, 1x16xf32>,
+      %arg2: i32)
+      attributes {pto.tileop.helper} {
+    %mask, %tail = pto.plt_b32 %arg2 : i32 -> !pto.mask<b32>, i32  // %mask feeds the vsts below; %tail is unused in this function
+    %0 = pto.tile_buf_addr %arg0 : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+    %subview = memref.subview %0[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>  // rank-reducing subview: 1x16 -> 16
+    %c0 = arith.constant 0 : index
+    %vec = pto.vlds %subview[%c0] : memref<16xf32, strided<[1]>, #pto.address_space<vec>> -> !pto.vreg<64xf32>  // load traced back to %arg0 (expected effect for arg 0 is "read")
+    %1 = pto.tile_buf_addr %arg1 : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+    %subview_0 = memref.subview %1[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>
+    %c0_0 = arith.constant 0 : index
+    pto.vsts %vec, %subview_0[%c0_0], %mask : !pto.vreg<64xf32>, memref<16xf32, strided<[1]>, #pto.address_space<vec>>, !pto.mask<b32>  // store traced back to %arg1 (expected effect for arg 1 is "write")
+    return
+  }
+}
+
+// CHECK: func.func private @tile_vec_copy
+// CHECK-SAME: pto.tileop.helper
+// CHECK-SAME: pto.tileop.operand_effects = ["read", "write", "read"]
+// CHECK-SAME: pto.tileop.phases = [{{.*}}operand_defs = []{{.*}}operand_uses = []{{.*}}pipe = #pto.pipe<PIPE_S>{{.*}}operand_defs = [1]{{.*}}operand_uses = [0]{{.*}}pipe = #pto.pipe<PIPE_V>{{.*}}]
+// CHECK-SAME: pto.tileop.primary_domain = #pto.kernel_kind<vector>
diff --git a/test/lit/vpto/tileop_verify_contract_alloc_tile_invalid.pto b/test/lit/vpto/tileop_verify_contract_alloc_tile_invalid.pto
new file mode 100644
index 0000000000..5906f1a0af
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_alloc_tile_invalid.pto
@@ -0,0 +1,42 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(%src: !pto.ptr<f32, gm>, %dst: !pto.ptr<f32, gm>)  // kernel entry: builds views and one tile, then calls @helper_local_alloc
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>  // 16x16 view with strides [16, 1]
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %tmp = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // allocated in the kernel entry, not in the helper
+    func.call @helper_local_alloc(%src_part, %dst_part, %tmp) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) -> ()
+    return
+  }
+
+  func.func private @helper_local_alloc(  // negative case: a tileop helper that allocates its own tile
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %tmp: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      attributes {pto.tileop.helper} {
+    %local = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // helper-local allocation; this is what the expected diagnostic forbids (%tmp is left unused)
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%local : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %src -> %local
+    pto.tstore ins(%local : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)  // %local -> %dst
+    return
+  }
+}
+
+// CHECK: error: is not allowed inside a tileop helper; tileop helpers must not allocate helper-local tile or reserved-buffer state
diff --git a/test/lit/vpto/tileop_verify_contract_memref_boundary_invalid.pto b/test/lit/vpto/tileop_verify_contract_memref_boundary_invalid.pto
new file mode 100644
index 0000000000..403e3f61f6
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_memref_boundary_invalid.pto
@@ -0,0 +1,27 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --emit-pto-ir %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func private @memref_boundary_invalid(  // negative case: a tileop helper with a raw memref in its signature
+      %tmp: memref<16xf32, #pto.address_space<vec>>,
+      %tile: !pto.tile_buf<vec, 1x16xf32>,
+      %cols: i32)
+      attributes {pto.tileop.helper} {
+    %mask, %tail = pto.plt_b32 %cols : i32 -> !pto.mask<b32>, i32  // %mask feeds the vsts below; %tail is unused
+    %c0 = arith.constant 0 : index
+    %vec = pto.vlds %tmp[%c0] : memref<16xf32, #pto.address_space<vec>> -> !pto.vreg<64xf32>  // loads directly from the memref argument
+    %tile_mem = pto.tile_buf_addr %tile : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+    %tile_row = memref.subview %tile_mem[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>  // rank-reducing subview: 1x16 -> 16
+    pto.vsts %vec, %tile_row[%c0], %mask : !pto.vreg<64xf32>, memref<16xf32, strided<[1]>, #pto.address_space<vec>>, !pto.mask<b32>  // masked store into the tile argument's memory
+    return
+  }
+}
+
+// CHECK: error: 'func.func' op tileop helper arguments must be Tile/TensorView/PartitionTensorView or PTO scalar values
diff --git a/test/lit/vpto/tileop_verify_contract_nested_call_invalid.pto b/test/lit/vpto/tileop_verify_contract_nested_call_invalid.pto
new file mode 100644
index 0000000000..a19e65b48e
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_nested_call_invalid.pto
@@ -0,0 +1,60 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(%src: !pto.ptr<f32, gm>, %dst: !pto.ptr<f32, gm>)  // kernel entry: builds views and two tiles, then calls @nested_tileop_parent
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>  // 16x16 view with strides [16, 1]
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %tmp0 = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // both tiles are allocated by the caller
+    %tmp1 = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    func.call @nested_tileop_parent(%src_part, %dst_part, %tmp0, %tmp1) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) -> ()
+    return
+  }
+
+  func.func private @nested_tileop_leaf(  // tileop helper that is the callee of the nested call made by @nested_tileop_parent
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %tmp: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      attributes {pto.tileop.helper} {
+    %scale = arith.constant 1.000000e+00 : f32
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%tmp : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %src -> %tmp
+    pto.tadds ins(%tmp, %scale : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%tmp : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // in place: %tmp is both input and output
+    return
+  }
+
+  func.func private @nested_tileop_parent(  // negative case: a tileop helper that calls another tileop helper
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %tmp0: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %tmp1: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      attributes {pto.tileop.helper} {
+    %scale = arith.constant 1.000000e+00 : f32
+    func.call @nested_tileop_leaf(%src, %tmp0) :  // the nested helper-to-helper call named in the expected diagnostic
+      (!pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) -> ()
+    pto.tadds ins(%tmp0, %scale : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, f32)
+              outs(%tmp1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // reads %tmp0 and %scale, writes %tmp1
+    pto.tstore ins(%tmp1 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)  // %tmp1 -> %dst
+    return
+  }
+}
+
+// CHECK: error: 'func.call' op cannot call tileop helper @nested_tileop_leaf from another tileop helper; nested tileop calls are rejected
diff --git a/test/lit/vpto/tileop_verify_contract_no_primary_invalid.pto b/test/lit/vpto/tileop_verify_contract_no_primary_invalid.pto
new file mode 100644
index 0000000000..63249fd173
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_no_primary_invalid.pto
@@ -0,0 +1,41 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry(%src: !pto.ptr<f32, gm>, %dst: !pto.ptr<f32, gm>)  // kernel entry: builds views and one tile, then calls @no_primary_compute
+      attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %src_view = pto.make_tensor_view %src, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>  // 16x16 view with strides [16, 1]
+    %dst_view = pto.make_tensor_view %dst, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %src_part = pto.partition_view %src_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %dst_part = pto.partition_view %dst_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %tmp = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // allocated by the caller and passed into the helper
+    func.call @no_primary_compute(%src_part, %dst_part, %tmp) :
+      (!pto.partition_tensor_view<16x16xf32>, !pto.partition_tensor_view<16x16xf32>,
+       !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) -> ()
+    return
+  }
+
+  func.func private @no_primary_compute(  // negative case: helper body is only a tload and a tstore, with no compute op between them
+      %src: !pto.partition_tensor_view<16x16xf32>,
+      %dst: !pto.partition_tensor_view<16x16xf32>,
+      %tmp: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      attributes {pto.tileop.helper} {
+    pto.tload ins(%src : !pto.partition_tensor_view<16x16xf32>)
+             outs(%tmp : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)  // %src -> %tmp
+    pto.tstore ins(%tmp : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)  // %tmp -> %dst
+    return
+  }
+}
+
+// CHECK: error: 'func.func' op requires at least one vector or cube primary compute op; helpers with only MTE/scalar/sync phases are rejected
diff --git a/test/lit/vpto/tileop_verify_contract_ptr_boundary_invalid.pto b/test/lit/vpto/tileop_verify_contract_ptr_boundary_invalid.pto
new file mode 100644
index 0000000000..7984ed8772
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_ptr_boundary_invalid.pto
@@ -0,0 +1,27 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --emit-pto-ir %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func private @ptr_boundary_invalid(  // negative case: a tileop helper with a raw !pto.ptr in its signature
+      %tmp: !pto.ptr<f32, ub>,
+      %tile: !pto.tile_buf<vec, 1x16xf32>,
+      %cols: i32)
+      attributes {pto.tileop.helper} {
+    %mask, %tail = pto.plt_b32 %cols : i32 -> !pto.mask<b32>, i32  // %mask feeds the vsts below; %tail is unused
+    %c0 = arith.constant 0 : index
+    %vec = pto.vlds %tmp[%c0] : !pto.ptr<f32, ub> -> !pto.vreg<64xf32>  // loads directly from the pointer argument
+    %tile_mem = pto.tile_buf_addr %tile : !pto.tile_buf<vec, 1x16xf32> -> memref<1x16xf32, #pto.address_space<vec>>
+    %tile_row = memref.subview %tile_mem[0, 0] [1, 16] [1, 1] : memref<1x16xf32, #pto.address_space<vec>> to memref<16xf32, strided<[1]>, #pto.address_space<vec>>  // rank-reducing subview: 1x16 -> 16
+    pto.vsts %vec, %tile_row[%c0], %mask : !pto.vreg<64xf32>, memref<16xf32, strided<[1]>, #pto.address_space<vec>>, !pto.mask<b32>  // masked store into the tile argument's memory
+    return
+  }
+}
+
+// CHECK: error: 'func.func' op tileop helper arguments must be Tile/TensorView/PartitionTensorView or PTO scalar values
diff --git a/test/lit/vpto/tileop_verify_contract_result_invalid.pto b/test/lit/vpto/tileop_verify_contract_result_invalid.pto
new file mode 100644
index 0000000000..4b68c81e3a
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_result_invalid.pto
@@ -0,0 +1,29 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {
+  func.func @entry() attributes {pto.kernel} {  // kernel entry: allocates one tile and consumes the tile returned by the helper
+    %tmp = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %0 = func.call @non_scalar_result_helper(%tmp) :
+      (!pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) ->
+      !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %1 = pto.treshape %0 : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0> -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // gives the call result %0 a user; source and result types are identical
+    func.return
+  }
+
+  func.func private @non_scalar_result_helper(  // negative case: a tileop helper whose result type is a tile_buf
+      %tmp: !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      attributes {pto.tileop.helper} {
+    return %tmp : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>  // returns its argument unchanged; the body contains no other op
+  }
+}
+
+// CHECK: error: 'func.func' op tileop helper results are limited to PTO scalar values in the MVP, but found result type
diff --git a/test/lit/vpto/tileop_verify_contract_simt_only_invalid.pto b/test/lit/vpto/tileop_verify_contract_simt_only_invalid.pto
new file mode 100644
index 0000000000..6cb21cadd4
--- /dev/null
+++ b/test/lit/vpto/tileop_verify_contract_simt_only_invalid.pto
@@ -0,0 +1,38 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5"} {  // negative test: a SIMT-only op used inside a tileop helper
+  func.func @entry(%meta: !pto.ptr<i32, gm>) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %meta_view = pto.make_tensor_view %meta, shape = [%c1, %c8], strides = [%c8, %c1] : !pto.tensor_view<?x?xi32>
+    %meta_part = pto.partition_view %meta_view, offsets = [%c0, %c0], sizes = [%c1, %c8] : !pto.tensor_view<?x?xi32> -> !pto.partition_tensor_view<1x8xi32>
+    %meta_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=i32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=32, pad=0>
+    func.call @simt_only_helper(%meta_part, %meta_tile) :
+      (!pto.partition_tensor_view<1x8xi32>,
+       !pto.tile_buf<loc=vec, dtype=i32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=32, pad=0>) -> ()
+    return
+  }
+
+  func.func private @simt_only_helper(
+      %meta: !pto.partition_tensor_view<1x8xi32>,
+      %tmp: !pto.tile_buf<loc=vec, dtype=i32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=32, pad=0>)
+      attributes {pto.tileop.helper} {  // unit attribute marking this function as a tileop helper
+    %tid = pto.get_tid_x : i32  // NOTE(review): presumably the SIMT-only op the expected diagnostic targets - confirm against the verifier pass
+    pto.tadds ins(%tmp, %tid : !pto.tile_buf<loc=vec, dtype=i32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=32, pad=0>, i32)
+              outs(%tmp : !pto.tile_buf<loc=vec, dtype=i32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=32, pad=0>)
+    pto.tstore ins(%tmp : !pto.tile_buf<loc=vec, dtype=i32, rows=1, cols=8, v_row=1, v_col=8, blayout=row_major, slayout=none_box, fractal=32, pad=0>)
+               outs(%meta : !pto.partition_tensor_view<1x8xi32>)
+    return
+  }
+}
+
+// CHECK: error: is SIMT-only and cannot appear inside a tileop helper
diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp
index 6443f1ca72..99ecafde5a 100644
--- a/tools/ptoas/ptoas.cpp
+++ b/tools/ptoas/ptoas.cpp
@@ -1825,6 +1825,11 @@ int mlir::pto::compilePTOASModule(
   // lifted to make it unconditional for all backends.
   if (effectiveBackend == PTOBackend::VPTO)
     pm.addNestedPass<mlir::func::FuncOp>(pto::createPTOCanonicalizeIRPass());
+  pm.addNestedPass<mlir::func::FuncOp>(pto::createPTOInferTileOpSummaryPass());
+  pm.addNestedPass<mlir::func::FuncOp>(
+      pto::createPTOMaterializeTileOpSectionsPass());
+  pm.addNestedPass<mlir::func::FuncOp>(
+      pto::createPTOVerifyTileOpContractPass());
   pm.addNestedPass<mlir::func::FuncOp>(
       pto::createPTOAssignDefaultFrontendPipeIdPass());
   pm.addNestedPass<mlir::func::FuncOp>(

From bc56f1dd1a733de7ed16fb08b11187507861eb78 Mon Sep 17 00:00:00 2001
From: jimmychou <47636600+jimmychou0@users.noreply.github.com>
Date: Sat, 27 Jun 2026 16:51:12 +0800
Subject: [PATCH 2/3] feat(ptodsl): remove legacy simd cube subkernel surfaces

---
 .../mix-kernel-mix-backend-compile-flow.md    |  14 +-
 .../ptodsl-ast-preprocess-control-flow.md     |   4 +-
 ...sl-redesign-of-simd-simt-cube-subkernel.md |  11 +-
 lib/PTO/IR/PTO.cpp                            |   2 +
 ptodsl/README.md                              |   4 +-
 ptodsl/docs/user_guide/01-introduction.md     |  32 +++--
 .../03-kernel-entry-and-subkernels.md         | 136 +++++++++---------
 ptodsl/docs/user_guide/05-control-flow.md     |   2 +-
 .../docs/user_guide/07-data-movement-ops.md   |  17 ++-
 .../docs/user_guide/08-compute-operations.md  |   4 +-
 ptodsl/docs/user_guide/10-sync-ops.md         |   2 +-
 .../11-flash-attention-walkthrough.md         |  10 +-
 .../docs/user_guide/12-additional-examples.md |   9 +-
 ptodsl/examples/fa_dn_ptodsl.py               |   4 +-
 ptodsl/examples/flash_attention_sketch.py     |  19 +--
 ptodsl/ptodsl/_diagnostics.py                 |  10 ++
 ptodsl/ptodsl/_subkernels.py                  |   5 +
 ptodsl/ptodsl/_tracing/session.py             |  26 +---
 .../tests/support/docs_fragment_fixtures.py   |  20 +--
 ptodsl/tests/test_jit_compile.py              |  78 +++++-----
 ptodsl/tests/test_subkernel_diagnostics.py    |  68 +++++----
 test/dsl-st/cube_matrix_pipeline.py           |   2 +-
 .../TPushTPop/ptodsl/local_c2v/kernel.py      |   4 +-
 23 files changed, 248 insertions(+), 235 deletions(-)

diff --git a/docs/designs/mix-kernel-mix-backend-compile-flow.md b/docs/designs/mix-kernel-mix-backend-compile-flow.md
index fa729491e6..36b0671b69 100644
--- a/docs/designs/mix-kernel-mix-backend-compile-flow.md
+++ b/docs/designs/mix-kernel-mix-backend-compile-flow.md
@@ -133,7 +133,7 @@ For example:
 ```python
 @pto.jit(target="a5", entry=False, backend="vpto", mode="explicit")
 def scale_row(base_gm: pto.ptr(pto.f32, "gm"), row: pto.i32):
-    with pto.simd():
+    with pto.tileop():
         ...
 
 @pto.jit(target="a5", backend="emitc")
@@ -168,10 +168,9 @@ kernel. The Vector/Cube execution ownership is a PTOAS responsibility:
   the VPTO backend.
 
 This keeps the PTODSL programming model independent of the physical sectioning
-rules. PTODSL can still expose helper abstractions such as `@pto.simd`,
-`@pto.cube`, `with pto.simd():`, and `with pto.cube():`, but the design does
-not require users or the frontend to manually partition every operation into a
-final section.
+rules. PTODSL exposes logical helper abstractions such as `@pto.tileop` and
+`with pto.tileop():`, but the design does not require users or the frontend to
+manually partition every operation into a final section.
 
 ### PTODSL IR Codegen Shape
 
@@ -233,8 +232,7 @@ Python-only structure. This lowering records PTODSL helper structure and call
 boundaries; it does not make PTODSL responsible for the final Vector/Cube
 section partition.
 
-For `@pto.simd` / `@pto.cube` and inline `with pto.simd():` / `with pto.cube():`
-scopes, PTODSL:
+For `@pto.tileop` and inline `with pto.tileop():` scopes, PTODSL:
 
 - outlines the subkernel body into a helper `func.func` when needed
 - marks the helper with canonical `pto.tileop.helper`
@@ -526,5 +524,5 @@ Use this order when debugging mixed compilation:
 | `pto.aicore` | `func.func` | Legacy entry marker accepted for compatibility. |
 | `pto.internal.non_entry` | `func.func` | Frontend/helper metadata; not used for current entry inference. |
 | `pto.ptodsl.logical_name` | `func.func` | Source-level logical name used when assembling wrappers and peer references. |
-| `pto.tileop.helper` | `func.func` | Canonical tileop-style helper marker emitted for `@pto.tileop` and retained `@pto.simd` / `@pto.cube`. |
+| `pto.tileop.helper` | `func.func` | Canonical tileop-style helper marker emitted for `@pto.tileop` helpers. |
 | `pto.ptodsl.subkernel_helper` | `func.func` | Legacy helper role marker retained for compatibility with older/manual IR. |
diff --git a/docs/designs/ptodsl-ast-preprocess-control-flow.md b/docs/designs/ptodsl-ast-preprocess-control-flow.md
index c67a1fd4df..5c555d39dd 100644
--- a/docs/designs/ptodsl-ast-preprocess-control-flow.md
+++ b/docs/designs/ptodsl-ast-preprocess-control-flow.md
@@ -19,8 +19,8 @@ static specialization readable.
 ## Goals
 
 - Make native Python control-flow syntax usable by default for runtime control
-  flow in `@pto.jit(...)` kernels and named `@pto.cube` / `@pto.simd` /
-  `@pto.simt` sub-kernels.
+  flow in `@pto.jit(...)` kernels and named `@pto.tileop` / `@pto.simt`
+  sub-kernels.
 - Use `ast_rewrite` as the public name for the source rewrite feature.
 - Rewrite legal Python `if` / `for range(...)` into existing PTODSL
   control-flow surfaces.
diff --git a/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md b/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md
index 49efb25f26..d81c9e1ae0 100644
--- a/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md
+++ b/docs/designs/ptodsl-redesign-of-simd-simt-cube-subkernel.md
@@ -3,7 +3,7 @@
 
 ## 1. 目标与用户模型
 
-`pto.tileop` 统一 custom subkernel 标识（取代 `pto.cube`/`pto.simd` 作 subkernel 标识职责；`pto.simt` 专属 launched SIMT）。建模为 tile-level IR 上以 tile/tensorview/scalar 为 IO、带 phase 摘要的命名 helper + `func.call`，让 `PTOInsertSync`/`PTOPlanMemory` 当一等公民。用户零参数，**摘要全由后端 `PTOInferTileOpSummaryPass` 推导；canonical helper marker 统一收敛到 `pto.tileop.helper` unit attr。**
+`pto.tileop` 统一 custom subkernel 标识（取代 `pto.cube`/`pto.simd` 的 public subkernel 入口职责；`pto.simt` 专属 launched SIMT）。建模为 tile-level IR 上以 tile/tensorview/scalar 为 IO、带 phase 摘要的命名 helper + `func.call`，让 `PTOInsertSync`/`PTOPlanMemory` 当一等公民。用户零参数，**摘要全由后端 `PTOInferTileOpSummaryPass` 推导；canonical helper marker 统一收敛到 `pto.tileop.helper` unit attr。**
 
 ```python
 @pto.tileop
@@ -142,7 +142,7 @@ func.func @kernel(%out, %x, %scratch, %rows, %cols) {
 2. helper 只附 `pto.tileop.helper`；**不写 primary_domain/phases/operand_effects**。
 3. helper 函数类型：输出全走 operand；results 只用于 scalar。
 4. 前端 public boundary 契约：保留 vreg/mask 不外逃；results 限 scalar；**禁 tileop 嵌套调用**。helper body 内 `alloc_tile/reserve_buffer/TAlloc` 等 helper-local 资源分配由后端 `PTOVerifyTileOpContractPass` 兜底拒绝。
-5. 装饰器无 `kind` 参数；`@pto.cube`/`@pto.simd` 别名（IR 层统一 "tileop"）。
+5. 装饰器无 `kind` 参数；public surface 仅保留 `@pto.tileop` / `@pto.simt`，legacy `@pto.cube` / `@pto.simd` 前端直接报迁移诊断。
 
 ### 后端
 
@@ -183,11 +183,10 @@ func.func @kernel(%out, %x, %scratch, %rows, %cols) {
 2. `PTOInferTileOpSummaryPass`、`PTOMaterializeTileOpSectionsPass`、`PTOVerifyTileOpContractPass` 已接入主 pipeline，且都位于 `PlanMemory` 之前。
 3. `UpdatePTODSLSubkernelCallInfo` 已能消费 tileop 摘要，按 phase 建模跨 helper 边界的 InsertSync 依赖；legacy `simd/cube` 兼容路径仍保留保守单-pipe 建模。
 4. tileop helper ABI 已收敛为 Tile/TensorView/PartitionTensorView/PTO scalar；`ptr` 仍为 SIMT-only。
-5. `@pto.tileop` / retained `@pto.simd` / `@pto.cube` 在 IR 层语义上已统一到 tileop helper role，并使用 canonical `pto.tileop.helper` marker；后端仍兼容 legacy `pto.ptodsl.subkernel_helper = "tileop"`。
+5. `@pto.tileop` 在 IR 层语义上统一到 tileop helper role，并使用 canonical `pto.tileop.helper` marker；legacy public `@pto.simd` / `@pto.cube` 已前端报错；后端仍兼容 legacy `pto.ptodsl.subkernel_helper = "tileop"`。
+6. inline `with pto.tileop()` 已与 decorated `@pto.tileop` 对齐：前端不再预套 section，统一交后端 `PTOMaterializeTileOpSectionsPass` 物化。
 
 ## 7. 仍待单独收敛的差异
 
-1. **inline `with pto.tileop()` 仍有前端预套 section 的实现残留。**
-   目标设计仍是 tileop helper 不预套 section、统一交后端 materialize；当前 inline 路径仍会先包 `SectionVectorOp`。
-2. **`pto.tileop.operand_effects` 的“无显式 boundary effect 时默认值”尚未与本文最终写法重新对齐。**
+1. **`pto.tileop.operand_effects` 的“无显式 boundary effect 时默认值”尚未与本文最终写法重新对齐。**
    本文目标写法仍按 `unknown→readwrite` 记录；当前实现会把无 boundary effect 的 operand 物化/校验为 `"read"`。这一点需要单独决策后再统一设计与实现。
diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp
index 8f28b00190..9f2723b241 100644
--- a/lib/PTO/IR/PTO.cpp
+++ b/lib/PTO/IR/PTO.cpp
@@ -13651,6 +13651,8 @@ static LogicalResult verifySplitAttr(Operation *op, int64_t split) {
 static LogicalResult verifyFrontendKernelKind(Operation *op,
                                               FunctionKernelKind expected,
                                               StringRef kernelName) {
+  if (isInsideTileOpSubkernelHelper(op))
+    return success();
   if (isInsideSectionCube(op)) {
     if (expected == FunctionKernelKind::Cube)
       return success();
diff --git a/ptodsl/README.md b/ptodsl/README.md
index 3c227e4a13..5609d8284f 100644
--- a/ptodsl/README.md
+++ b/ptodsl/README.md
@@ -243,9 +243,9 @@ The user guide under `ptodsl/docs/user_guide/` is the canonical PTODSL API
 reference. This README keeps only a compact map of the public surface:
 
 - `@pto.jit`: the only host-visible kernel entry
-- `@pto.tileop`: custom tile-op helper surface for vector-style sub-kernels
+- `@pto.tileop`: custom tile-op helper surface for vector-style and cube-style sub-kernels
 - `@pto.simt`: SIMT helper surface with launch dimensions
-- `@pto.cube`, `@pto.simd`: retained hardware-specific custom OP entry points
+- legacy `@pto.cube`, `@pto.simd`: removed public aliases; PTODSL now diagnoses them and directs users to `@pto.tileop`
 - `pto.ptr(...)` + runtime PTO scalar annotations: public entry ABI
 - `pto.make_tensor_view(...)`, `pto.partition_view(...)`, `pto.alloc_tile(...)`:
   core data-model builders
diff --git a/ptodsl/docs/user_guide/01-introduction.md b/ptodsl/docs/user_guide/01-introduction.md
index 951b384762..f52b00bc0a 100644
--- a/ptodsl/docs/user_guide/01-introduction.md
+++ b/ptodsl/docs/user_guide/01-introduction.md
@@ -64,8 +64,7 @@ Python Wrapper              L0  user-facing wrapper (NumPy, torch-npu, pure Pyth
   │    └─ backend="emitc"         EmitC backend, mode="auto" only
   ├─ Tile Ops                     tile.load, tile.store, tile.add, ...
   ├─ MTE Ops                      mte_load / mte_store / mte_gm_ub / ...
-  ├─ @pto.tileop                  row-wise vector custom OPs (vlds, vadd, vexp, vsts, ...)
-  ├─ @pto.cube / @pto.simd        retained hardware-specific custom OP entry points
+  ├─ @pto.tileop                  custom tile-op helpers (vector- and cube-style bodies)
   └─ @pto.simt                    scalar-like compute (lds, sts, pointwise blends, ...)
 ```
 
@@ -249,9 +248,7 @@ the same kernel.
 
 These are hardware-bound compute sub-kernels, each mapped to a specific NPU compute unit:
 
-- **`@pto.tileop`** is the recommended surface for custom tile-op bodies that operate on UB tiles and vector registers. Typical operations: `vlds`, `vadd`, `vexp`, `vcgmax`, `vsts`.
-
-- **`@pto.cube` / `@pto.simd`** remain available for existing code that wants hardware-specific names. They preserve the authored surface name in PTODSL diagnostics and tracing, but decorated helpers canonicalize to the same backend helper contract as `@pto.tileop`.
+- **`@pto.tileop`** is the custom tile-op surface for both vector-style and cube-style helper bodies that operate on UB tiles and hardware-local scratch. Typical operations range from `vlds` / `vadd` / `vexp` / `vsts` to `mte_l1_l0a` / `mad` / `mte_l0c_ub`.
 
 - **`@pto.simt`** is a scalar-programmable processor group that executes scalar instructions across many work-items in parallel. Typical operations: `lds`, `sts`, scalar arithmetic and comparison. Well-suited for per-element tile walks, boundary metadata, and pointwise blends.
 
@@ -259,7 +256,13 @@ Tile-op helpers can be invoked as named decorated functions (`@pto.tileop`) or
 inline context managers (`with pto.tileop():`). SIMT helpers use `@pto.simt`
 and launch dimensions via `helper[x, y, z](...)` or `pto.simt_launch(...)`.
 
-The boundary contract is strict: vreg values do not escape a simd kernel, cube-local state does not leak into UB, and data crosses layer boundaries only through UB-backed tiles or typed UB pointers.
+Legacy `@pto.cube` / `@pto.simd` and `with pto.cube():` / `with pto.simd():`
+are no longer supported public surfaces. PTODSL raises a diagnostic directing
+users to `@pto.tileop`.
+
+The boundary contract is strict: transient `vreg` values do not escape a
+tileop helper, cube-local state does not leak into UB, and data crosses layer
+boundaries only through UB-backed tiles or typed UB pointers.
 
 ## 1.3 Tracing execution model
 
@@ -293,15 +296,16 @@ GM boundary.
 
 **Explicit orchestration path** stages the current K and V blocks with
 `mte_load`, issues `pipe_barrier(Pipe.ALL)` at phase boundaries, then
-sequences four sub-kernel calls: `qk_matmul` (cube),
-`online_softmax_rows` (simd), `pv_matmul` (cube), `blend_output_rows` (simt).
+sequences four sub-kernel calls: `qk_matmul` (cube-style tileop),
+`online_softmax_rows` (vector-style tileop), `pv_matmul` (cube-style tileop),
+`blend_output_rows` (simt).
 
-**`@pto.cube`** performs `mte_l1_l0a` / `mte_l1_l0b` / `mad` /
-`mte_l0c_ub` for both QK^T and P@V products.
+**Cube-style `@pto.tileop` bodies** perform `mte_l1_l0a` / `mte_l1_l0b` /
+`mad` / `mte_l0c_ub` for both QK^T and P@V products.
 
-**`@pto.tileop`** implements the online softmax update: per-row max, exp, sum,
-and alpha/beta computation using vector ops (`vlds`, `vcgmax`, `vexp`,
-`vcgadd`, `vsts`).
+**Vector-style `@pto.tileop` bodies** implement the online softmax update:
+per-row max, exp, sum, and alpha/beta computation using vector ops (`vlds`,
+`vcgmax`, `vexp`, `vcgadd`, `vsts`).
 
 **`@pto.simt`** blends the old and new output accumulators with per-element
 `lds`/`sts` and scalar arithmetic.
@@ -323,7 +327,7 @@ Chapter 11 walks through this example in full detail.
 |---------|-------|
 | 1 | Introduction (this chapter) |
 | 2 | Quick Start — a minimal working kernel |
-| 3 | Kernel entries, kernel modules, and sub-kernels: `@pto.jit(entry=True/False, backend=...)`, `@pto.tileop`, `@pto.simt`, retained `@pto.cube` / `@pto.simd` |
+| 3 | Kernel entries, kernel modules, and sub-kernels: `@pto.jit(entry=True/False, backend=...)`, `@pto.tileop`, `@pto.simt` |
 | 4 | Type system and buffer management: scalars, tiles, views, allocation |
 | 5 | Control flow: trace-time Python vs device-side `pto.for_` / `pto.if_` |
 | 6 | Scalar and pointer operations |
diff --git a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
index 3dbfdf9151..fd4885e02a 100644
--- a/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
+++ b/ptodsl/docs/user_guide/03-kernel-entry-and-subkernels.md
@@ -2,11 +2,12 @@
 
 PTODSL provides one kernel decorator (`@pto.jit`) with two roles
 (`entry=True` / `entry=False`), two compilation backends (`vpto` / `emitc`),
-custom OP sub-kernel decorators (`@pto.tileop`, retained `@pto.cube` /
-`@pto.simd`, and `@pto.simt`), plus matching context managers for inline use. This chapter covers
-the `@pto.jit` entry and module contracts, the two programming models, the two
-compilation backends, sub-kernel reference, parameter contracts, and boundary
-constraints.
+custom OP sub-kernel decorators (`@pto.tileop` and `@pto.simt`), plus
+matching context managers for inline use. Legacy `@pto.cube` / `@pto.simd`
+and `with pto.cube():` / `with pto.simd():` now raise migration diagnostics
+that direct users to `@pto.tileop`. This chapter covers the `@pto.jit` entry
+and module contracts, the two programming models, the two compilation
+backends, sub-kernel reference, parameter contracts, and boundary constraints.
 
 
 ## 3.1 `@pto.jit` — roles, backends, and modes
@@ -21,8 +22,8 @@ Decorator overview:
   mode="auto"               tile-first authoring, compiler-managed staging (default)
   mode="explicit"           micro-instruction authoring, user-managed staging
 
-@pto.tileop                 Custom tile-op sub-kernel
-@pto.cube / @pto.simd       Retained hardware-specific custom OP entry points
+@pto.tileop                 Custom tile-op sub-kernel (vector- or cube-style body)
+@pto.cube / @pto.simd       Removed legacy names (diagnosed); use @pto.tileop
 @pto.simt                   SIMT-unit scalar sub-kernel
 ```
 
@@ -54,9 +55,10 @@ The **`mode`** parameter selects the programming model within the kernel body
 it doesn't change how you compile or launch the kernel.
 
 `@pto.jit` owns compilation (tracing + lowering), caching, and — for
-`entry=True` — runtime launch binding. The sub-kernel decorators
-(`@pto.tileop`, retained `@pto.cube` / `@pto.simd`, and `@pto.simt`) define helpers that are called from
-within `@pto.jit` bodies.
+`entry=True` — runtime launch binding. The supported sub-kernel decorators
+(`@pto.tileop` and `@pto.simt`) define helpers that are called from within
+`@pto.jit` bodies. Legacy `@pto.cube` / `@pto.simd` are diagnosed at the
+front-end boundary and should be migrated to `@pto.tileop`.
 
 
 ## 3.2 `entry=True` — host-launchable kernel entry
@@ -326,7 +328,7 @@ rewritten to device-side control flow, and the compiler handles hardware
 section placement automatically — you can write `vlds` / `vadd` / `vsts`
 directly in the module body without an explicit `with pto.tileop():`. In
 `mode="explicit"`, you must manage hardware sections yourself with
-`with pto.tileop():`, `with pto.cube():`, or `with pto.simt():`.
+`with pto.tileop():` or `with pto.simt():`.
 
 ### Interface protocol
 
@@ -463,7 +465,7 @@ there are two kinds of helpers:
 - **Plain Python helpers** for code organization, repeated index math,
   partition construction, and orchestration that should stay in the caller's
   context.
-- **Sub-kernels** (`@pto.tileop`, retained `@pto.cube` / `@pto.simd`, `@pto.simt`) when the helper must
+- **Sub-kernels** (`@pto.tileop`, `@pto.simt`) when the helper must
   run on a specific hardware unit or use unit-local value categories such as
   `vreg` or cube-local scratch.
 
@@ -473,8 +475,9 @@ backend. They are traced as part of the enclosing `@pto.jit` specialization
 and therefore inherit the caller's context.
 
 Use a sub-kernel when the helper's semantics belong to a specific unit:
-vector register math on SIMD, matrix instructions on Cube, or scalar-thread
-work on SIMT. Sub-kernels are the only public way to express that boundary.
+vector register math or cube instructions via `@pto.tileop`, or scalar-thread
+work via `@pto.simt`. Sub-kernels are the only public way to express that
+boundary.
 
 Named sub-kernels and plain nested helpers both use the same default AST
 rewrite behavior when they are traced from a compiled specialization.
@@ -483,17 +486,17 @@ Sub-kernels are the mechanism for custom compute in PTODSL — when Tile Ops
 cover your needs, you don't need one; when they don't, a sub-kernel gives you
 direct access to the hardware unit. In auto mode, a sub-kernel's parameters
 follow the decorator's role-specific ABI — the compiler still owns staging and
-sync for tileop-style helpers. `@pto.tileop` and retained `@pto.simd` helpers
-accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO scalar parameters.
-`@pto.simt` additionally accepts typed `pto.ptr(...)` parameters. These richer
-in-kernel boundary types do not change the public `@pto.jit` host entry ABI.
-Section 3.3 covers each sub-kernel decorator in detail.
+sync for tileop-style helpers. `@pto.tileop` accepts `Tile`, `TensorView`,
+`PartitionTensorView`, and PTO scalar parameters. `@pto.simt` accepts `Tile`,
+typed `pto.ptr(...)`, and PTO scalars. These richer in-kernel boundary types
+do not change the public `@pto.jit` host entry ABI. Section 3.7 covers each
+sub-kernel decorator in detail.
 
 ### Module vs sub-kernel
 
 **Module or sub-kernel?** A simple rule:
 - Logic that **must run as a custom tile op or SIMT helper**
-  and operates on tiles → use a sub-kernel (`@pto.tileop`, `@pto.simt`, or the retained `@pto.cube` / `@pto.simd` names).
+  and operates on tiles → use a sub-kernel (`@pto.tileop` or `@pto.simt`).
 - General device-side code organisation — allocating tiles, partitioning GM
   views, calling sub-kernels, mixing backends → use a kernel module
   (`@pto.jit(entry=False)`).
@@ -502,11 +505,11 @@ Modules **can** call sub-kernels (they are callable from both entries and
 modules). Sub-kernels **cannot** call modules — data crosses the sub-kernel
 boundary only through UB tiles, not through nested function calls.
 
-| | `@pto.jit(entry=False)` module | `@pto.tileop` / `@pto.simt` / retained `@pto.cube` / `@pto.simd` |
+| | `@pto.jit(entry=False)` module | `@pto.tileop` / `@pto.simt` |
 |---|---|---|
 | Positioning | General device-side function | **Custom tile op** — hardware-bound compute primitive |
 | Scope | Orchestration, tile allocation, data movement, sub-kernel dispatch | Single-hardware-unit compute logic |
-| ABI | **C ABI: ptr + PTO scalars only**. Tile/TensorView/PartitionTensorView cannot cross the function boundary. Caller passes `tile.as_ptr()`; module constructs local tiles internally | Role-specific PTODSL helper ABI. `@pto.tileop` / retained `@pto.simd`: `Tile`, `TensorView`, `PartitionTensorView`, PTO scalars. `@pto.simt`: `Tile`, typed `pto.ptr(...)`, PTO scalars |
+| ABI | **C ABI: ptr + PTO scalars only**. Tile/TensorView/PartitionTensorView cannot cross the function boundary. Caller passes `tile.as_ptr()`; module constructs local tiles internally | Role-specific PTODSL helper ABI. `@pto.tileop`: `Tile`, `TensorView`, `PartitionTensorView`, PTO scalars. `@pto.simt`: `Tile`, typed `pto.ptr(...)`, PTO scalars |
 | Backend | VPTO or EmitC | Always VPTO |
 | Compilation | Compiled as a separate child module, linked automatically | Outlined as a helper function inside the owning caller/module |
 | Callable from | Entries and other modules | Entries and modules |
@@ -544,9 +547,9 @@ you can drop below the tile abstraction without leaving the `@pto.jit` entry.
 
 Explicit mode broadens the orchestration code you can write inside `@pto.jit`
 and `@pto.jit(entry=False)` bodies. Sub-kernel ABIs themselves remain
-role-specific rather than mode-specific: `@pto.tileop` / retained `@pto.simd`
-accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO scalars, while
-`@pto.simt` accepts `Tile`, typed `pto.ptr(...)`, and PTO scalars.
+role-specific rather than mode-specific: `@pto.tileop` accepts `Tile`,
+`TensorView`, `PartitionTensorView`, and PTO scalars, while `@pto.simt`
+accepts `Tile`, typed `pto.ptr(...)`, and PTO scalars.
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.explicit_signature","symbol":"kernel_entry_explicit_signature_probe","compile":{"BLOCK":16}} -->
 ```python
@@ -715,11 +718,11 @@ built-in ops don't cover your needs.
 
 **Sub-kernels are custom tile ops.** Their contract is strict:
 
-- **Inputs**: role-specific PTODSL boundary values. `@pto.tileop` and retained
-  `@pto.simd` accept `Tile`, `TensorView`, `PartitionTensorView`, and PTO
-  scalars (`pto.i32`, `pto.f32`, ...). `@pto.simt` accepts `Tile`, typed
-  `pto.ptr(...)`, and PTO scalars. Data still flows through device-side
-  boundary objects; sub-kernels do not define a host-visible ABI.
+- **Inputs**: role-specific PTODSL boundary values. `@pto.tileop` accepts
+  `Tile`, `TensorView`, `PartitionTensorView`, and PTO scalars (`pto.i32`,
+  `pto.f32`, ...). `@pto.simt` accepts `Tile`, typed `pto.ptr(...)`, and PTO
+  scalars. Data still flows through device-side boundary objects; sub-kernels
+  do not define a host-visible ABI.
 - **Outputs**: written back to UB tiles. Mutable `Tile` parameters remain the
   primary output path. In the current MVP, decorated tileop-style helpers may
   additionally return PTO scalar values through `func.call` results, but
@@ -735,11 +738,12 @@ When to use a sub-kernel vs a kernel module:
   views, or mix backends → use an `@pto.jit(entry=False)` kernel module
   instead. Modules can call sub-kernels, but sub-kernels cannot call modules.
 
-Sub-kernels are decorated with `@pto.tileop`, `@pto.simt`, or the retained
-hardware-specific `@pto.cube` / `@pto.simd` entry points.
-PTODSL lowers both surface forms to real helper `func.func` bodies instead of
-flattening them directly into the surrounding caller. They can be authored in
-two ways:
+Sub-kernels are decorated with `@pto.tileop` or `@pto.simt`. PTODSL lowers
+them to real helper `func.func` bodies instead of flattening them directly
+into the surrounding caller. Legacy `@pto.cube` / `@pto.simd` and inline
+`with pto.cube():` / `with pto.simd():` are removed public surfaces; PTODSL
+diagnoses them and directs users to `@pto.tileop`. Sub-kernels can be authored
+in two ways:
 
 1. **As decorated functions** — reusable, named sub-kernels called from
    `@pto.jit` entries and modules.
@@ -750,18 +754,18 @@ Named sub-kernel decorators use the same default AST rewrite model as
 `@pto.jit`: supported Python `if` and `for range(...)` statements lower to
 device-side control flow.
 
-### 3.7.1 `@pto.cube` — Cube unit (matrix operations)
+### 3.7.1 Cube-style `@pto.tileop` — matrix operations
 
-**Role**: `@pto.cube` is the custom tile op for matrix multiplication on the
-Cube unit. It consumes UB-resident tiles and explicit cube-local scratch
-buffers. All parameters are `Tile` references — the caller owns tile
-allocation, and the sub-kernel only expresses the compute dataflow.
+**Role**: `@pto.tileop` is also the custom tile-op surface for matrix
+multiplication on the Cube unit. It consumes UB-resident tiles and explicit
+cube-local scratch buffers. All parameters are `Tile` references — the caller
+owns tile allocation, and the sub-kernel only expresses the compute dataflow.
 
-**Signature**: `@pto.cube(fn=None, *, name=None, target="a5")`
+**Signature**: `@pto.tileop(fn=None, *, name=None, target="a5")`
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.cube_signature","symbol":"kernel_entry_cube_signature_probe","compile":{"BLOCK_M":16,"BLOCK_K":16,"BLOCK_N":16}} -->
 ```python
-@pto.cube
+@pto.tileop
 def my_cube_kernel(
     input_tile: pto.Tile,            # UB tile (source data)
     output_tile: pto.Tile,           # UB tile (destination)
@@ -780,7 +784,7 @@ allocated with the appropriate `memory_space` (e.g., `pto.MemorySpace.LEFT`,
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"data_movement.cube_helper","symbol":"data_movement_cube_helper_probe","compile":{"BLOCK_M":16,"BLOCK_K":16,"BLOCK_N":16}} -->
 ```python
-@pto.cube
+@pto.tileop
 def qk_matmul(
     q_tile: pto.Tile,
     k_tile: pto.Tile,
@@ -803,14 +807,14 @@ Cube-local state (LEFT, RIGHT, ACC, BIAS) never leaks into UB — it is the
 caller's responsibility to allocate scratch buffers and pass them in
 explicitly.
 
-**Lowering model**: a decorated `@pto.cube` function becomes one reusable
-helper function inside the owning PTODSL child module. Each callsite lowers to
-`func.call` of that helper. Decorated compatibility wrappers lower through the
-same naked `tileop` helper contract as `@pto.tileop`; backend passes infer the
-primary domain and materialize `pto.section.cube` later in the PTOAS pipeline.
+**Lowering model**: a decorated cube-style `@pto.tileop` function becomes one
+reusable helper function inside the owning PTODSL child module. Each callsite
+lowers to `func.call` of that helper. The helper body stays as a naked
+`tileop` helper; backend passes infer the primary domain and materialize
+`pto.section.cube` later in the PTOAS pipeline.
 
 **Invocation modes**: can be called from `@pto.jit` in either mode, or authored
-as an anonymous inline helper with `with pto.cube():` (Section 3.8).
+as an anonymous inline helper with `with pto.tileop():` (Section 3.8).
 
 ### 3.7.2 `@pto.tileop` — custom tile op (vector operations)
 
@@ -876,10 +880,9 @@ pipeline.
 **Invocation modes**: can be called from `@pto.jit` in either mode, or authored
 as an anonymous inline helper with `with pto.tileop():` (Section 3.8).
 
-`@pto.simd` remains available for existing vector custom OP code. It follows
-the same parameter and lowering contract as `@pto.tileop` while preserving the
-`simd` surface name in trace diagnostics. At the IR layer it canonicalizes to
-the same backend helper contract as `@pto.tileop`.
+PTODSL no longer exposes a separate `@pto.simd` public surface. Vector-style
+helpers use `@pto.tileop`, and the backend infers `pto.section.vector` from
+the helper body later in the PTOAS pipeline.
 
 ### 3.7.3 `@pto.simt` — SIMT unit (scalar-parallel operations)
 
@@ -1014,10 +1017,11 @@ Specific SIMT micro-op APIs are documented in Chapter 13.
 ## 3.8 Inline context manager syntax
 
 In addition to the decorator form, custom tile-op helpers can be written with
-`with pto.tileop():`; retained `with pto.cube():` / `with pto.simd():` scopes
-and inline `with pto.simt():` scopes are also supported. These
-open one-off anonymous sub-kernel bodies without requiring a separate named
-Python function. Inline scopes are supported in top-level `@pto.jit` bodies.
+`with pto.tileop():`, and SIMT helpers can be written with `with pto.simt():`.
+These open one-off anonymous sub-kernel bodies without requiring a separate
+named Python function. Inline scopes are supported in top-level `@pto.jit`
+bodies. Legacy `with pto.cube():` / `with pto.simd():` scopes are rejected
+with a `TypeError` and must be rewritten as `with pto.tileop():`.
 
 ### Syntax
 
@@ -1041,7 +1045,7 @@ with pto.simt():
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"kernel_entry.inline_cube_scope","symbol":"kernel_entry_inline_cube_scope_probe","compile":{"BLOCK_M":16,"BLOCK_K":16,"BLOCK_N":16}} -->
 ```python
-with pto.cube():
+with pto.tileop():
     pto.mte_l1_l0a(q_tile.as_ptr(), q_l0a.as_ptr(), m, k)
     pto.mte_l1_l0b(k_tile.as_ptr(), k_l0b.as_ptr(), k, n, transpose=True)
     pto.mad(q_l0a.as_ptr(), k_l0b.as_ptr(), s_acc.as_ptr(), m, n, k)
@@ -1054,12 +1058,10 @@ with pto.cube():
   unit.
 - On block exit, PTODSL outlines the block into one anonymous helper
   `func.func` and replaces the original region with a `func.call`.
-- Decorated sub-kernel helpers and inline sub-kernel scopes do not lower
-  identically today. Named/decorated `@pto.tileop`, retained `@pto.simd`, and
-  retained `@pto.cube` helpers lower as naked `tileop` helpers and rely on
-  backend materialization of `pto.section.vector` / `pto.section.cube`. Inline
-  `with pto.tileop():`, retained `with pto.simd():`, and `with pto.cube():`
-  scopes still preserve their explicit section-wrapped helper bodies.
+- Named/decorated `@pto.tileop` helpers and inline `with pto.tileop():`
+  scopes both lower as naked `pto.tileop.helper` bodies, i.e. without an
+  explicit section wrapper. PTOAS infers the primary domain and materializes
+  `pto.section.vector` / `pto.section.cube` later in the pipeline.
 - `with pto.simt():` preserves its scalar body inside one outlined
   `pto.simt_entry` helper, and the caller emits `pto.simt_launch`.
 - Values defined inside the inline sub-kernel cannot escape the block directly.
@@ -1096,13 +1098,13 @@ calls — only `pto.ptr` and PTO scalars can cross. `Tile`, `TensorView`, and
 |----------|---------|
 | Host → `@pto.jit(entry=True)` | explicit GM pointers + runtime scalars |
 | Entry / module → `@pto.jit(entry=False)` module | **`pto.ptr` + PTO scalars only** (C ABI). Caller passes `tile.as_ptr()`; module constructs local tiles internally |
-| Entry / module → `@pto.tileop` / retained `@pto.simd` | `Tile`, `TensorView`, `PartitionTensorView`, PTO scalars |
+| Entry / module → `@pto.tileop` | `Tile`, `TensorView`, `PartitionTensorView`, PTO scalars |
 | Entry / module → `@pto.simt` | `Tile`, typed `pto.ptr(...)`, PTO scalars |
-| `@pto.jit` → `with pto.{tileop,cube,simd,simt}:` | Captured `Tile` / ptr / scalar values from enclosing scope |
+| `@pto.jit` → `with pto.{tileop,simt}:` | Captured `Tile` / ptr / scalar values from enclosing scope |
 | Sub-kernel → sub-kernel | Not allowed (go through UB tiles via the caller) |
 | Sub-kernel → module | Not allowed (sub-kernels cannot call out) |
 | Inline sub-kernel → caller | No direct SSA return path; write through Tile / ptr / mutable references |
-| `@pto.tileop` / `@pto.simd` → caller | Only via `vsts`/`psts` to UB tiles; `vreg` cannot escape |
+| `@pto.tileop` → caller | Write results through mutable Tile / view operands; transient `vreg` and cube-local state cannot escape |
 | Cube-local → UB | Only via `mte_l0c_ub`; LEFT/RIGHT/ACC/BIAS are private |
 | `entry=False` module → caller | No return values; data crosses only via mutable references |
 
diff --git a/ptodsl/docs/user_guide/05-control-flow.md b/ptodsl/docs/user_guide/05-control-flow.md
index c80a7602e9..32df78726d 100644
--- a/ptodsl/docs/user_guide/05-control-flow.md
+++ b/ptodsl/docs/user_guide/05-control-flow.md
@@ -4,7 +4,7 @@ PTODSL uses a **tracing** compilation model. When you call `kernel.compile(...)`
 
 This has one critical implication for how you write loops and branches:
 
-- **Python native `for`/`if`** is rewritten to device-side control flow by default in `@pto.jit` bodies and named `@pto.cube` / `@pto.tileop` / `@pto.simt` sub-kernels. A `for i in range(rows)` loop records a device loop, and a runtime `if` records both branches.
+- **Python native `for`/`if`** is rewritten to device-side control flow by default in `@pto.jit` bodies and named `@pto.tileop` / `@pto.simt` sub-kernels. A `for i in range(rows)` loop records a device loop, and a runtime `if` records both branches.
 - **`pto.const_expr` / `pto.static_range`** keep compile-time Python behavior when you want trace-time specialization or unrolling.
 - **`pto.for_` / `pto.if_`** produce device-side control flow. The loop bound or branch condition can be a runtime value, and the hardware will execute the loop or take the branch dynamically.
 
diff --git a/ptodsl/docs/user_guide/07-data-movement-ops.md b/ptodsl/docs/user_guide/07-data-movement-ops.md
index 7894d44d88..e49aa40b3c 100644
--- a/ptodsl/docs/user_guide/07-data-movement-ops.md
+++ b/ptodsl/docs/user_guide/07-data-movement-ops.md
@@ -824,7 +824,9 @@ pto.vstas(align, ub_dst_f32, pto.const(64))
 
 ## 7.5 Cube data movement (cube)
 
-Inside `@pto.cube`, data flows through a hierarchy of private buffers: GM → L1 (cbuf) → L0A/L0B (operand buffers) → L0C (accumulator) → UB or back to GM.
+Inside a cube-style `@pto.tileop` helper, data flows through a hierarchy of
+private buffers: GM → L1 (cbuf) → L0A/L0B (operand buffers) → L0C
+(accumulator) → UB or back to GM.
 
 ### Staging: GM → L1 and L1 → UB
 
@@ -1065,11 +1067,12 @@ pto.mte_l0c_ub(acc, ub, 16, 32, 16, 32, split=pto.SplitMode.N)   # split N
 
 ### Typical cube dataflow in a matmul
 
-A full cube matmul (`@pto.cube`) follows this dataflow pattern:
+A full cube matmul (a cube-style `@pto.tileop` helper) follows this dataflow
+pattern:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"data_movement.cube_helper","symbol":"data_movement_cube_helper_probe","compile":{"BLOCK_M":16,"BLOCK_K":16,"BLOCK_N":16}} -->
 ```python
-@pto.cube
+@pto.tileop
 def qk_matmul(
     q_tile: pto.Tile,
     k_tile: pto.Tile,
@@ -1226,7 +1229,7 @@ Cube (producer) side:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"pipe_communication.c2v_global_producer","symbol":"pipe_communication_c2v_global_producer_probe","compile":{}} -->
 ```python
-@pto.cube
+@pto.tileop
 def producer(src_tile: pto.Tile):
     c2v.init_cube()
     entry = c2v.alloc(split=0)
@@ -1282,7 +1285,7 @@ Cube (consumer) side:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"pipe_communication.v2c_global_consumer","symbol":"pipe_communication_v2c_global_consumer_probe","compile":{}} -->
 ```python
-@pto.cube
+@pto.tileop
 def consumer(dst_tile: pto.Tile):
     v2c.init_cube()
     entry = v2c.pop(split=0)
@@ -1320,7 +1323,7 @@ Cube (producer) transaction:
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"pipe_communication.c2v_local_producer","symbol":"pipe_communication_c2v_local_producer_probe","compile":{}} -->
 ```python
-@pto.cube
+@pto.tileop
 def producer(src_tile: pto.Tile):
     c2v_peer.init_cube()
     c2v_peer.push(src_tile, split=0)
@@ -1385,7 +1388,7 @@ def cube_producer(
         offsets=[0, 0], sizes=[16, 16])
     a_tile = pto.alloc_tile(shape=[16, 16], dtype=pto.f32)
 
-    with pto.cube():
+    with pto.tileop():
         pto.tile.load(a_part, a_tile)
         c2v.init_cube()
         entry = c2v.alloc(split=0)
diff --git a/ptodsl/docs/user_guide/08-compute-operations.md b/ptodsl/docs/user_guide/08-compute-operations.md
index 039d9c92b5..8f5b3f623a 100644
--- a/ptodsl/docs/user_guide/08-compute-operations.md
+++ b/ptodsl/docs/user_guide/08-compute-operations.md
@@ -1720,7 +1720,7 @@ packed_high = pto.vpack(vec_i32, pto.VPackPart.HIGHER)  # upper 64 lanes -> 128
 
 ---
 
-## 8.3 Cube compute (L3 — `@pto.cube`)
+## 8.3 Cube compute (L3 — cube-style `@pto.tileop`)
 
 The Cube unit performs matrix multiplication. Its operands are typed pointers into cube-local buffers — L0A (left operand), L0B (right operand), L0C (accumulator), and BIAS. Cube data movement (`mte_l1_l0a`, `mte_l1_l0b`, `mte_l0c_ub`, etc.) was covered in Section 7.5; this section covers the compute instruction itself.
 
@@ -1824,7 +1824,7 @@ A full cube matmul follows a three-stage pattern: stage operands into L0A/L0B, c
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"data_movement.cube_helper","symbol":"data_movement_cube_helper_probe","compile":{"BLOCK_M":16,"BLOCK_K":16,"BLOCK_N":16}} -->
 ```python
-@pto.cube
+@pto.tileop
 def qk_matmul(
     q_tile: pto.Tile,
     k_tile: pto.Tile,
diff --git a/ptodsl/docs/user_guide/10-sync-ops.md b/ptodsl/docs/user_guide/10-sync-ops.md
index 01eba93e85..c0d649f01f 100644
--- a/ptodsl/docs/user_guide/10-sync-ops.md
+++ b/ptodsl/docs/user_guide/10-sync-ops.md
@@ -437,7 +437,7 @@ Where do sync operations belong in PTODSL's public entry model?
 |---------|---------------------|
 | `@pto.jit(mode="auto")` | Users can write sync explicitly when needed. PTOAS also provides an `--enable-insert-sync` option that auto-inserts `set_flag`/`wait_flag` pairs based on op-to-pipe mapping. |
 | `@pto.jit(mode="explicit")` | The compiler does not insert sync — the user is fully responsible. Place `set_flag`/`wait_flag` between MTE and compute, `mem_bar` between compute phases, `pipe_barrier` at phase boundaries. |
-| Shared `@pto.tileop` / `@pto.simt` helpers, plus retained `@pto.cube` / `@pto.simd` entry points | Cross-pipeline ordering is provided by the surrounding `@pto.jit` schedule. Helpers may still use `mem_bar` for intra-pipeline ordering when UB addresses alias. |
+| Shared `@pto.tileop` / `@pto.simt` helpers | Cross-pipeline ordering is provided by the surrounding `@pto.jit` schedule. Helpers may still use `mem_bar` for intra-pipeline ordering when UB addresses alias. |
 
 **Rule of thumb**: in `mode="auto"`, think in tiles and let the compiler handle
 orchestration. In `mode="explicit"`, think in micro-instructions and place the
diff --git a/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md b/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md
index 6bce455263..1cac1de4b5 100644
--- a/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md
+++ b/ptodsl/docs/user_guide/11-flash-attention-walkthrough.md
@@ -11,8 +11,8 @@ flash_attention(...)           L0  user-facing wrapper
   └─ @pto.jit(entry=True, mode="explicit") flash_attention_kernel
        ├─ Tile Ops                 tile.load / tile.store at the GM↔UB boundary
        ├─ explicit orchestration   mte_load / pipe_barrier / pointer sequencing
-       ├─ @pto.cube               qk_matmul / pv_matmul
-       ├─ @pto.tileop               online_softmax_rows
+       ├─ @pto.tileop             qk_matmul / pv_matmul (cube-style)
+       ├─ @pto.tileop             online_softmax_rows (vector-style)
        └─ @pto.simt               materialize_tile_bounds / blend_output_rows
 ```
 
@@ -438,13 +438,13 @@ The simt sub-kernel blends the old output accumulator with the new PV contributi
 
 Each `pipe_barrier(Pipe.ALL)` between phases is explicit in the orchestration body. This is intentional: at the orchestration boundary, the user controls pipeline ordering. Auto mode may still use synchronization primitives where needed, but it does so around compiler-managed tile staging rather than user-authored instruction scheduling.
 
-## 11.5 Cube sub-kernel — `@pto.cube`
+## 11.5 Cube-style tileop sub-kernel
 
 ### `qk_matmul` — `S = Q @ K^T`
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"flash_attention.qk_cube_helper","symbol":"flash_attention_qk_cube_helper_probe","compile":{"BLOCK_Q":16,"BLOCK_KV":16}} -->
 ```python
-@pto.cube
+@pto.tileop
 def qk_matmul(
     q_mat: pto.Tile,
     k_mat: pto.Tile,
@@ -476,7 +476,7 @@ The cube kernel does not allocate scratch — the caller (top-level kernel) owns
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"flash_attention.pv_cube_helper","symbol":"flash_attention_pv_cube_helper_probe","compile":{"BLOCK_Q":16,"BLOCK_KV":16}} -->
 ```python
-@pto.cube
+@pto.tileop
 def pv_matmul(
     p_mat: pto.Tile,
     v_mat: pto.Tile,
diff --git a/ptodsl/docs/user_guide/12-additional-examples.md b/ptodsl/docs/user_guide/12-additional-examples.md
index 79604fef83..65a1958377 100644
--- a/ptodsl/docs/user_guide/12-additional-examples.md
+++ b/ptodsl/docs/user_guide/12-additional-examples.md
@@ -160,13 +160,16 @@ def vec_add_with_tail(
 
 ## 12.3 GEMM: matrix multiplication on the Cube unit
 
-This example demonstrates a complete GEMM kernel: `C = A @ B` where A is `[M, K]` and B is `[K, N]`. It uses `@pto.jit` for tile allocation and loop scheduling, and `@pto.cube` for the actual matrix multiply.
+This example demonstrates a complete GEMM kernel: `C = A @ B` where A is
+`[M, K]` and B is `[K, N]`. It uses `@pto.jit` for tile allocation and loop
+scheduling, and a cube-style `@pto.tileop` helper for the actual matrix
+multiply.
 
 ### 12.3.1 Cube sub-kernel
 
 <!-- ptodsl-doc-test: {"mode":"compile_fragment","fixture":"gemm.cube_helper","symbol":"gemm_tile_probe","compile":{"BLOCK_M":64,"BLOCK_K":64,"BLOCK_N":64}} -->
 ```python
-@pto.cube
+@pto.tileop
 def gemm_tile(a_mat: pto.Tile, b_mat: pto.Tile, o_tile: pto.Tile,
               a_l0a: pto.Tile, b_l0b: pto.Tile, o_acc: pto.Tile):
     m = a_mat.valid_shape[0]
@@ -382,7 +385,7 @@ def online_layernorm(
 | Tile-level data movement | `tile.load` / `tile.store` |
 | Custom row-wise vector math | `@pto.tileop` |
 | Custom per-element logic | `@pto.simt` |
-| Matrix multiply | `@pto.cube` |
+| Matrix multiply | cube-style `@pto.tileop` |
 | Micro-instruction-level control | `mode="explicit"` |
 | Inline compute for quick prototyping | `with pto.tileop():` etc. |
 
diff --git a/ptodsl/examples/fa_dn_ptodsl.py b/ptodsl/examples/fa_dn_ptodsl.py
index 1de471c0e2..df39e512b8 100644
--- a/ptodsl/examples/fa_dn_ptodsl.py
+++ b/ptodsl/examples/fa_dn_ptodsl.py
@@ -195,7 +195,7 @@ def __call__(self):
                 self.sync.free()
 
 
-@pto.cube
+@pto.tileop
 def qk_matmul_stage(
     qMatTile: pto.Tile,
     kMatTile: pto.Tile,
@@ -214,7 +214,7 @@ def qk_matmul_stage(
     pto.mte_l0c_ub(qkAccTile.as_ptr(), qkVecTileSub.as_ptr(), rows, cols, cols, cols, 0)
 
 
-@pto.cube
+@pto.tileop
 def pv_matmul_stage(
     pMatTileSub: pto.Tile,
     vMatTile: pto.Tile,
diff --git a/ptodsl/examples/flash_attention_sketch.py b/ptodsl/examples/flash_attention_sketch.py
index 77ca38db33..7006be6df5 100644
--- a/ptodsl/examples/flash_attention_sketch.py
+++ b/ptodsl/examples/flash_attention_sketch.py
@@ -16,8 +16,8 @@
       └─ flash_attention_kernel   (@pto.jit, mode="explicit")
            ├─ Tile Ops                 tile.load / tile.store at the GM↔UB boundary
            ├─ explicit orchestration   mte_load / pipe_barrier / pointer sequencing
-           ├─ @pto.cube               matrix products (QK^T and P@V)
-           ├─ @pto.tileop               row-wise online softmax
+           ├─ @pto.tileop             matrix products (QK^T and P@V, cube-style)
+           ├─ @pto.tileop             row-wise online softmax (vector-style)
            └─ @pto.simt               scalar metadata and output blending
 
 Design rules illustrated here:
@@ -39,9 +39,10 @@
    such as ``mte_load`` are used instead of tile ops where needed.
    ``mte_load`` / ``mte_store`` accept partitions and tiles directly,
    deriving strides and burst sizes from the type metadata.
-6. ``simd`` / ``simt`` / ``cube`` are hardware boundaries. They do not expose
-   vreg values across the function boundary. Data crosses the boundary through
-   UB-backed tiles or typed UB pointers only.
+6. ``tileop`` / ``simt`` are the public helper boundaries. ``@pto.tileop``
+   covers both vector-style and cube-style helper bodies, but transient
+   hardware-local values still do not cross the function boundary. Data crosses
+   the boundary through UB-backed tiles or typed UB pointers only.
 7. Named sub-kernels are reusable wherever their parameter contract is
    satisfied. This sketch uses the explicit ``@pto.jit(mode="explicit")`` path
    because it needs user-ordered DMA and phase barriers; smaller kernels can
@@ -379,7 +380,7 @@ def flash_attention_kernel(
 # - No implicit global-memory access happens inside these kernels.
 
 
-@pto.cube
+@pto.tileop
 def qk_matmul(
     q_mat: pto.Tile,       # MAT, [Br, dim]
     k_mat: pto.Tile,       # MAT, [Bc, dim]
@@ -406,7 +407,7 @@ def qk_matmul(
     pto.mte_l0c_ub(s_acc.as_ptr(), s_tile.as_ptr(), m, n, n, n, 0)
 
 
-@pto.cube
+@pto.tileop
 def pv_matmul(
     p_mat: pto.Tile,       # MAT, [Br, Bc]
     v_mat: pto.Tile,       # MAT, [Bc, dim]
@@ -677,7 +678,7 @@ def kv_block_process(
 # │                                                                            │
 # │   Key idea: one place owns the "how this block runs on hardware" story.   │
 # ├──────────────────────────────────────────────────────────────────────────┤
-# │ @pto.cube           Matrix-product kernels                                 │
+# │ @pto.tileop         Matrix-product kernels (cube-style bodies)             │
 # │                                                                            │
 # │   qk_matmul: Q @ K^T                                                       │
 # │   pv_matmul: P @ V                                                         │
@@ -685,7 +686,7 @@ def kv_block_process(
 # │                                                                            │
 # │   Key idea: UB tiles are inputs/outputs; cube-local state is explicit.    │
 # ├──────────────────────────────────────────────────────────────────────────┤
-# │ @pto.tileop           Row-wise vector math                                   │
+# │ @pto.tileop         Row-wise vector math                                   │
 # │                                                                            │
 # │   online_softmax_rows                                                      │
 # │   vreg stays local; persistent state is written back to UB tiles           │
diff --git a/ptodsl/ptodsl/_diagnostics.py b/ptodsl/ptodsl/_diagnostics.py
index d7fdb1264b..f39f8a8af1 100644
--- a/ptodsl/ptodsl/_diagnostics.py
+++ b/ptodsl/ptodsl/_diagnostics.py
@@ -366,6 +366,15 @@ def invalid_jit_backend_error(
     )
 
 
+def legacy_subkernel_surface_error(surface: str) -> TypeError:
+    """Return one diagnostic for removed legacy subkernel public surfaces."""
+    return TypeError(
+        f"{surface} is a legacy PTODSL subkernel surface and is no longer supported. "
+        "Use @pto.tileop for named custom-op helpers, or inline custom-op code with "
+        "`with pto.tileop():`. Keep @pto.simt only for launched SIMT helpers."
+    )
+
+
 def unsupported_public_surface_error(name: str) -> AttributeError:
     """Return one diagnostic for unsupported names on the public ``pto`` surface."""
     hints = {
@@ -432,6 +441,7 @@ def unsupported_public_surface_error(name: str) -> AttributeError:
     "kernel_module_return_value_error",
     "invalid_jit_mode_error",
     "invalid_jit_backend_error",
+    "legacy_subkernel_surface_error",
     "jit_legacy_tensor_spec_helper_error",
     "native_python_control_flow_error",
     "simd_value_escape_error",
diff --git a/ptodsl/ptodsl/_subkernels.py b/ptodsl/ptodsl/_subkernels.py
index 9d1ab64d2d..2cb228351d 100644
--- a/ptodsl/ptodsl/_subkernels.py
+++ b/ptodsl/ptodsl/_subkernels.py
@@ -17,6 +17,7 @@
 from ._diagnostics import (
     illegal_inline_subkernel_placement_error,
     illegal_subkernel_placement_error,
+    legacy_subkernel_surface_error,
     simd_value_escape_error,
     subkernel_argument_type_error,
     subkernel_host_tensor_boundary_error,
@@ -502,6 +503,8 @@ def __init__(
         self._session_cm = None
 
     def __call__(self, fn):
+        if self._role in {KernelRole.CUBE, KernelRole.SIMD}:
+            raise legacy_subkernel_surface_error(f"@pto.{self._role.value}")
         return SubkernelTemplate(
             SubkernelSpec(
                 role=self._role,
@@ -515,6 +518,8 @@ def __call__(self, fn):
         )
 
     def __enter__(self):
+        if self._role in {KernelRole.CUBE, KernelRole.SIMD}:
+            raise legacy_subkernel_surface_error(f"with pto.{self._role.value}()")
         if self._role == KernelRole.SIMT and (
             self._simt_max_threads is not None or self._simt_max_regs is not None
         ):
diff --git a/ptodsl/ptodsl/_tracing/session.py b/ptodsl/ptodsl/_tracing/session.py
index a8eafc5601..3337420d11 100644
--- a/ptodsl/ptodsl/_tracing/session.py
+++ b/ptodsl/ptodsl/_tracing/session.py
@@ -242,12 +242,7 @@ def _canonical_helper_role(self, role: str) -> str:
         return role
 
     def _create_inline_subkernel_wrapper(self, role: str):
-        if role in {"tileop", "simd"}:
-            wrapper_op = _pto.SectionVectorOp()
-        elif role == "cube":
-            wrapper_op = _pto.SectionCubeOp()
-        else:
-            wrapper_op = self._create_subkernel_section_op(role)
+        wrapper_op = self._create_subkernel_section_op(role)
         if wrapper_op is None:
             wrapper_op = _pto.VecScopeOp()
         body_block = wrapper_op.body.blocks.append()
@@ -424,10 +419,7 @@ def _remap_captured_operands(self, root_ops, capture_mapping) -> None:
 
     def _outline_inline_subkernel(self, outline_frame: InlineSubkernelOutlineFrame) -> None:
         role = outline_frame.trace_frame.role
-        if role in {"tileop", "simd", "cube"}:
-            root_ops = (outline_frame.wrapper_op,)
-        else:
-            root_ops = tuple(outline_frame.body_block.operations)
+        root_ops = tuple(outline_frame.body_block.operations)
 
         defined_values = self._collect_defined_values(root_ops)
         captures = self._collect_capture_values(root_ops)
@@ -456,15 +448,11 @@ def _outline_inline_subkernel(self, outline_frame: InlineSubkernelOutlineFrame)
             terminator = func.ReturnOp([])
         return_anchor = terminator.operation.opview
 
-        if role in {"tileop", "simd", "cube"}:
-            outline_frame.wrapper_op.move_before(return_anchor)
-            outlined_roots = (outline_frame.wrapper_op,)
-        else:
-            body_ops = tuple(outline_frame.body_block.operations)
-            for op_view in body_ops:
-                op_view.move_before(return_anchor)
-            outline_frame.wrapper_op.operation.erase()
-            outlined_roots = body_ops
+        body_ops = tuple(outline_frame.body_block.operations)
+        for op_view in body_ops:
+            op_view.move_before(return_anchor)
+        outline_frame.wrapper_op.operation.erase()
+        outlined_roots = body_ops
 
         capture_mapping = dict(zip(captures, entry_block.arguments))
         self._remap_captured_operands(outlined_roots, capture_mapping)
diff --git a/ptodsl/tests/support/docs_fragment_fixtures.py b/ptodsl/tests/support/docs_fragment_fixtures.py
index 20a040dac3..da0e7aed3e 100644
--- a/ptodsl/tests/support/docs_fragment_fixtures.py
+++ b/ptodsl/tests/support/docs_fragment_fixtures.py
@@ -586,7 +586,7 @@ def kernel_entry_explicit_signature_probe(
     ),
     "kernel_entry.explicit_body": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def qk_matmul(q_tile: pto.Tile, k_tile: pto.Tile, s_tile: pto.Tile):
             return
 
@@ -951,12 +951,12 @@ def data_movement_explicit_dma_probe(
     ),
     "sync_ops.flag_pattern_explicit": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def qk_matmul(q_tile: pto.Tile, k_tile: pto.Tile, p_tile: pto.Tile):
             return
 
 
-        @pto.cube
+        @pto.tileop
         def pv_matmul(p_tile: pto.Tile, v_tile: pto.Tile, o_tile: pto.Tile):
             return
 
@@ -1000,7 +1000,7 @@ def sync_ops_flag_pattern_explicit_probe(
     ),
     "sync_ops.phase_barrier_explicit": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def qk_matmul(q_tile: pto.Tile, k_tile: pto.Tile, s_tile: pto.Tile):
             return
 
@@ -1010,7 +1010,7 @@ def online_softmax(s_tile: pto.Tile, p_tile: pto.Tile, rows: pto.i32, cols: pto.
             return
 
 
-        @pto.cube
+        @pto.tileop
         def pv_matmul(p_tile: pto.Tile, v_tile: pto.Tile, pv_tile: pto.Tile):
             return
 
@@ -1113,7 +1113,7 @@ def data_movement_tile_slice_1d_probe(
     ),
     "data_movement.cube_helper": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def qk_matmul(
             q_tile: pto.Tile,
             k_tile: pto.Tile,
@@ -1492,7 +1492,7 @@ def flash_attention_l1_loop_body_probe(
     ),
     "flash_attention.explicit_phase": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def qk_matmul(
             q_mat: pto.Tile,
             k_mat: pto.Tile,
@@ -1521,7 +1521,7 @@ def online_softmax_rows(
             return
 
 
-        @pto.cube
+        @pto.tileop
         def pv_matmul(
             p_mat: pto.Tile,
             v_mat: pto.Tile,
@@ -1869,7 +1869,7 @@ def flash_attention_simt_blend_probe(*, BLOCK: pto.const_expr = 8):
     ),
     "gemm.cube_helper": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def gemm_tile(
             a_mat: pto.Tile,
             b_mat: pto.Tile,
@@ -1894,7 +1894,7 @@ def gemm_tile_probe(*, BLOCK_M: pto.const_expr = 64, BLOCK_K: pto.const_expr = 6
     ),
     "gemm.jit_kernel": _fixture(
         f"""
-        @pto.cube
+        @pto.tileop
         def gemm_tile(
             a_mat: pto.Tile,
             b_mat: pto.Tile,
diff --git a/ptodsl/tests/test_jit_compile.py b/ptodsl/tests/test_jit_compile.py
index 8d9c56d3c6..45fb1d46e4 100644
--- a/ptodsl/tests/test_jit_compile.py
+++ b/ptodsl/tests/test_jit_compile.py
@@ -237,7 +237,7 @@ def process_tile_module(
     rows: pto.i32,
     cols: pto.i32,
 ):
-    with pto.simd():
+    with pto.tileop():
         vec = pto.elements_per_vreg(pto.f32)
         initial_remained = cols
         with pto.for_(0, rows, step=1) as r:
@@ -259,7 +259,7 @@ def explicit_vpto_kernel_module(
     o_tile: pto.Tile,
     cols: pto.i32,
 ):
-    with pto.simd():
+    with pto.tileop():
         remained = cols
         vec = pto.elements_per_vreg(pto.f32)
         loop = pto.for_(0, cols, step=vec).carry(remained=remained)
@@ -277,7 +277,7 @@ def process_row_ptr_kernel_module(
     dst_gm: pto.ptr(pto.f32, "gm"),
     row: pto.i32,
 ):
-    with pto.simd():
+    with pto.tileop():
         c0_i64 = pto.const(0, dtype=pto.i64)
         row_offset = row * 16
         src_row = pto.addptr(src_gm, row_offset)
@@ -360,7 +360,7 @@ def emitc_entry_calls_vpto_kernel_module_probe(
         process_row_ptr_kernel_module(A_ptr, O_ptr, row)
 
 
-@pto.simd
+@pto.tileop
 def emitc_vpto_kernel_module_callsite_simd_helper(
     src_tile: pto.Tile,
     dst_tile: pto.Tile,
@@ -657,7 +657,7 @@ def nested_tileop_probe():
     SUBKERNEL_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
 
 
-@pto.cube
+@pto.tileop
 def top_level_cube_probe():
     session = current_session()
     frame = session.current_subkernel
@@ -671,7 +671,7 @@ def top_level_tileop_probe():
     SUBKERNEL_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
 
 
-@pto.simd
+@pto.tileop
 def top_level_simd_probe():
     session = current_session()
     frame = session.current_subkernel
@@ -699,7 +699,7 @@ def tileop_view_boundary_probe(
     return rows
 
 
-@pto.simd
+@pto.tileop
 def simd_view_boundary_probe(
     src_view: pto.TensorView,
     src_part: pto.PartitionTensorView,
@@ -770,11 +770,11 @@ def inline_subkernel_scope_probe(*, TRACE_TOKEN: pto.const_expr = 0):
         frame = session.current_subkernel
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
         pto.pipe_barrier(pto.Pipe.ALL)
-    with pto.simd():
+    with pto.tileop():
         frame = session.current_subkernel
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
         pto.pipe_barrier(pto.Pipe.ALL)
-    with pto.cube():
+    with pto.tileop():
         frame = session.current_subkernel
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.append((frame.role, frame.symbol_name, session.subkernel_stack_depth))
         pto.pipe_barrier(pto.Pipe.ALL)
@@ -937,7 +937,7 @@ def simt_invalid_atomic_signedness_probe(gm: pto.ptr(pto.f32, "gm")):
     pto.atomic_add(gm, value, signedness="signed")
 
 
-@pto.simd
+@pto.tileop
 def ast_subkernel_runtime_for_helper(rows: pto.i32):
     for row in range(0, rows, 1):
         _ = row
@@ -1355,7 +1355,7 @@ def ast_signature_closure_default_kernel(*, BLOCK: pto.const_expr = limit):
 def make_ast_rebound_subkernel_probe():
     limit = 2
 
-    @pto.simd
+    @pto.tileop
     def helper():
         for _ in pto.static_range(limit):
             pto.pipe_barrier(pto.Pipe.ALL)
@@ -1393,7 +1393,7 @@ def make_sourceless_subkernel_entry():
     namespace = {"pto": pto}
     exec(
         """
-@pto.simd
+@pto.tileop
 def sourceless_subkernel_helper():
     if True:
         pto.pipe_barrier(pto.Pipe.ALL)
@@ -1539,7 +1539,7 @@ def host_runtime_scalar_entry_probe(
     pto.tile.store(o_tile, o_part)
 
 
-@pto.simd
+@pto.tileop
 def tile_slice_vector_probe(inp_tile: pto.Tile, out_tile: pto.Tile, row: pto.index):
     mask, _ = pto.plt_b32(pto.const(64, dtype=pto.i32))
     vec = pto.vlds(inp_tile[row, 0:])
@@ -1729,7 +1729,7 @@ def shared_index_coercion_probe():
         pto.wait_flag(pto.Pipe.V, pto.Pipe.MTE2, event_id=limit)
 
 
-@pto.simd
+@pto.tileop
 def public_vector_surface_probe(inp_tile: pto.Tile, out_tile: pto.Tile, stats_tile: pto.Tile):
     col_mask = pto.make_mask(pto.f32, pto.const(16, dtype=pto.i32))
     row = pto.const(0)
@@ -1743,7 +1743,7 @@ def public_vector_surface_probe(inp_tile: pto.Tile, out_tile: pto.Tile, stats_ti
     scalar.store(row_sum, stats_tile[row, 1])
 
 
-@pto.cube
+@pto.tileop
 def public_cube_surface_probe(
     lhs_tile: pto.Tile,
     rhs_tile: pto.Tile,
@@ -1834,7 +1834,7 @@ def public_cube_surface_probe(
     pto.mte_l0c_ub(acc_tile.as_ptr(), out_tile.as_ptr(), m, n, n, n, split=pto.SplitMode.M, layout="nz2nd")
 
 
-@pto.cube
+@pto.tileop
 def public_cube_tile_mx_probe(
     mat_lhs: pto.Tile,
     mat_lhs_scale: pto.Tile,
@@ -3481,8 +3481,9 @@ def main() -> None:
         "mixed-backend EmitC entry should keep its top-level tile load/store path alongside the kernel-module call",
     )
     expect(
-        mixed_backend_text.count("pto.section.vector {") == 1,
-        "before PTOAS inferred normalization, the mixed-backend PTODSL IR should only carry the helper-authored explicit vector section",
+        mixed_backend_text.count("pto.section.vector {") == 0
+        and mixed_backend_text.count("pto.section.cube {") == 0,
+        "before PTOAS inferred normalization, the mixed-backend PTODSL IR should stay naked and let PTOAS infer missing sections later",
     )
     expect(
         "pto.tload" in mixed_backend_text
@@ -3522,7 +3523,7 @@ def main() -> None:
     decorated_mixed_backend_text = emitc_entry_calls_vpto_kernel_module_via_decorated_simd_probe.compile().mlir_text()
     expect_parse_roundtrip_and_verify(
         decorated_mixed_backend_text,
-        "emitc entry calling vpto kernel-module through @pto.simd specialization",
+        "emitc entry calling vpto kernel-module through @pto.tileop specialization",
     )
     decorated_helper_body = decorated_mixed_backend_text.split(
         "func.func @emitc_vpto_kernel_module_callsite_simd_helper__ptodsl_",
@@ -3534,11 +3535,11 @@ def main() -> None:
             r"\(%[a-zA-Z0-9_]+, %[a-zA-Z0-9_]+, %[a-zA-Z0-9_]+\)",
             decorated_mixed_backend_text,
         ) is not None,
-        "@pto.simd helper callsites should lower to helper function calls in the caller body",
+        "@pto.tileop helper callsites should lower to helper function calls in the caller body",
     )
     expect(
         "pto.section.vector {" not in decorated_helper_body,
-        "decorated @pto.simd helper bodies should now stay naked in PTODSL IR and rely on later PTOAS section materialization",
+        "decorated @pto.tileop helper bodies should now stay naked in PTODSL IR and rely on later PTOAS section materialization",
     )
     multi_abi_compiled = entry_calls_kernel_module_multiple_abi_probe.compile()
     multi_abi_text = multi_abi_compiled.mlir_text()
@@ -3931,9 +3932,9 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     expect_parse_roundtrip_and_verify(shared_subkernel_text, "shared subkernel lowering specialization")
     expect(
         SUBKERNEL_OBSERVATIONS == [
-            ("cube", "top_level_cube_probe", 1),
+            ("tileop", "top_level_cube_probe", 1),
             ("tileop", "top_level_tileop_probe", 1),
-            ("simd", "top_level_simd_probe", 1),
+            ("tileop", "top_level_simd_probe", 1),
             ("tileop", "nested_tileop_probe", 1),
         ],
         f"unexpected shared subkernel lowering observations: {SUBKERNEL_OBSERVATIONS!r}",
@@ -3943,19 +3944,19 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
         and re.search(r"call @top_level_tileop_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None
         and re.search(r"call @top_level_simd_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None
         and re.search(r"call @nested_tileop_probe__ptodsl_[0-9a-f]+\(\)", shared_subkernel_text) is not None,
-        "@pto.tileop/@pto.cube/@pto.simd decorated subkernels should lower to helper calls in the caller body",
+        "@pto.tileop decorated subkernels should lower to helper calls in the caller body",
     )
     expect(
         shared_subkernel_text.count("pto.tileop.helper") == 4
         and 'pto.ptodsl.subkernel_helper = "tileop"' not in shared_subkernel_text
         and 'pto.ptodsl.subkernel_helper = "simd"' not in shared_subkernel_text
         and 'pto.ptodsl.subkernel_helper = "cube"' not in shared_subkernel_text,
-        "decorated @pto.tileop/@pto.simd/@pto.cube helpers should canonicalize to the tileop backend helper role",
+        "decorated @pto.tileop helpers should canonicalize to the tileop backend helper role",
     )
     expect(
         "pto.section.vector {" not in shared_subkernel_text
         and "pto.section.cube {" not in shared_subkernel_text,
-        "decorated @pto.tileop/@pto.simd/@pto.cube helpers should now lower as naked tileop helpers without pre-materialized sections",
+        "decorated @pto.tileop helpers should now lower as naked tileop helpers without pre-materialized sections",
     )
 
     scalar_return_subkernel_text = scalar_return_subkernel_lowering_probe.compile().mlir_text()
@@ -4002,7 +4003,7 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
             r"func\.func @simd_view_boundary_probe__ptodsl_[0-9a-f]+\([^)]*!pto\.tensor_view<[^)]*!pto\.partition_tensor_view<[^)]*!pto\.tile_buf<",
             simd_view_boundary_text,
         ) is not None,
-        "retained simd helpers should accept TensorView and PartitionTensorView formals in the same lowered helper ABI as tileop",
+        "tileop helpers should accept TensorView and PartitionTensorView formals in the same lowered helper ABI",
     )
     expect(
         "pto.tload" in simd_view_boundary_text
@@ -4010,7 +4011,7 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
             r"%[a-zA-Z0-9_]+ = call @simd_view_boundary_probe__ptodsl_[0-9a-f]+\([^)]*\) : \([^)]*!pto\.tensor_view<[^)]*!pto\.partition_tensor_view<[^)]*\) -> i32",
             simd_view_boundary_text,
         ) is not None,
-        "retained simd callsites should pass TensorView/PartitionTensorView operands through the same helper ABI and preserve scalar returns",
+        "tileop callsites should pass TensorView/PartitionTensorView operands through the same helper ABI and preserve scalar returns",
     )
 
     INLINE_SUBKERNEL_SCOPE_OBSERVATIONS.clear()
@@ -4020,8 +4021,8 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
         INLINE_SUBKERNEL_SCOPE_OBSERVATIONS == [
             ("simt", "inline_simt", 1),
             ("tileop", "inline_tileop", 1),
-            ("simd", "inline_simd", 1),
-            ("cube", "inline_cube", 1),
+            ("tileop", "inline_tileop", 1),
+            ("tileop", "inline_tileop", 1),
         ],
         f"unexpected inline subkernel scope observations: {INLINE_SUBKERNEL_SCOPE_OBSERVATIONS!r}",
     )
@@ -4031,23 +4032,22 @@ def fake_run_ptoas_cmd(cmd, *, cwd=None):
     )
     expect(
-        re.search(r"call @inline_tileop_[0-9]+__ptodsl_[0-9a-f]+\([^\\n]*\)", inline_subkernel_scope_text) is not None
-        and re.search(r"call @inline_simd_[0-9]+__ptodsl_[0-9a-f]+\([^\\n]*\)", inline_subkernel_scope_text) is not None
-        and re.search(r"call @inline_cube_[0-9]+__ptodsl_[0-9a-f]+\([^\\n]*\)", inline_subkernel_scope_text) is not None,
-        "inline pto.tileop()/pto.simd()/pto.cube() scopes should each lower to one helper call",
+        re.search(r"call @inline_tileop_[0-9]+__ptodsl_[0-9a-f]+\([^\n]*\)", inline_subkernel_scope_text) is not None
+        and len(re.findall(r"call @inline_tileop_[0-9]+__ptodsl_[0-9a-f]+\([^\n]*\)", inline_subkernel_scope_text)) == 3,
+        "inline pto.tileop() scopes should each lower to one helper call",
     )
     expect(
         inline_subkernel_scope_text.count("pto.tileop.helper") == 3
         and 'pto.ptodsl.subkernel_helper = "tileop"' not in inline_subkernel_scope_text
         and 'pto.ptodsl.subkernel_helper = "simd"' not in inline_subkernel_scope_text
         and 'pto.ptodsl.subkernel_helper = "cube"' not in inline_subkernel_scope_text,
-        "outlined inline tileop/simd/cube helpers should canonicalize to the tileop backend helper role",
+        "outlined inline tileop helpers should canonicalize to the tileop backend helper role",
     )
     expect(
         inline_subkernel_scope_text.count("pto.barrier <PIPE_ALL>") >= 2
-        and "pto.section.vector {" in inline_subkernel_scope_text
-        and "pto.section.cube {" in inline_subkernel_scope_text
+        and "pto.section.vector {" not in inline_subkernel_scope_text
+        and "pto.section.cube {" not in inline_subkernel_scope_text
         and "pto.store" in inline_subkernel_scope_text,
-        "outlined inline helpers should preserve the authored SIMD/Cube sections and SIMT scalar ops",
+        "outlined inline helpers should lower as naked tileop bodies while preserving SIMT scalar ops",
     )
 
     simt_text = simt_helper_lowering_probe.compile(TRACE_TOKEN=1).mlir_text()
@@ -4272,11 +4272,11 @@ def _enter_inline_simt_with_resource_attr():
     )
     expect(
         ast_subkernel_runtime_for_text.count("scf.for") == 1,
-        "@pto.simd helper should rewrite Python range(...) loops into runtime scf.for",
+        "@pto.tileop helper should rewrite Python range(...) loops into runtime scf.for",
     )
     expect(
         "pto.barrier <PIPE_ALL>" in ast_subkernel_runtime_for_text,
-        "rewritten @pto.simd helper body should lower inside the caller trace",
+        "rewritten @pto.tileop helper body should lower inside the caller trace",
     )
 
     carry_text = carry_loop_lowering_probe.compile(BLOCK=32).mlir_text()
diff --git a/ptodsl/tests/test_subkernel_diagnostics.py b/ptodsl/tests/test_subkernel_diagnostics.py
index 94fbd10d96..7f51e10780 100644
--- a/ptodsl/tests/test_subkernel_diagnostics.py
+++ b/ptodsl/tests/test_subkernel_diagnostics.py
@@ -34,35 +34,35 @@ def expect_raises(callback, exc_type, *message_fragments: str) -> None:
 
 
 def define_bad_subkernel_signature_probe():
-    @pto.simd
+    @pto.tileop
     def bad_tensor_formal(A: TensorSpec(rank=2, dtype=pto.f32)):
         pto.pipe_barrier(pto.Pipe.ALL)
 
     return bad_tensor_formal
 
 
-def define_illegal_simd_ptr_signature_probe():
+def define_legacy_simd_surface_probe():
     @pto.simd
-    def bad_ptr_formal(meta_ptr: pto.ptr(pto.i32, pto.MemorySpace.UB)):
+    def legacy_simd_probe(tile: pto.Tile):
         pto.pipe_barrier(pto.Pipe.ALL)
 
-    return bad_ptr_formal
+    return legacy_simd_probe
 
 
-def define_illegal_tileop_ptr_signature_probe():
-    @pto.tileop
-    def bad_ptr_formal(meta_ptr: pto.ptr(pto.i32, pto.MemorySpace.UB)):
+def define_legacy_cube_surface_probe():
+    @pto.cube
+    def legacy_cube_probe(tile: pto.Tile):
         pto.pipe_barrier(pto.Pipe.ALL)
 
-    return bad_ptr_formal
+    return legacy_cube_probe
 
 
-def define_illegal_cube_scalar_signature_probe():
-    @pto.cube
-    def bad_cube_formal(tile: pto.Tile, cols: pto.i32):
+def define_illegal_tileop_ptr_signature_probe():
+    @pto.tileop
+    def bad_ptr_formal(meta_ptr: pto.ptr(pto.i32, pto.MemorySpace.UB)):
         pto.pipe_barrier(pto.Pipe.ALL)
 
-    return bad_cube_formal
+    return bad_ptr_formal
 
 
 def define_removed_ukernel_surface_probe():
@@ -85,7 +85,7 @@ def bad_mode_probe():
     return bad_mode_probe
 
 
-@pto.simd
+@pto.tileop
 def host_tensor_operand_probe(tensor: pto.Tile):
     pto.pipe_barrier(pto.Pipe.ALL)
 
@@ -103,7 +103,7 @@ def nested_simt_probe():
     pto.get_tid_x()
 
 
-@pto.simd
+@pto.tileop
 def illegal_simt_placement_probe():
     nested_simt_probe()
 
@@ -113,7 +113,7 @@ def nested_simt_from_simd_entry(*, TRACE_TOKEN: pto.const_expr = 0):
     illegal_simt_placement_probe()
 
 
-@pto.simd
+@pto.tileop
 def illegal_inline_simt_placement_probe():
     with pto.simt():
         pto.get_tid_x()
@@ -124,17 +124,17 @@ def nested_inline_simt_from_simd_entry(*, TRACE_TOKEN: pto.const_expr = 0):
     illegal_inline_simt_placement_probe()
 
 
-@pto.simd
-def simd_value_escape_probe():
+@pto.tileop
+def tileop_value_escape_probe():
     return pto.pset_b32("PAT_ALL")
 
 
 @pto.jit(target="a5")
 def simd_value_escape_entry(*, TRACE_TOKEN: pto.const_expr = 0):
-    simd_value_escape_probe()
+    tileop_value_escape_probe()
 
 
-@pto.simd
+@pto.tileop
 def tile_only_probe(inp_tile: pto.Tile):
     pto.pipe_barrier(pto.Pipe.ALL)
 
@@ -195,28 +195,26 @@ def main() -> None:
     expect_raises(
         define_bad_subkernel_signature_probe,
         TypeError,
-        "@pto.simd parameter 'A' cannot be annotated with pto.tensor_spec(...)",
+        "@pto.tileop parameter 'A' cannot be annotated with pto.tensor_spec(...)",
         "@pto.jit positional parameters",
     )
     expect_raises(
-        define_illegal_simd_ptr_signature_probe,
+        define_legacy_simd_surface_probe,
         TypeError,
-        "@pto.simd parameter 'meta_ptr' uses unsupported subkernel annotation",
-        "pto.Tile / pto.TensorView / pto.PartitionTensorView parameters plus PTO scalar annotations",
-        "@pto.jit(entry=False)",
+        "@pto.simd is a legacy PTODSL subkernel surface",
+        "Use @pto.tileop",
     )
     expect_raises(
-        define_illegal_tileop_ptr_signature_probe,
+        define_legacy_cube_surface_probe,
         TypeError,
-        "@pto.tileop parameter 'meta_ptr' uses unsupported subkernel annotation",
-        "pto.Tile / pto.TensorView / pto.PartitionTensorView parameters plus PTO scalar annotations",
-        "@pto.jit(entry=False)",
+        "@pto.cube is a legacy PTODSL subkernel surface",
+        "Use @pto.tileop",
     )
     expect_raises(
-        define_illegal_cube_scalar_signature_probe,
+        define_illegal_tileop_ptr_signature_probe,
         TypeError,
-        "@pto.cube parameter 'cols' uses unsupported subkernel annotation",
-        "pto.Tile parameters only",
+        "@pto.tileop parameter 'meta_ptr' uses unsupported subkernel annotation",
+        "pto.Tile / pto.TensorView / pto.PartitionTensorView parameters plus PTO scalar annotations",
         "@pto.jit(entry=False)",
     )
     expect_raises(
@@ -230,25 +228,25 @@ def main() -> None:
         nested_simt_from_simd_entry.compile,
         RuntimeError,
         "@pto.simt helper materialization is only supported from the top-level @pto.jit body",
-        "inside @pto.simd",
+        "inside @pto.tileop",
     )
     expect_raises(
         nested_inline_simt_from_simd_entry.compile,
         RuntimeError,
         "inline pto.simt() may only be used from the top-level @pto.jit body",
-        "inside @pto.simd",
+        "inside @pto.tileop",
     )
     expect_raises(
         simd_value_escape_entry.compile,
         RuntimeError,
-        "@pto.simd cannot return transient SIMD values",
+        "@pto.tileop cannot return transient SIMD values",
         "!pto.mask<b32>",
         "Write the value back to a Tile/UB buffer instead",
     )
     expect_raises(
         illegal_subkernel_callsite_entry.compile,
         TypeError,
-        "@pto.simd argument 'inp_tile' violates the declared subkernel interface",
+        "@pto.tileop argument 'inp_tile' violates the declared subkernel interface",
         "Expected a pto.Tile value",
         "either pass a legal PTODSL boundary value or remove the subkernel decorator",
     )
diff --git a/test/dsl-st/cube_matrix_pipeline.py b/test/dsl-st/cube_matrix_pipeline.py
index ba4d6b5249..6e1dec8c02 100644
--- a/test/dsl-st/cube_matrix_pipeline.py
+++ b/test/dsl-st/cube_matrix_pipeline.py
@@ -31,7 +31,7 @@
 L0C_ADDR = 0
 
 
-@pto.cube
+@pto.tileop
 def cube_gemm_tile(
     a_mat: pto.Tile,
     b_mat: pto.Tile,
diff --git a/test/samples/TPushTPop/ptodsl/local_c2v/kernel.py b/test/samples/TPushTPop/ptodsl/local_c2v/kernel.py
index a29afd7ab8..4a0eaf2de8 100644
--- a/test/samples/TPushTPop/ptodsl/local_c2v/kernel.py
+++ b/test/samples/TPushTPop/ptodsl/local_c2v/kernel.py
@@ -69,7 +69,7 @@ def ptodsl_tpush_tpop_local_c2v_cube(
     )
     src_tile = pto.alloc_tile(shape=[_ROWS, _COLS], dtype=pto.f32)
     pto.tile.load(a_part, src_tile)
-    with pto.cube():
+    with pto.tileop():
         c2v.init_cube()
         c2v.push(src_tile, split=0)
 
@@ -93,7 +93,7 @@ def ptodsl_tpush_tpop_local_c2v_vector(
         id=0,
     )
     dst_type = pto.alloc_tile(shape=[_ROWS, _COLS], dtype=pto.f32)
-    with pto.simd():
+    with pto.tileop():
         c2v.init_simd()
         fifo_tile = c2v.pop(result_type=dst_type, split=0)
         pto.tile.store(fifo_tile, o_part)

From 876a11e60cab551c88f2b3d405b723892bdf3a8c Mon Sep 17 00:00:00 2001
From: jimmychou <47636600+jimmychou0@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:31:05 +0800
Subject: [PATCH 3/3] test(ptodsl): align frontend verify with tileop helper
 lowering

---
 ptodsl/tests/test_ptoas_frontend_verify.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ptodsl/tests/test_ptoas_frontend_verify.py b/ptodsl/tests/test_ptoas_frontend_verify.py
index 2b45d53b88..06c650d6b8 100644
--- a/ptodsl/tests/test_ptoas_frontend_verify.py
+++ b/ptodsl/tests/test_ptoas_frontend_verify.py
@@ -474,8 +474,10 @@ def main() -> None:
     expect(
         "func.func public @scale_row_kernel_module__ptodsl_" in example_vpto_child
         and 'pto.visibility = "external"' in example_vpto_child
-        and "pto.section.vector {" in example_vpto_child,
-        "mixed_backend_kernel_module.py VPTO child should expose a public helper definition with explicit vector authoring, matching the vector-helper side of mixed-external-vadd",
+        and "func.func @inline_tileop_0__ptodsl_" in example_vpto_child
+        and "pto.tileop.helper" in example_vpto_child
+        and "pto.section.vector {" not in example_vpto_child,
+        "mixed_backend_kernel_module.py VPTO child should expose a public wrapper plus a naked tileop helper body, leaving vector section materialization to later PTOAS normalization",
     )
 
     example_frontend_texts = run_ptoas_frontend_verify(